summaryrefslogtreecommitdiff
path: root/rts
diff options
context:
space:
mode:
authorSimon Marlow <simonmar@microsoft.com>2006-04-07 02:05:11 +0000
committerSimon Marlow <simonmar@microsoft.com>2006-04-07 02:05:11 +0000
commit0065d5ab628975892cea1ec7303f968c3338cbe1 (patch)
tree8e2afe0ab48ee33cf95009809d67c9649573ef92 /rts
parent28a464a75e14cece5db40f2765a29348273ff2d2 (diff)
downloadhaskell-0065d5ab628975892cea1ec7303f968c3338cbe1.tar.gz
Reorganisation of the source tree
Most of the other users of the fptools build system have migrated to Cabal, and with the move to darcs we can now flatten the source tree without losing history, so here goes. The main change is that the ghc/ subdir is gone, and most of what it contained is now at the top level. The build system now makes no pretense at being multi-project, it is just the GHC build system. No doubt this will break many things, and there will be a period of instability while we fix the dependencies. A straightforward build should work, but I haven't yet fixed binary/source distributions. Changes to the Building Guide will follow, too.
Diffstat (limited to 'rts')
-rw-r--r--rts/Adjustor.c1110
-rw-r--r--rts/AdjustorAsm.S189
-rw-r--r--rts/Apply.cmm268
-rw-r--r--rts/Apply.h29
-rw-r--r--rts/Arena.c120
-rw-r--r--rts/Arena.h25
-rw-r--r--rts/AutoApply.h80
-rw-r--r--rts/AwaitEvent.h24
-rw-r--r--rts/BlockAlloc.c391
-rw-r--r--rts/BlockAlloc.h19
-rw-r--r--rts/Capability.c668
-rw-r--r--rts/Capability.h250
-rw-r--r--rts/ClosureFlags.c107
-rw-r--r--rts/Disassembler.c281
-rw-r--r--rts/Disassembler.h19
-rw-r--r--rts/Exception.cmm446
-rw-r--r--rts/Exception.h40
-rw-r--r--rts/FrontPanel.c802
-rw-r--r--rts/FrontPanel.h35
-rw-r--r--rts/GC.c4719
-rw-r--r--rts/GCCompact.c949
-rw-r--r--rts/GCCompact.h44
-rw-r--r--rts/GetTime.h26
-rw-r--r--rts/HSprel.def28
-rw-r--r--rts/Hash.c376
-rw-r--r--rts/Hash.h40
-rw-r--r--rts/HeapStackCheck.cmm964
-rw-r--r--rts/HsFFI.c40
-rw-r--r--rts/Interpreter.c1261
-rw-r--r--rts/Interpreter.h14
-rw-r--r--rts/LdvProfile.c342
-rw-r--r--rts/LdvProfile.h42
-rw-r--r--rts/Linker.c4315
-rw-r--r--rts/LinkerInternals.h110
-rw-r--r--rts/MBlock.c453
-rw-r--r--rts/MBlock.h90
-rw-r--r--rts/Main.c138
-rw-r--r--rts/Makefile370
-rw-r--r--rts/PosixSource.h18
-rw-r--r--rts/Prelude.h129
-rw-r--r--rts/PrimOps.cmm2106
-rw-r--r--rts/Printer.c1127
-rw-r--r--rts/Printer.h31
-rw-r--r--rts/ProfHeap.c1156
-rw-r--r--rts/ProfHeap.h19
-rw-r--r--rts/Profiling.c941
-rw-r--r--rts/Profiling.h39
-rw-r--r--rts/Proftimer.c85
-rw-r--r--rts/Proftimer.h22
-rw-r--r--rts/RetainerProfile.c2338
-rw-r--r--rts/RetainerProfile.h47
-rw-r--r--rts/RetainerSet.c498
-rw-r--r--rts/RetainerSet.h201
-rw-r--r--rts/RtsAPI.c597
-rw-r--r--rts/RtsDllMain.c39
-rw-r--r--rts/RtsFlags.c2281
-rw-r--r--rts/RtsMessages.c201
-rw-r--r--rts/RtsSignals.h78
-rw-r--r--rts/RtsStartup.c457
-rw-r--r--rts/RtsUtils.c367
-rw-r--r--rts/RtsUtils.h54
-rw-r--r--rts/STM.c1261
-rw-r--r--rts/Sanity.c948
-rw-r--r--rts/Sanity.h56
-rw-r--r--rts/Schedule.c4589
-rw-r--r--rts/Schedule.h332
-rw-r--r--rts/Sparks.c881
-rw-r--r--rts/Sparks.h104
-rw-r--r--rts/Stable.c460
-rw-r--r--rts/Stats.c632
-rw-r--r--rts/Stats.h56
-rw-r--r--rts/StgCRun.c897
-rw-r--r--rts/StgMiscClosures.cmm953
-rw-r--r--rts/StgPrimFloat.c491
-rw-r--r--rts/StgRun.h16
-rw-r--r--rts/StgStartup.cmm218
-rw-r--r--rts/StgStdThunks.cmm274
-rw-r--r--rts/Storage.c1137
-rw-r--r--rts/Task.c315
-rw-r--r--rts/Task.h271
-rw-r--r--rts/ThreadLabels.c50
-rw-r--r--rts/ThreadLabels.h27
-rw-r--r--rts/Ticker.h15
-rw-r--r--rts/Ticky.c628
-rw-r--r--rts/Ticky.h9
-rw-r--r--rts/Timer.c102
-rw-r--r--rts/Timer.h24
-rw-r--r--rts/Updates.cmm153
-rw-r--r--rts/Updates.h361
-rw-r--r--rts/VisCallbacks.c75
-rw-r--r--rts/VisCallbacks.h30
-rw-r--r--rts/VisSupport.c144
-rw-r--r--rts/VisSupport.h44
-rw-r--r--rts/VisWindow.c747
-rw-r--r--rts/VisWindow.h5
-rw-r--r--rts/Weak.c97
-rw-r--r--rts/Weak.h17
-rw-r--r--rts/dotnet/Invoke.c1081
-rw-r--r--rts/dotnet/Invoker.cpp338
-rw-r--r--rts/dotnet/Invoker.h197
-rw-r--r--rts/dotnet/InvokerClient.h180
-rw-r--r--rts/dotnet/Makefile53
-rw-r--r--rts/dotnet/invoker.snkbin0 -> 596 bytes
-rw-r--r--rts/ghc-frontpanel.glade1622
-rw-r--r--rts/gmp/.gdbinit34
-rw-r--r--rts/gmp/AUTHORS12
-rw-r--r--rts/gmp/COPYING336
-rw-r--r--rts/gmp/COPYING.LIB515
-rw-r--r--rts/gmp/INSTALL146
-rw-r--r--rts/gmp/Makefile.am197
-rw-r--r--rts/gmp/Makefile.in932
-rw-r--r--rts/gmp/NEWS136
-rw-r--r--rts/gmp/README84
-rw-r--r--rts/gmp/acconfig.h92
-rw-r--r--rts/gmp/acinclude.m4835
-rw-r--r--rts/gmp/aclocal.m41963
-rw-r--r--rts/gmp/ansi2knr.136
-rw-r--r--rts/gmp/ansi2knr.c677
-rw-r--r--rts/gmp/assert.c52
-rw-r--r--rts/gmp/compat.c46
-rw-r--r--rts/gmp/config.guess1373
-rw-r--r--rts/gmp/config.in162
-rw-r--r--rts/gmp/config.sub1273
-rw-r--r--rts/gmp/configure5216
-rw-r--r--rts/gmp/configure.in950
-rw-r--r--rts/gmp/depcomp269
-rw-r--r--rts/gmp/errno.c26
-rw-r--r--rts/gmp/extract-dbl.c187
-rw-r--r--rts/gmp/gmp-impl.h1072
-rw-r--r--rts/gmp/gmp.h1083
-rw-r--r--rts/gmp/insert-dbl.c98
-rw-r--r--rts/gmp/install-sh251
-rw-r--r--rts/gmp/longlong.h1347
-rw-r--r--rts/gmp/ltconfig3109
-rw-r--r--rts/gmp/ltmain.sh4692
-rw-r--r--rts/gmp/mdate-sh92
-rw-r--r--rts/gmp/memory.c160
-rw-r--r--rts/gmp/missing244
-rw-r--r--rts/gmp/mkinstalldirs38
-rw-r--r--rts/gmp/mp.h124
-rw-r--r--rts/gmp/mp_bpl.c27
-rw-r--r--rts/gmp/mp_clz_tab.c36
-rw-r--r--rts/gmp/mp_minv_tab.c50
-rw-r--r--rts/gmp/mp_set_fns.c48
-rw-r--r--rts/gmp/mpn/Makefile.am94
-rw-r--r--rts/gmp/mpn/Makefile.in472
-rw-r--r--rts/gmp/mpn/README13
-rw-r--r--rts/gmp/mpn/a29k/add_n.s120
-rw-r--r--rts/gmp/mpn/a29k/addmul_1.s113
-rw-r--r--rts/gmp/mpn/a29k/lshift.s93
-rw-r--r--rts/gmp/mpn/a29k/mul_1.s97
-rw-r--r--rts/gmp/mpn/a29k/rshift.s89
-rw-r--r--rts/gmp/mpn/a29k/sub_n.s120
-rw-r--r--rts/gmp/mpn/a29k/submul_1.s116
-rw-r--r--rts/gmp/mpn/a29k/udiv.s30
-rw-r--r--rts/gmp/mpn/a29k/umul.s29
-rw-r--r--rts/gmp/mpn/alpha/README224
-rw-r--r--rts/gmp/mpn/alpha/add_n.asm114
-rw-r--r--rts/gmp/mpn/alpha/addmul_1.asm87
-rw-r--r--rts/gmp/mpn/alpha/cntlz.asm68
-rw-r--r--rts/gmp/mpn/alpha/default.m477
-rw-r--r--rts/gmp/mpn/alpha/ev5/add_n.asm143
-rw-r--r--rts/gmp/mpn/alpha/ev5/lshift.asm169
-rw-r--r--rts/gmp/mpn/alpha/ev5/rshift.asm167
-rw-r--r--rts/gmp/mpn/alpha/ev5/sub_n.asm143
-rw-r--r--rts/gmp/mpn/alpha/ev6/addmul_1.asm474
-rw-r--r--rts/gmp/mpn/alpha/ev6/gmp-mparam.h62
-rw-r--r--rts/gmp/mpn/alpha/gmp-mparam.h64
-rw-r--r--rts/gmp/mpn/alpha/invert_limb.asm345
-rw-r--r--rts/gmp/mpn/alpha/lshift.asm104
-rw-r--r--rts/gmp/mpn/alpha/mul_1.asm71
-rw-r--r--rts/gmp/mpn/alpha/rshift.asm102
-rw-r--r--rts/gmp/mpn/alpha/sub_n.asm114
-rw-r--r--rts/gmp/mpn/alpha/submul_1.asm87
-rw-r--r--rts/gmp/mpn/alpha/udiv_qrnnd.S151
-rw-r--r--rts/gmp/mpn/alpha/umul.asm39
-rw-r--r--rts/gmp/mpn/alpha/unicos.m463
-rw-r--r--rts/gmp/mpn/arm/add_n.S77
-rw-r--r--rts/gmp/mpn/arm/addmul_1.S89
-rw-r--r--rts/gmp/mpn/arm/gmp-mparam.h34
-rw-r--r--rts/gmp/mpn/arm/mul_1.S81
-rw-r--r--rts/gmp/mpn/arm/sub_n.S79
-rw-r--r--rts/gmp/mpn/asm-defs.m41182
-rw-r--r--rts/gmp/mpn/clipper/add_n.s48
-rw-r--r--rts/gmp/mpn/clipper/mul_1.s47
-rw-r--r--rts/gmp/mpn/clipper/sub_n.s48
-rw-r--r--rts/gmp/mpn/cray/README14
-rw-r--r--rts/gmp/mpn/cray/add_n.c96
-rw-r--r--rts/gmp/mpn/cray/addmul_1.c46
-rw-r--r--rts/gmp/mpn/cray/gmp-mparam.h27
-rw-r--r--rts/gmp/mpn/cray/mul_1.c44
-rw-r--r--rts/gmp/mpn/cray/mulww.f54
-rw-r--r--rts/gmp/mpn/cray/mulww.s245
-rw-r--r--rts/gmp/mpn/cray/sub_n.c97
-rw-r--r--rts/gmp/mpn/cray/submul_1.c46
-rw-r--r--rts/gmp/mpn/generic/add_n.c62
-rw-r--r--rts/gmp/mpn/generic/addmul_1.c65
-rw-r--r--rts/gmp/mpn/generic/addsub_n.c167
-rw-r--r--rts/gmp/mpn/generic/bdivmod.c120
-rw-r--r--rts/gmp/mpn/generic/bz_divrem_n.c153
-rw-r--r--rts/gmp/mpn/generic/cmp.c56
-rw-r--r--rts/gmp/mpn/generic/diveby3.c77
-rw-r--r--rts/gmp/mpn/generic/divrem.c101
-rw-r--r--rts/gmp/mpn/generic/divrem_1.c248
-rw-r--r--rts/gmp/mpn/generic/divrem_2.c151
-rw-r--r--rts/gmp/mpn/generic/dump.c76
-rw-r--r--rts/gmp/mpn/generic/gcd.c414
-rw-r--r--rts/gmp/mpn/generic/gcd_1.c77
-rw-r--r--rts/gmp/mpn/generic/gcdext.c700
-rw-r--r--rts/gmp/mpn/generic/get_str.c216
-rw-r--r--rts/gmp/mpn/generic/gmp-mparam.h27
-rw-r--r--rts/gmp/mpn/generic/hamdist.c94
-rw-r--r--rts/gmp/mpn/generic/inlines.c24
-rw-r--r--rts/gmp/mpn/generic/jacbase.c136
-rw-r--r--rts/gmp/mpn/generic/lshift.c87
-rw-r--r--rts/gmp/mpn/generic/mod_1.c175
-rw-r--r--rts/gmp/mpn/generic/mod_1_rs.c111
-rw-r--r--rts/gmp/mpn/generic/mul.c190
-rw-r--r--rts/gmp/mpn/generic/mul_1.c59
-rw-r--r--rts/gmp/mpn/generic/mul_basecase.c87
-rw-r--r--rts/gmp/mpn/generic/mul_fft.c772
-rw-r--r--rts/gmp/mpn/generic/mul_n.c1343
-rw-r--r--rts/gmp/mpn/generic/perfsqr.c123
-rw-r--r--rts/gmp/mpn/generic/popcount.c93
-rw-r--r--rts/gmp/mpn/generic/pre_mod_1.c69
-rw-r--r--rts/gmp/mpn/generic/random.c43
-rw-r--r--rts/gmp/mpn/generic/random2.c105
-rw-r--r--rts/gmp/mpn/generic/rshift.c88
-rw-r--r--rts/gmp/mpn/generic/sb_divrem_mn.c201
-rw-r--r--rts/gmp/mpn/generic/scan0.c62
-rw-r--r--rts/gmp/mpn/generic/scan1.c62
-rw-r--r--rts/gmp/mpn/generic/set_str.c159
-rw-r--r--rts/gmp/mpn/generic/sqr_basecase.c83
-rw-r--r--rts/gmp/mpn/generic/sqrtrem.c509
-rw-r--r--rts/gmp/mpn/generic/sub_n.c62
-rw-r--r--rts/gmp/mpn/generic/submul_1.c65
-rw-r--r--rts/gmp/mpn/generic/tdiv_qr.c401
-rw-r--r--rts/gmp/mpn/generic/udiv_w_sdiv.c131
-rw-r--r--rts/gmp/mpn/hppa/README91
-rw-r--r--rts/gmp/mpn/hppa/add_n.s58
-rw-r--r--rts/gmp/mpn/hppa/gmp-mparam.h63
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/addmul_1.s102
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/mul_1.s98
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s75
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S189
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s83
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s80
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s76
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S195
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/submul_1.s111
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S80
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/umul.s42
-rw-r--r--rts/gmp/mpn/hppa/hppa2_0/add_n.s88
-rw-r--r--rts/gmp/mpn/hppa/hppa2_0/sub_n.s88
-rw-r--r--rts/gmp/mpn/hppa/lshift.s66
-rw-r--r--rts/gmp/mpn/hppa/rshift.s63
-rw-r--r--rts/gmp/mpn/hppa/sub_n.s59
-rw-r--r--rts/gmp/mpn/hppa/udiv_qrnnd.s286
-rw-r--r--rts/gmp/mpn/i960/README9
-rw-r--r--rts/gmp/mpn/i960/add_n.s43
-rw-r--r--rts/gmp/mpn/i960/addmul_1.s48
-rw-r--r--rts/gmp/mpn/i960/mul_1.s45
-rw-r--r--rts/gmp/mpn/i960/sub_n.s43
-rw-r--r--rts/gmp/mpn/lisp/gmpasm-mode.el351
-rw-r--r--rts/gmp/mpn/m68k/add_n.S79
-rw-r--r--rts/gmp/mpn/m68k/lshift.S150
-rw-r--r--rts/gmp/mpn/m68k/mc68020/addmul_1.S83
-rw-r--r--rts/gmp/mpn/m68k/mc68020/mul_1.S90
-rw-r--r--rts/gmp/mpn/m68k/mc68020/submul_1.S83
-rw-r--r--rts/gmp/mpn/m68k/mc68020/udiv.S31
-rw-r--r--rts/gmp/mpn/m68k/mc68020/umul.S31
-rw-r--r--rts/gmp/mpn/m68k/rshift.S149
-rw-r--r--rts/gmp/mpn/m68k/sub_n.S79
-rw-r--r--rts/gmp/mpn/m68k/syntax.h177
-rw-r--r--rts/gmp/mpn/m88k/add_n.s104
-rw-r--r--rts/gmp/mpn/m88k/mc88110/add_n.S200
-rw-r--r--rts/gmp/mpn/m88k/mc88110/addmul_1.s61
-rw-r--r--rts/gmp/mpn/m88k/mc88110/mul_1.s59
-rw-r--r--rts/gmp/mpn/m88k/mc88110/sub_n.S276
-rw-r--r--rts/gmp/mpn/m88k/mul_1.s127
-rw-r--r--rts/gmp/mpn/m88k/sub_n.s106
-rw-r--r--rts/gmp/mpn/mips2/add_n.s120
-rw-r--r--rts/gmp/mpn/mips2/addmul_1.s97
-rw-r--r--rts/gmp/mpn/mips2/lshift.s95
-rw-r--r--rts/gmp/mpn/mips2/mul_1.s85
-rw-r--r--rts/gmp/mpn/mips2/rshift.s92
-rw-r--r--rts/gmp/mpn/mips2/sub_n.s120
-rw-r--r--rts/gmp/mpn/mips2/submul_1.s97
-rw-r--r--rts/gmp/mpn/mips2/umul.s30
-rw-r--r--rts/gmp/mpn/mips3/README23
-rw-r--r--rts/gmp/mpn/mips3/add_n.s120
-rw-r--r--rts/gmp/mpn/mips3/addmul_1.s97
-rw-r--r--rts/gmp/mpn/mips3/gmp-mparam.h58
-rw-r--r--rts/gmp/mpn/mips3/lshift.s95
-rw-r--r--rts/gmp/mpn/mips3/mul_1.s85
-rw-r--r--rts/gmp/mpn/mips3/rshift.s92
-rw-r--r--rts/gmp/mpn/mips3/sub_n.s120
-rw-r--r--rts/gmp/mpn/mips3/submul_1.s97
-rw-r--r--rts/gmp/mpn/mp_bases.c550
-rw-r--r--rts/gmp/mpn/ns32k/add_n.s46
-rw-r--r--rts/gmp/mpn/ns32k/addmul_1.s48
-rw-r--r--rts/gmp/mpn/ns32k/mul_1.s47
-rw-r--r--rts/gmp/mpn/ns32k/sub_n.s46
-rw-r--r--rts/gmp/mpn/ns32k/submul_1.s48
-rw-r--r--rts/gmp/mpn/pa64/README38
-rw-r--r--rts/gmp/mpn/pa64/add_n.s90
-rw-r--r--rts/gmp/mpn/pa64/addmul_1.S167
-rw-r--r--rts/gmp/mpn/pa64/gmp-mparam.h65
-rw-r--r--rts/gmp/mpn/pa64/lshift.s103
-rw-r--r--rts/gmp/mpn/pa64/mul_1.S158
-rw-r--r--rts/gmp/mpn/pa64/rshift.s100
-rw-r--r--rts/gmp/mpn/pa64/sub_n.s90
-rw-r--r--rts/gmp/mpn/pa64/submul_1.S170
-rw-r--r--rts/gmp/mpn/pa64/udiv_qrnnd.c111
-rw-r--r--rts/gmp/mpn/pa64/umul_ppmm.S74
-rw-r--r--rts/gmp/mpn/pa64w/README2
-rw-r--r--rts/gmp/mpn/pa64w/add_n.s90
-rw-r--r--rts/gmp/mpn/pa64w/addmul_1.S168
-rw-r--r--rts/gmp/mpn/pa64w/gmp-mparam.h65
-rw-r--r--rts/gmp/mpn/pa64w/lshift.s103
-rw-r--r--rts/gmp/mpn/pa64w/mul_1.S159
-rw-r--r--rts/gmp/mpn/pa64w/rshift.s100
-rw-r--r--rts/gmp/mpn/pa64w/sub_n.s90
-rw-r--r--rts/gmp/mpn/pa64w/submul_1.S171
-rw-r--r--rts/gmp/mpn/pa64w/udiv_qrnnd.c117
-rw-r--r--rts/gmp/mpn/pa64w/umul_ppmm.S72
-rw-r--r--rts/gmp/mpn/power/add_n.s79
-rw-r--r--rts/gmp/mpn/power/addmul_1.s122
-rw-r--r--rts/gmp/mpn/power/lshift.s56
-rw-r--r--rts/gmp/mpn/power/mul_1.s109
-rw-r--r--rts/gmp/mpn/power/rshift.s54
-rw-r--r--rts/gmp/mpn/power/sdiv.s34
-rw-r--r--rts/gmp/mpn/power/sub_n.s80
-rw-r--r--rts/gmp/mpn/power/submul_1.s127
-rw-r--r--rts/gmp/mpn/power/umul.s38
-rw-r--r--rts/gmp/mpn/powerpc32/add_n.asm61
-rw-r--r--rts/gmp/mpn/powerpc32/addmul_1.asm124
-rw-r--r--rts/gmp/mpn/powerpc32/aix.m439
-rw-r--r--rts/gmp/mpn/powerpc32/gmp-mparam.h66
-rw-r--r--rts/gmp/mpn/powerpc32/lshift.asm145
-rw-r--r--rts/gmp/mpn/powerpc32/mul_1.asm86
-rw-r--r--rts/gmp/mpn/powerpc32/regmap.m434
-rw-r--r--rts/gmp/mpn/powerpc32/rshift.asm60
-rw-r--r--rts/gmp/mpn/powerpc32/sub_n.asm61
-rw-r--r--rts/gmp/mpn/powerpc32/submul_1.asm130
-rw-r--r--rts/gmp/mpn/powerpc32/umul.asm32
-rw-r--r--rts/gmp/mpn/powerpc64/README36
-rw-r--r--rts/gmp/mpn/powerpc64/add_n.asm61
-rw-r--r--rts/gmp/mpn/powerpc64/addmul_1.asm52
-rw-r--r--rts/gmp/mpn/powerpc64/addsub_n.asm107
-rw-r--r--rts/gmp/mpn/powerpc64/aix.m440
-rw-r--r--rts/gmp/mpn/powerpc64/copyd.asm45
-rw-r--r--rts/gmp/mpn/powerpc64/copyi.asm44
-rw-r--r--rts/gmp/mpn/powerpc64/gmp-mparam.h62
-rw-r--r--rts/gmp/mpn/powerpc64/lshift.asm159
-rw-r--r--rts/gmp/mpn/powerpc64/mul_1.asm49
-rw-r--r--rts/gmp/mpn/powerpc64/rshift.asm60
-rw-r--r--rts/gmp/mpn/powerpc64/sub_n.asm61
-rw-r--r--rts/gmp/mpn/powerpc64/submul_1.asm54
-rw-r--r--rts/gmp/mpn/pyr/add_n.s76
-rw-r--r--rts/gmp/mpn/pyr/addmul_1.s45
-rw-r--r--rts/gmp/mpn/pyr/mul_1.s42
-rw-r--r--rts/gmp/mpn/pyr/sub_n.s76
-rw-r--r--rts/gmp/mpn/sh/add_n.s47
-rw-r--r--rts/gmp/mpn/sh/sh2/addmul_1.s53
-rw-r--r--rts/gmp/mpn/sh/sh2/mul_1.s50
-rw-r--r--rts/gmp/mpn/sh/sh2/submul_1.s53
-rw-r--r--rts/gmp/mpn/sh/sub_n.s47
-rw-r--r--rts/gmp/mpn/sparc32/README36
-rw-r--r--rts/gmp/mpn/sparc32/add_n.asm236
-rw-r--r--rts/gmp/mpn/sparc32/addmul_1.asm146
-rw-r--r--rts/gmp/mpn/sparc32/lshift.asm97
-rw-r--r--rts/gmp/mpn/sparc32/mul_1.asm137
-rw-r--r--rts/gmp/mpn/sparc32/rshift.asm93
-rw-r--r--rts/gmp/mpn/sparc32/sub_n.asm326
-rw-r--r--rts/gmp/mpn/sparc32/submul_1.asm146
-rw-r--r--rts/gmp/mpn/sparc32/udiv_fp.asm158
-rw-r--r--rts/gmp/mpn/sparc32/udiv_nfp.asm193
-rw-r--r--rts/gmp/mpn/sparc32/umul.asm68
-rw-r--r--rts/gmp/mpn/sparc32/v8/addmul_1.asm122
-rw-r--r--rts/gmp/mpn/sparc32/v8/mul_1.asm103
-rw-r--r--rts/gmp/mpn/sparc32/v8/submul_1.asm58
-rw-r--r--rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm122
-rw-r--r--rts/gmp/mpn/sparc32/v8/umul.asm31
-rw-r--r--rts/gmp/mpn/sparc32/v9/README4
-rw-r--r--rts/gmp/mpn/sparc32/v9/addmul_1.asm288
-rw-r--r--rts/gmp/mpn/sparc32/v9/gmp-mparam.h69
-rw-r--r--rts/gmp/mpn/sparc32/v9/mul_1.asm267
-rw-r--r--rts/gmp/mpn/sparc32/v9/submul_1.asm291
-rw-r--r--rts/gmp/mpn/sparc64/README48
-rw-r--r--rts/gmp/mpn/sparc64/add_n.asm172
-rw-r--r--rts/gmp/mpn/sparc64/addmul1h.asm203
-rw-r--r--rts/gmp/mpn/sparc64/addmul_1.asm114
-rw-r--r--rts/gmp/mpn/sparc64/copyi.asm79
-rw-r--r--rts/gmp/mpn/sparc64/gmp-mparam.h88
-rw-r--r--rts/gmp/mpn/sparc64/lshift.asm97
-rw-r--r--rts/gmp/mpn/sparc64/mul_1.asm113
-rw-r--r--rts/gmp/mpn/sparc64/mul_1h.asm183
-rw-r--r--rts/gmp/mpn/sparc64/rshift.asm94
-rw-r--r--rts/gmp/mpn/sparc64/sub_n.asm172
-rw-r--r--rts/gmp/mpn/sparc64/submul1h.asm204
-rw-r--r--rts/gmp/mpn/sparc64/submul_1.asm114
-rw-r--r--rts/gmp/mpn/thumb/add_n.s50
-rw-r--r--rts/gmp/mpn/thumb/sub_n.s50
-rw-r--r--rts/gmp/mpn/underscore.h26
-rw-r--r--rts/gmp/mpn/vax/add_n.s61
-rw-r--r--rts/gmp/mpn/vax/addmul_1.s126
-rw-r--r--rts/gmp/mpn/vax/lshift.s58
-rw-r--r--rts/gmp/mpn/vax/mul_1.s123
-rw-r--r--rts/gmp/mpn/vax/rshift.s56
-rw-r--r--rts/gmp/mpn/vax/sub_n.s61
-rw-r--r--rts/gmp/mpn/vax/submul_1.s126
-rw-r--r--rts/gmp/mpn/x86/README40
-rw-r--r--rts/gmp/mpn/x86/README.family333
-rw-r--r--rts/gmp/mpn/x86/addsub_n.S174
-rw-r--r--rts/gmp/mpn/x86/aors_n.asm187
-rw-r--r--rts/gmp/mpn/x86/aorsmul_1.asm134
-rw-r--r--rts/gmp/mpn/x86/copyd.asm80
-rw-r--r--rts/gmp/mpn/x86/copyi.asm79
-rw-r--r--rts/gmp/mpn/x86/diveby3.asm115
-rw-r--r--rts/gmp/mpn/x86/divrem_1.asm232
-rw-r--r--rts/gmp/mpn/x86/k6/README237
-rw-r--r--rts/gmp/mpn/x86/k6/aors_n.asm329
-rw-r--r--rts/gmp/mpn/x86/k6/aorsmul_1.asm372
-rw-r--r--rts/gmp/mpn/x86/k6/cross.pl141
-rw-r--r--rts/gmp/mpn/x86/k6/diveby3.asm110
-rw-r--r--rts/gmp/mpn/x86/k6/gmp-mparam.h97
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/copyd.asm179
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/copyi.asm196
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/lshift.asm286
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/rshift.asm285
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/com_n.asm91
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/logops_n.asm212
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/lshift.asm122
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/popham.asm238
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/rshift.asm122
-rw-r--r--rts/gmp/mpn/x86/k6/mul_1.asm272
-rw-r--r--rts/gmp/mpn/x86/k6/mul_basecase.asm600
-rw-r--r--rts/gmp/mpn/x86/k6/sqr_basecase.asm672
-rw-r--r--rts/gmp/mpn/x86/k7/README145
-rw-r--r--rts/gmp/mpn/x86/k7/aors_n.asm250
-rw-r--r--rts/gmp/mpn/x86/k7/aorsmul_1.asm364
-rw-r--r--rts/gmp/mpn/x86/k7/diveby3.asm131
-rw-r--r--rts/gmp/mpn/x86/k7/gmp-mparam.h100
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/copyd.asm136
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/copyi.asm147
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/divrem_1.asm718
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/lshift.asm472
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/mod_1.asm457
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/popham.asm239
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/rshift.asm471
-rw-r--r--rts/gmp/mpn/x86/k7/mul_1.asm265
-rw-r--r--rts/gmp/mpn/x86/k7/mul_basecase.asm593
-rw-r--r--rts/gmp/mpn/x86/k7/sqr_basecase.asm627
-rw-r--r--rts/gmp/mpn/x86/lshift.asm90
-rw-r--r--rts/gmp/mpn/x86/mod_1.asm141
-rw-r--r--rts/gmp/mpn/x86/mul_1.asm130
-rw-r--r--rts/gmp/mpn/x86/mul_basecase.asm209
-rw-r--r--rts/gmp/mpn/x86/p6/README95
-rw-r--r--rts/gmp/mpn/x86/p6/aorsmul_1.asm300
-rw-r--r--rts/gmp/mpn/x86/p6/diveby3.asm37
-rw-r--r--rts/gmp/mpn/x86/p6/gmp-mparam.h96
-rw-r--r--rts/gmp/mpn/x86/p6/mmx/divrem_1.asm677
-rw-r--r--rts/gmp/mpn/x86/p6/mmx/mod_1.asm444
-rw-r--r--rts/gmp/mpn/x86/p6/mmx/popham.asm31
-rw-r--r--rts/gmp/mpn/x86/p6/p3mmx/popham.asm30
-rw-r--r--rts/gmp/mpn/x86/p6/sqr_basecase.asm641
-rw-r--r--rts/gmp/mpn/x86/pentium/README77
-rw-r--r--rts/gmp/mpn/x86/pentium/aors_n.asm196
-rw-r--r--rts/gmp/mpn/x86/pentium/aorsmul_1.asm99
-rw-r--r--rts/gmp/mpn/x86/pentium/diveby3.asm183
-rw-r--r--rts/gmp/mpn/x86/pentium/gmp-mparam.h97
-rw-r--r--rts/gmp/mpn/x86/pentium/lshift.asm236
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h97
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/lshift.asm455
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/popham.asm30
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/rshift.asm460
-rw-r--r--rts/gmp/mpn/x86/pentium/mul_1.asm79
-rw-r--r--rts/gmp/mpn/x86/pentium/mul_basecase.asm135
-rw-r--r--rts/gmp/mpn/x86/pentium/rshift.asm236
-rw-r--r--rts/gmp/mpn/x86/pentium/sqr_basecase.asm520
-rw-r--r--rts/gmp/mpn/x86/rshift.asm92
-rw-r--r--rts/gmp/mpn/x86/udiv.asm44
-rw-r--r--rts/gmp/mpn/x86/umul.asm43
-rw-r--r--rts/gmp/mpn/x86/x86-defs.m4713
-rw-r--r--rts/gmp/mpn/z8000/add_n.s53
-rw-r--r--rts/gmp/mpn/z8000/gmp-mparam.h27
-rw-r--r--rts/gmp/mpn/z8000/mul_1.s68
-rw-r--r--rts/gmp/mpn/z8000/sub_n.s54
-rw-r--r--rts/gmp/mpn/z8000x/add_n.s56
-rw-r--r--rts/gmp/mpn/z8000x/sub_n.s56
-rw-r--r--rts/gmp/mpz/Makefile.am58
-rw-r--r--rts/gmp/mpz/Makefile.in457
-rw-r--r--rts/gmp/mpz/README23
-rw-r--r--rts/gmp/mpz/abs.c51
-rw-r--r--rts/gmp/mpz/add.c123
-rw-r--r--rts/gmp/mpz/add_ui.c84
-rw-r--r--rts/gmp/mpz/addmul_ui.c214
-rw-r--r--rts/gmp/mpz/and.c278
-rw-r--r--rts/gmp/mpz/array_init.c48
-rw-r--r--rts/gmp/mpz/bin_ui.c141
-rw-r--r--rts/gmp/mpz/bin_uiui.c120
-rw-r--r--rts/gmp/mpz/cdiv_q.c51
-rw-r--r--rts/gmp/mpz/cdiv_q_ui.c67
-rw-r--r--rts/gmp/mpz/cdiv_qr.c64
-rw-r--r--rts/gmp/mpz/cdiv_qr_ui.c71
-rw-r--r--rts/gmp/mpz/cdiv_r.c59
-rw-r--r--rts/gmp/mpz/cdiv_r_ui.c57
-rw-r--r--rts/gmp/mpz/cdiv_ui.c50
-rw-r--r--rts/gmp/mpz/clear.c35
-rw-r--r--rts/gmp/mpz/clrbit.c114
-rw-r--r--rts/gmp/mpz/cmp.c75
-rw-r--r--rts/gmp/mpz/cmp_si.c64
-rw-r--r--rts/gmp/mpz/cmp_ui.c53
-rw-r--r--rts/gmp/mpz/cmpabs.c57
-rw-r--r--rts/gmp/mpz/cmpabs_ui.c56
-rw-r--r--rts/gmp/mpz/com.c93
-rw-r--r--rts/gmp/mpz/divexact.c125
-rw-r--r--rts/gmp/mpz/dump.c44
-rw-r--r--rts/gmp/mpz/fac_ui.c157
-rw-r--r--rts/gmp/mpz/fdiv_q.c51
-rw-r--r--rts/gmp/mpz/fdiv_q_2exp.c104
-rw-r--r--rts/gmp/mpz/fdiv_q_ui.c65
-rw-r--r--rts/gmp/mpz/fdiv_qr.c64
-rw-r--r--rts/gmp/mpz/fdiv_qr_ui.c69
-rw-r--r--rts/gmp/mpz/fdiv_r.c58
-rw-r--r--rts/gmp/mpz/fdiv_r_2exp.c156
-rw-r--r--rts/gmp/mpz/fdiv_r_ui.c55
-rw-r--r--rts/gmp/mpz/fdiv_ui.c48
-rw-r--r--rts/gmp/mpz/fib_ui.c165
-rw-r--r--rts/gmp/mpz/fits_sint_p.c50
-rw-r--r--rts/gmp/mpz/fits_slong_p.c50
-rw-r--r--rts/gmp/mpz/fits_sshort_p.c50
-rw-r--r--rts/gmp/mpz/fits_uint_p.c41
-rw-r--r--rts/gmp/mpz/fits_ulong_p.c41
-rw-r--r--rts/gmp/mpz/fits_ushort_p.c41
-rw-r--r--rts/gmp/mpz/gcd.c180
-rw-r--r--rts/gmp/mpz/gcd_ui.c65
-rw-r--r--rts/gmp/mpz/gcdext.c137
-rw-r--r--rts/gmp/mpz/get_d.c128
-rw-r--r--rts/gmp/mpz/get_si.c43
-rw-r--r--rts/gmp/mpz/get_str.c118
-rw-r--r--rts/gmp/mpz/get_ui.c37
-rw-r--r--rts/gmp/mpz/getlimbn.c38
-rw-r--r--rts/gmp/mpz/hamdist.c62
-rw-r--r--rts/gmp/mpz/init.c36
-rw-r--r--rts/gmp/mpz/inp_raw.c101
-rw-r--r--rts/gmp/mpz/inp_str.c167
-rw-r--r--rts/gmp/mpz/invert.c77
-rw-r--r--rts/gmp/mpz/ior.c244
-rw-r--r--rts/gmp/mpz/iset.c49
-rw-r--r--rts/gmp/mpz/iset_d.c39
-rw-r--r--rts/gmp/mpz/iset_si.c49
-rw-r--r--rts/gmp/mpz/iset_str.c47
-rw-r--r--rts/gmp/mpz/iset_ui.c39
-rw-r--r--rts/gmp/mpz/jacobi.c53
-rw-r--r--rts/gmp/mpz/kronsz.c126
-rw-r--r--rts/gmp/mpz/kronuz.c115
-rw-r--r--rts/gmp/mpz/kronzs.c74
-rw-r--r--rts/gmp/mpz/kronzu.c66
-rw-r--r--rts/gmp/mpz/lcm.c61
-rw-r--r--rts/gmp/mpz/legendre.c184
-rw-r--r--rts/gmp/mpz/mod.c63
-rw-r--r--rts/gmp/mpz/mul.c131
-rw-r--r--rts/gmp/mpz/mul_2exp.c76
-rw-r--r--rts/gmp/mpz/mul_siui.c81
-rw-r--r--rts/gmp/mpz/neg.c53
-rw-r--r--rts/gmp/mpz/nextprime.c120
-rw-r--r--rts/gmp/mpz/out_raw.c89
-rw-r--r--rts/gmp/mpz/out_str.c108
-rw-r--r--rts/gmp/mpz/perfpow.c272
-rw-r--r--rts/gmp/mpz/perfsqr.c45
-rw-r--r--rts/gmp/mpz/popcount.c42
-rw-r--r--rts/gmp/mpz/pow_ui.c129
-rw-r--r--rts/gmp/mpz/powm.c364
-rw-r--r--rts/gmp/mpz/powm_ui.c248
-rw-r--r--rts/gmp/mpz/pprime_p.c242
-rw-r--r--rts/gmp/mpz/random.c56
-rw-r--r--rts/gmp/mpz/random2.c48
-rw-r--r--rts/gmp/mpz/realloc.c52
-rw-r--r--rts/gmp/mpz/remove.c93
-rw-r--r--rts/gmp/mpz/root.c183
-rw-r--r--rts/gmp/mpz/rrandomb.c117
-rw-r--r--rts/gmp/mpz/scan0.c35
-rw-r--r--rts/gmp/mpz/scan1.c35
-rw-r--r--rts/gmp/mpz/set.c48
-rw-r--r--rts/gmp/mpz/set_d.c96
-rw-r--r--rts/gmp/mpz/set_f.c64
-rw-r--r--rts/gmp/mpz/set_q.c36
-rw-r--r--rts/gmp/mpz/set_si.c48
-rw-r--r--rts/gmp/mpz/set_str.c157
-rw-r--r--rts/gmp/mpz/set_ui.c43
-rw-r--r--rts/gmp/mpz/setbit.c119
-rw-r--r--rts/gmp/mpz/size.c35
-rw-r--r--rts/gmp/mpz/sizeinbase.c60
-rw-r--r--rts/gmp/mpz/sqrt.c86
-rw-r--r--rts/gmp/mpz/sqrtrem.c111
-rw-r--r--rts/gmp/mpz/sub.c123
-rw-r--r--rts/gmp/mpz/sub_ui.c84
-rw-r--r--rts/gmp/mpz/swap.c52
-rw-r--r--rts/gmp/mpz/tdiv_q.c91
-rw-r--r--rts/gmp/mpz/tdiv_q_2exp.c68
-rw-r--r--rts/gmp/mpz/tdiv_q_ui.c64
-rw-r--r--rts/gmp/mpz/tdiv_qr.c130
-rw-r--r--rts/gmp/mpz/tdiv_qr_ui.c76
-rw-r--r--rts/gmp/mpz/tdiv_r.c98
-rw-r--r--rts/gmp/mpz/tdiv_r_2exp.c79
-rw-r--r--rts/gmp/mpz/tdiv_r_ui.c63
-rw-r--r--rts/gmp/mpz/tdiv_ui.c53
-rw-r--r--rts/gmp/mpz/tstbit.c70
-rw-r--r--rts/gmp/mpz/ui_pow_ui.c139
-rw-r--r--rts/gmp/mpz/urandomb.c49
-rw-r--r--rts/gmp/mpz/urandomm.c78
-rw-r--r--rts/gmp/mpz/xor.c217
-rw-r--r--rts/gmp/rand.c171
-rw-r--r--rts/gmp/randclr.c54
-rw-r--r--rts/gmp/randlc.c56
-rw-r--r--rts/gmp/randlc2x.c59
-rw-r--r--rts/gmp/randraw.c360
-rw-r--r--rts/gmp/randsd.c37
-rw-r--r--rts/gmp/randsdui.c37
-rw-r--r--rts/gmp/stack-alloc.c136
-rw-r--r--rts/gmp/stack-alloc.h64
-rw-r--r--rts/gmp/stamp-h.in1
-rw-r--r--rts/gmp/stamp-vti3
-rw-r--r--rts/gmp/urandom.h86
-rw-r--r--rts/gmp/version.c26
-rw-r--r--rts/gmp/version.texi3
-rw-r--r--rts/hooks/FlagDefaults.c20
-rw-r--r--rts/hooks/InitEachPE.c23
-rw-r--r--rts/hooks/MallocFail.c16
-rw-r--r--rts/hooks/OnExit.c19
-rw-r--r--rts/hooks/OutOfHeap.c19
-rw-r--r--rts/hooks/RtsOpts.c13
-rw-r--r--rts/hooks/ShutdownEachPEHook.c19
-rw-r--r--rts/hooks/StackOverflow.c16
-rw-r--r--rts/package.conf.in152
-rw-r--r--rts/parallel/0Hash.c320
-rw-r--r--rts/parallel/0Parallel.h414
-rw-r--r--rts/parallel/0Unpack.c440
-rw-r--r--rts/parallel/Dist.c117
-rw-r--r--rts/parallel/Dist.h20
-rw-r--r--rts/parallel/FetchMe.h24
-rw-r--r--rts/parallel/FetchMe.hc180
-rw-r--r--rts/parallel/Global.c1090
-rw-r--r--rts/parallel/GranSim.c3015
-rw-r--r--rts/parallel/GranSimRts.h268
-rw-r--r--rts/parallel/HLC.h63
-rw-r--r--rts/parallel/HLComms.c1810
-rw-r--r--rts/parallel/LLC.h130
-rw-r--r--rts/parallel/LLComms.c489
-rw-r--r--rts/parallel/PEOpCodes.h58
-rw-r--r--rts/parallel/Pack.c4293
-rw-r--r--rts/parallel/ParInit.c322
-rw-r--r--rts/parallel/ParInit.h19
-rw-r--r--rts/parallel/ParTicky.c450
-rw-r--r--rts/parallel/ParTicky.h60
-rw-r--r--rts/parallel/ParTypes.h38
-rw-r--r--rts/parallel/Parallel.c1140
-rw-r--r--rts/parallel/ParallelDebug.c1955
-rw-r--r--rts/parallel/ParallelDebug.h79
-rw-r--r--rts/parallel/ParallelRts.h253
-rw-r--r--rts/parallel/RBH.c337
-rw-r--r--rts/parallel/SysMan.c650
-rw-r--r--rts/posix/GetTime.c141
-rw-r--r--rts/posix/Itimer.c226
-rw-r--r--rts/posix/Itimer.h19
-rw-r--r--rts/posix/OSThreads.c166
-rw-r--r--rts/posix/Select.c279
-rw-r--r--rts/posix/Select.h26
-rw-r--r--rts/posix/Signals.c510
-rw-r--r--rts/posix/Signals.h26
-rw-r--r--rts/win32/AsyncIO.c345
-rw-r--r--rts/win32/AsyncIO.h25
-rw-r--r--rts/win32/AwaitEvent.c51
-rw-r--r--rts/win32/ConsoleHandler.c313
-rw-r--r--rts/win32/ConsoleHandler.h63
-rw-r--r--rts/win32/GetTime.c101
-rw-r--r--rts/win32/IOManager.c510
-rw-r--r--rts/win32/IOManager.h110
-rw-r--r--rts/win32/OSThreads.c199
-rw-r--r--rts/win32/Ticker.c124
-rw-r--r--rts/win32/WorkQueue.c215
-rw-r--r--rts/win32/WorkQueue.h37
684 files changed, 164348 insertions, 0 deletions
diff --git a/rts/Adjustor.c b/rts/Adjustor.c
new file mode 100644
index 0000000000..f3e5bfe6aa
--- /dev/null
+++ b/rts/Adjustor.c
@@ -0,0 +1,1110 @@
+/* -----------------------------------------------------------------------------
+ * Foreign export adjustor thunks
+ *
+ * Copyright (c) 1998.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/* A little bit of background...
+
+An adjustor thunk is a dynamically allocated code snippet that allows
+Haskell closures to be viewed as C function pointers.
+
+Stable pointers provide a way for the outside world to get access to,
+and evaluate, Haskell heap objects, with the RTS providing a small
+range of ops for doing so. So, assuming we've got a stable pointer in
+our hand in C, we can jump into the Haskell world and evaluate a callback
+procedure, say. This works OK in some cases where callbacks are used, but
+does require the external code to know about stable pointers and how to deal
+with them. We'd like to hide the Haskell-nature of a callback and have it
+be invoked just like any other C function pointer.
+
+Enter adjustor thunks. An adjustor thunk is a little piece of code
+that's generated on-the-fly (one per Haskell closure being exported)
+that, when entered using some 'universal' calling convention (e.g., the
+C calling convention on platform X), pushes an implicit stable pointer
+(to the Haskell callback) before calling another (static) C function stub
+which takes care of entering the Haskell code via its stable pointer.
+
+An adjustor thunk is allocated on the C heap, and is called from within
+Haskell just before handing out the function pointer to the Haskell (IO)
+action. User code should never have to invoke it explicitly.
+
+An adjustor thunk differs from a C function pointer in one respect: when
+the code is through with it, it has to be freed in order to release Haskell
+and C resources. Failure to do so result in memory leaks on both the C and
+Haskell side.
+*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsExternal.h"
+#include "RtsUtils.h"
+#include <stdlib.h>
+
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+#if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
+#include <string.h>
+#endif
+
+#ifdef LEADING_UNDERSCORE
+#define UNDERSCORE "_"
+#else
+#define UNDERSCORE ""
+#endif
+#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
+/*
+ Now here's something obscure for you:
+
+ When generating an adjustor thunk that uses the C calling
+ convention, we have to make sure that the thunk kicks off
+ the process of jumping into Haskell with a tail jump. Why?
+ Because as a result of jumping in into Haskell we may end
+ up freeing the very adjustor thunk we came from using
+ freeHaskellFunctionPtr(). Hence, we better not return to
+ the adjustor code on our way out, since it could by then
+ point to junk.
+
+ The fix is readily at hand, just include the opcodes
+ for the C stack fixup code that we need to perform when
+ returning in some static piece of memory and arrange
+ to return to it before tail jumping from the adjustor thunk.
+*/
+static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
+{
+ __asm__ (
+ ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
+ UNDERSCORE "obscure_ccall_ret_code:\n\t"
+ "addl $0x4, %esp\n\t"
+ "ret"
+ );
+}
+extern void obscure_ccall_ret_code(void);
+
+#if defined(openbsd_HOST_OS)
+static unsigned char *obscure_ccall_ret_code_dyn;
+#endif
+
+#endif
+
+#if defined(x86_64_HOST_ARCH)
+static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
+{
+ __asm__ (
+ ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
+ UNDERSCORE "obscure_ccall_ret_code:\n\t"
+ "addq $0x8, %rsp\n\t"
+ "ret"
+ );
+}
+extern void obscure_ccall_ret_code(void);
+#endif
+
+#if defined(alpha_HOST_ARCH)
+/* To get the definition of PAL_imb: */
+# if defined(linux_HOST_OS)
+# include <asm/pal.h>
+# else
+# include <machine/pal.h>
+# endif
+#endif
+
+#if defined(ia64_HOST_ARCH)
+#include "Storage.h"
+
+/* Layout of a function descriptor */
+typedef struct _IA64FunDesc {
+ StgWord64 ip;
+ StgWord64 gp;
+} IA64FunDesc;
+
+static void *
+stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
+{
+ StgArrWords* arr;
+ nat data_size_in_words, total_size_in_words;
+
+ /* round up to a whole number of words */
+ data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
+ total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
+
+ /* allocate and fill it in */
+ arr = (StgArrWords *)allocate(total_size_in_words);
+ SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
+
+ /* obtain a stable ptr */
+ *stable = getStablePtr((StgPtr)arr);
+
+ /* and return a ptr to the goods inside the array */
+ return(&(arr->payload));
+}
+#endif
+
+#if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
+__asm__("obscure_ccall_ret_code:\n\t"
+ "lwz 1,0(1)\n\t"
+ "lwz 0,4(1)\n\t"
+ "mtlr 0\n\t"
+ "blr");
+extern void obscure_ccall_ret_code(void);
+#endif
+
+#if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
+#if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))
+
+/* !!! !!! WARNING: !!! !!!
+ * This structure is accessed from AdjustorAsm.s
+ * Any changes here have to be mirrored in the offsets there.
+ */
+
+typedef struct AdjustorStub {
+#if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
+ unsigned lis;
+ unsigned ori;
+ unsigned lwz;
+ unsigned mtctr;
+ unsigned bctr;
+ StgFunPtr code;
+#elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS)
+ /* powerpc64-darwin: just guessing that it won't use fundescs. */
+ unsigned lis;
+ unsigned ori;
+ unsigned rldimi;
+ unsigned oris;
+ unsigned ori2;
+ unsigned lwz;
+ unsigned mtctr;
+ unsigned bctr;
+ StgFunPtr code;
+#else
+ /* fundesc-based ABIs */
+#define FUNDESCS
+ StgFunPtr code;
+ struct AdjustorStub
+ *toc;
+ void *env;
+#endif
+ StgStablePtr hptr;
+ StgFunPtr wptr;
+ StgInt negative_framesize;
+ StgInt extrawords_plus_one;
+} AdjustorStub;
+
+#endif
+#endif
+
+#if defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
+
+/* !!! !!! WARNING: !!! !!!
+ * This structure is accessed from AdjustorAsm.s
+ * Any changes here have to be mirrored in the offsets there.
+ */
+
+typedef struct AdjustorStub {
+ unsigned char call[8];
+ StgStablePtr hptr;
+ StgFunPtr wptr;
+ StgInt frame_size;
+ StgInt argument_size;
+} AdjustorStub;
+#endif
+
+#if defined(darwin_HOST_OS) || defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
+static int totalArgumentSize(char *typeString)
+{
+ int sz = 0;
+ while(*typeString)
+ {
+ char t = *typeString++;
+
+ switch(t)
+ {
+ // on 32-bit platforms, Double and Int64 occupy two words.
+ case 'd':
+ case 'l':
+ if(sizeof(void*) == 4)
+ {
+ sz += 2;
+ break;
+ }
+ // everything else is one word.
+ default:
+ sz += 1;
+ }
+ }
+ return sz;
+}
+#endif
+
+void*
+createAdjustor(int cconv, StgStablePtr hptr,
+ StgFunPtr wptr,
+ char *typeString
+#if !defined(powerpc_HOST_ARCH) && !defined(powerpc64_HOST_ARCH) && !defined(x86_64_HOST_ARCH)
+ STG_UNUSED
+#endif
+ )
+{
+ void *adjustor = NULL;
+
+ switch (cconv)
+ {
+ case 0: /* _stdcall */
+#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
+ /* Magic constant computed by inspecting the code length of
+ the following assembly language snippet
+ (offset and machine code prefixed):
+
+ <0>: 58 popl %eax # temp. remove ret addr..
+ <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
+ # hold a StgStablePtr
+ <6>: 50 pushl %eax # put back ret. addr
+ <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
+ <c>: ff e0 jmp %eax # and jump to it.
+ # the callee cleans up the stack
+ */
+ adjustor = stgMallocBytesRWX(14);
+ {
+ unsigned char *const adj_code = (unsigned char *)adjustor;
+ adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
+
+ adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
+ *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
+
+ adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
+
+ adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
+ *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
+
+ adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
+ adj_code[0x0d] = (unsigned char)0xe0;
+ }
+#endif
+ break;
+
+ case 1: /* _ccall */
+#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
+ /* Magic constant computed by inspecting the code length of
+ the following assembly language snippet
+ (offset and machine code prefixed):
+
+ <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
+ # hold a StgStablePtr
+ <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
+ <0a>: 68 ef be ad de pushl $obscure_ccall_ret_code # push the return address
+ <0f>: ff e0 jmp *%eax # jump to wptr
+
+ The ccall'ing version is a tad different, passing in the return
+ address of the caller to the auto-generated C stub (which enters
+ via the stable pointer.) (The auto-generated C stub is in on this
+ game, don't worry :-)
+
+ See the comment next to obscure_ccall_ret_code why we need to
+ perform a tail jump instead of a call, followed by some C stack
+ fixup.
+
+ Note: The adjustor makes the assumption that any return value
+ coming back from the C stub is not stored on the stack.
+ That's (thankfully) the case here with the restricted set of
+ return types that we support.
+ */
+ adjustor = stgMallocBytesRWX(17);
+ {
+ unsigned char *const adj_code = (unsigned char *)adjustor;
+
+ adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
+ *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
+
+ adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
+ *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
+
+ adj_code[0x0a] = (unsigned char)0x68; /* pushl obscure_ccall_ret_code */
+ *((StgFunPtr*)(adj_code + 0x0b)) =
+#if !defined(openbsd_HOST_OS)
+ (StgFunPtr)obscure_ccall_ret_code;
+#else
+ (StgFunPtr)obscure_ccall_ret_code_dyn;
+#endif
+
+ adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
+ adj_code[0x10] = (unsigned char)0xe0;
+ }
+#elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
+ {
+ /*
+ What's special about Darwin/Mac OS X on i386?
+ It wants the stack to stay 16-byte aligned.
+
+ We offload most of the work to AdjustorAsm.S.
+ */
+ AdjustorStub *adjustorStub = stgMallocBytesRWX(sizeof(AdjustorStub));
+ adjustor = adjustorStub;
+
+ extern void adjustorCode(void);
+ int sz = totalArgumentSize(typeString);
+
+ adjustorStub->call[0] = 0xe8;
+ *(long*)&adjustorStub->call[1] = ((char*)&adjustorCode) - ((char*)adjustorStub + 5);
+ adjustorStub->hptr = hptr;
+ adjustorStub->wptr = wptr;
+
+ // The adjustor puts the following things on the stack:
+ // 1.) %ebp link
+ // 2.) padding and (a copy of) the arguments
+ // 3.) a dummy argument
+ // 4.) hptr
+ // 5.) return address (for returning to the adjustor)
+ // All these have to add up to a multiple of 16.
+
+ // first, include everything in frame_size
+ adjustorStub->frame_size = sz * 4 + 16;
+ // align to 16 bytes
+ adjustorStub->frame_size = (adjustorStub->frame_size + 15) & ~15;
+ // only count 2.) and 3.) as part of frame_size
+ adjustorStub->frame_size -= 12;
+ adjustorStub->argument_size = sz;
+ }
+
+#elif defined(x86_64_HOST_ARCH)
+ /*
+ stack at call:
+ argn
+ ...
+ arg7
+ return address
+ %rdi,%rsi,%rdx,%rcx,%r8,%r9 = arg0..arg6
+
+ if there are <6 integer args, then we can just push the
+ StablePtr into %edi and shuffle the other args up.
+
+ If there are >=6 integer args, then we have to flush one arg
+ to the stack, and arrange to adjust the stack ptr on return.
+ The stack will be rearranged to this:
+
+ argn
+ ...
+ arg7
+ return address *** <-- dummy arg in stub fn.
+ arg6
+ obscure_ccall_ret_code
+
+ This unfortunately means that the type of the stub function
+ must have a dummy argument for the original return address
+ pointer inserted just after the 6th integer argument.
+
+ Code for the simple case:
+
+ 0: 4d 89 c1 mov %r8,%r9
+ 3: 49 89 c8 mov %rcx,%r8
+ 6: 48 89 d1 mov %rdx,%rcx
+ 9: 48 89 f2 mov %rsi,%rdx
+ c: 48 89 fe mov %rdi,%rsi
+ f: 48 8b 3d 0a 00 00 00 mov 10(%rip),%rdi
+ 16: ff 25 0c 00 00 00 jmpq *12(%rip)
+ ...
+ 20: .quad 0 # aligned on 8-byte boundary
+ 28: .quad 0 # aligned on 8-byte boundary
+
+
+ And the version for >=6 integer arguments:
+
+ 0: 41 51 push %r9
+ 2: ff 35 20 00 00 00 pushq 32(%rip) # 28 <ccall_adjustor+0x28>
+ 8: 4d 89 c1 mov %r8,%r9
+ b: 49 89 c8 mov %rcx,%r8
+ e: 48 89 d1 mov %rdx,%rcx
+ 11: 48 89 f2 mov %rsi,%rdx
+ 14: 48 89 fe mov %rdi,%rsi
+ 17: 48 8b 3d 12 00 00 00 mov 18(%rip),%rdi # 30 <ccall_adjustor+0x30>
+ 1e: ff 25 14 00 00 00 jmpq *20(%rip) # 38 <ccall_adjustor+0x38>
+ ...
+ 28: .quad 0 # aligned on 8-byte boundary
+ 30: .quad 0 # aligned on 8-byte boundary
+ 38: .quad 0 # aligned on 8-byte boundary
+ */
+
+ /* we assume the small code model (gcc -mcmmodel=small) where
+ * all symbols are <2^32, so hence wptr should fit into 32 bits.
+ */
+ ASSERT(((long)wptr >> 32) == 0);
+
+ {
+ int i = 0;
+ char *c;
+
+ // determine whether we have 6 or more integer arguments,
+ // and therefore need to flush one to the stack.
+ for (c = typeString; *c != '\0'; c++) {
+ if (*c == 'i' || *c == 'l') i++;
+ if (i == 6) break;
+ }
+
+ if (i < 6) {
+ adjustor = stgMallocBytesRWX(0x30);
+
+ *(StgInt32 *)adjustor = 0x49c1894d;
+ *(StgInt32 *)(adjustor+0x4) = 0x8948c889;
+ *(StgInt32 *)(adjustor+0x8) = 0xf28948d1;
+ *(StgInt32 *)(adjustor+0xc) = 0x48fe8948;
+ *(StgInt32 *)(adjustor+0x10) = 0x000a3d8b;
+ *(StgInt32 *)(adjustor+0x14) = 0x25ff0000;
+ *(StgInt32 *)(adjustor+0x18) = 0x0000000c;
+ *(StgInt64 *)(adjustor+0x20) = (StgInt64)hptr;
+ *(StgInt64 *)(adjustor+0x28) = (StgInt64)wptr;
+ }
+ else
+ {
+ adjustor = stgMallocBytesRWX(0x40);
+
+ *(StgInt32 *)adjustor = 0x35ff5141;
+ *(StgInt32 *)(adjustor+0x4) = 0x00000020;
+ *(StgInt32 *)(adjustor+0x8) = 0x49c1894d;
+ *(StgInt32 *)(adjustor+0xc) = 0x8948c889;
+ *(StgInt32 *)(adjustor+0x10) = 0xf28948d1;
+ *(StgInt32 *)(adjustor+0x14) = 0x48fe8948;
+ *(StgInt32 *)(adjustor+0x18) = 0x00123d8b;
+ *(StgInt32 *)(adjustor+0x1c) = 0x25ff0000;
+ *(StgInt32 *)(adjustor+0x20) = 0x00000014;
+
+ *(StgInt64 *)(adjustor+0x28) = (StgInt64)obscure_ccall_ret_code;
+ *(StgInt64 *)(adjustor+0x30) = (StgInt64)hptr;
+ *(StgInt64 *)(adjustor+0x38) = (StgInt64)wptr;
+ }
+ }
+#elif defined(sparc_HOST_ARCH)
+ /* Magic constant computed by inspecting the code length of the following
+ assembly language snippet (offset and machine code prefixed):
+
+ <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
+ <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
+ <08>: D823A05C st %o4, [%sp + 92]
+ <0C>: 9A10000B mov %o3, %o5
+ <10>: 9810000A mov %o2, %o4
+ <14>: 96100009 mov %o1, %o3
+ <18>: 94100008 mov %o0, %o2
+ <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
+ <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
+ <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
+ <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
+ <2C> 00000000 ! place for getting hptr back easily
+
+ ccall'ing on SPARC is easy, because we are quite lucky to push a
+ multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
+ existing arguments (note that %sp must stay double-word aligned at
+ all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
+ To do this, we extend the *caller's* stack frame by 2 words and shift
+ the output registers used for argument passing (%o0 - %o5, we are a *leaf*
+ procedure because of the tail-jump) by 2 positions. This makes room in
+ %o0 and %o1 for the additinal arguments, namely hptr and a dummy (used
+ for destination addr of jump on SPARC, return address on x86, ...). This
+ shouldn't cause any problems for a C-like caller: alloca is implemented
+ similarly, and local variables should be accessed via %fp, not %sp. In a
+ nutshell: This should work! (Famous last words! :-)
+ */
+ adjustor = stgMallocBytesRWX(4*(11+1));
+ {
+ unsigned long *const adj_code = (unsigned long *)adjustor;
+
+ adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
+ adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
+ adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
+ adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
+ adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
+ adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
+ adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
+ adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
+ adj_code[ 7] |= ((unsigned long)wptr) >> 10;
+ adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
+ adj_code[ 8] |= ((unsigned long)hptr) >> 10;
+ adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
+ adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
+ adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
+ adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
+
+ adj_code[11] = (unsigned long)hptr;
+
+ /* flush cache */
+ asm("flush %0" : : "r" (adj_code ));
+ asm("flush %0" : : "r" (adj_code + 2));
+ asm("flush %0" : : "r" (adj_code + 4));
+ asm("flush %0" : : "r" (adj_code + 6));
+ asm("flush %0" : : "r" (adj_code + 10));
+
+ /* max. 5 instructions latency, and we need at >= 1 for returning */
+ asm("nop");
+ asm("nop");
+ asm("nop");
+ asm("nop");
+ }
+#elif defined(alpha_HOST_ARCH)
+ /* Magic constant computed by inspecting the code length of
+ the following assembly language snippet
+ (offset and machine code prefixed; note that the machine code
+ shown is longwords stored in little-endian order):
+
+ <00>: 46520414 mov a2, a4
+ <04>: 46100412 mov a0, a2
+ <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
+ <0c>: 46730415 mov a3, a5
+ <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
+ <14>: 46310413 mov a1, a3
+ <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
+ <1c>: 00000000 # padding for alignment
+ <20>: [8 bytes for hptr quadword]
+ <28>: [8 bytes for wptr quadword]
+
+ The "computed" jump at <08> above is really a jump to a fixed
+ location. Accordingly, we place an always-correct hint in the
+ jump instruction, namely the address offset from <0c> to wptr,
+ divided by 4, taking the lowest 14 bits.
+
+ We only support passing 4 or fewer argument words, for the same
+ reason described under sparc_HOST_ARCH above by JRS, 21 Aug 01.
+ On the Alpha the first 6 integer arguments are in a0 through a5,
+ and the rest on the stack. Hence we want to shuffle the original
+ caller's arguments by two.
+
+ On the Alpha the calling convention is so complex and dependent
+ on the callee's signature -- for example, the stack pointer has
+ to be a multiple of 16 -- that it seems impossible to me [ccshan]
+ to handle the general case correctly without changing how the
+ adjustor is called from C. For now, our solution of shuffling
+ registers only and ignoring the stack only works if the original
+ caller passed 4 or fewer argument words.
+
+TODO: Depending on how much allocation overhead stgMallocBytes uses for
+ header information (more precisely, if the overhead is no more than
+ 4 bytes), we should move the first three instructions above down by
+ 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
+ */
+ ASSERT(((StgWord64)wptr & 3) == 0);
+ adjustor = stgMallocBytesRWX(48);
+ {
+ StgWord64 *const code = (StgWord64 *)adjustor;
+
+ code[0] = 0x4610041246520414L;
+ code[1] = 0x46730415a61b0020L;
+ code[2] = 0x46310413a77b0028L;
+ code[3] = 0x000000006bfb0000L
+ | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
+
+ code[4] = (StgWord64)hptr;
+ code[5] = (StgWord64)wptr;
+
+ /* Ensure that instruction cache is consistent with our new code */
+ __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
+ }
+#elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
+
+#define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
+#define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
+ {
+ /* The PowerPC Linux (32-bit) calling convention is annoyingly complex.
+ We need to calculate all the details of the stack frame layout,
+ taking into account the types of all the arguments, and then
+ generate code on the fly. */
+
+ int src_gpr = 3, dst_gpr = 5;
+ int fpr = 3;
+ int src_offset = 0, dst_offset = 0;
+ int n = strlen(typeString),i;
+ int src_locs[n], dst_locs[n];
+ int frameSize;
+ unsigned *code;
+
+ /* Step 1:
+ Calculate where the arguments should go.
+ src_locs[] will contain the locations of the arguments in the
+ original stack frame passed to the adjustor.
+ dst_locs[] will contain the locations of the arguments after the
+ adjustor runs, on entry to the wrapper proc pointed to by wptr.
+
+ This algorithm is based on the one described on page 3-19 of the
+ System V ABI PowerPC Processor Supplement.
+ */
+ for(i=0;typeString[i];i++)
+ {
+ char t = typeString[i];
+ if((t == 'f' || t == 'd') && fpr <= 8)
+ src_locs[i] = dst_locs[i] = -32-(fpr++);
+ else
+ {
+ if(t == 'l' && src_gpr <= 9)
+ {
+ if((src_gpr & 1) == 0)
+ src_gpr++;
+ src_locs[i] = -src_gpr;
+ src_gpr += 2;
+ }
+ else if(t == 'i' && src_gpr <= 10)
+ {
+ src_locs[i] = -(src_gpr++);
+ }
+ else
+ {
+ if(t == 'l' || t == 'd')
+ {
+ if(src_offset % 8)
+ src_offset += 4;
+ }
+ src_locs[i] = src_offset;
+ src_offset += (t == 'l' || t == 'd') ? 8 : 4;
+ }
+
+ if(t == 'l' && dst_gpr <= 9)
+ {
+ if((dst_gpr & 1) == 0)
+ dst_gpr++;
+ dst_locs[i] = -dst_gpr;
+ dst_gpr += 2;
+ }
+ else if(t == 'i' && dst_gpr <= 10)
+ {
+ dst_locs[i] = -(dst_gpr++);
+ }
+ else
+ {
+ if(t == 'l' || t == 'd')
+ {
+ if(dst_offset % 8)
+ dst_offset += 4;
+ }
+ dst_locs[i] = dst_offset;
+ dst_offset += (t == 'l' || t == 'd') ? 8 : 4;
+ }
+ }
+ }
+
+ frameSize = dst_offset + 8;
+ frameSize = (frameSize+15) & ~0xF;
+
+ /* Step 2:
+ Build the adjustor.
+ */
+ // allocate space for at most 4 insns per parameter
+ // plus 14 more instructions.
+ adjustor = stgMallocBytesRWX(4 * (4*n + 14));
+ code = (unsigned*)adjustor;
+
+ *code++ = 0x48000008; // b *+8
+ // * Put the hptr in a place where freeHaskellFunctionPtr
+ // can get at it.
+ *code++ = (unsigned) hptr;
+
+ // * save the link register
+ *code++ = 0x7c0802a6; // mflr r0;
+ *code++ = 0x90010004; // stw r0, 4(r1);
+ // * and build a new stack frame
+ *code++ = OP_LO(0x9421, -frameSize); // stwu r1, -frameSize(r1)
+
+ // * now generate instructions to copy arguments
+ // from the old stack frame into the new stack frame.
+ for(i=n-1;i>=0;i--)
+ {
+ if(src_locs[i] < -32)
+ ASSERT(dst_locs[i] == src_locs[i]);
+ else if(src_locs[i] < 0)
+ {
+ // source in GPR.
+ ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
+ if(dst_locs[i] < 0)
+ {
+ ASSERT(dst_locs[i] > -32);
+ // dst is in GPR, too.
+
+ if(typeString[i] == 'l')
+ {
+ // mr dst+1, src+1
+ *code++ = 0x7c000378
+ | ((-dst_locs[i]+1) << 16)
+ | ((-src_locs[i]+1) << 11)
+ | ((-src_locs[i]+1) << 21);
+ }
+ // mr dst, src
+ *code++ = 0x7c000378
+ | ((-dst_locs[i]) << 16)
+ | ((-src_locs[i]) << 11)
+ | ((-src_locs[i]) << 21);
+ }
+ else
+ {
+ if(typeString[i] == 'l')
+ {
+ // stw src+1, dst_offset+4(r1)
+ *code++ = 0x90010000
+ | ((-src_locs[i]+1) << 21)
+ | (dst_locs[i] + 4);
+ }
+
+ // stw src, dst_offset(r1)
+ *code++ = 0x90010000
+ | ((-src_locs[i]) << 21)
+ | (dst_locs[i] + 8);
+ }
+ }
+ else
+ {
+ ASSERT(dst_locs[i] >= 0);
+ ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
+
+ if(typeString[i] == 'l')
+ {
+ // lwz r0, src_offset(r1)
+ *code++ = 0x80010000
+ | (src_locs[i] + frameSize + 8 + 4);
+ // stw r0, dst_offset(r1)
+ *code++ = 0x90010000
+ | (dst_locs[i] + 8 + 4);
+ }
+ // lwz r0, src_offset(r1)
+ *code++ = 0x80010000
+ | (src_locs[i] + frameSize + 8);
+ // stw r0, dst_offset(r1)
+ *code++ = 0x90010000
+ | (dst_locs[i] + 8);
+ }
+ }
+
+ // * hptr will be the new first argument.
+ // lis r3, hi(hptr)
+ *code++ = OP_HI(0x3c60, hptr);
+ // ori r3,r3,lo(hptr)
+ *code++ = OP_LO(0x6063, hptr);
+
+ // * we need to return to a piece of code
+ // which will tear down the stack frame.
+ // lis r11,hi(obscure_ccall_ret_code)
+ *code++ = OP_HI(0x3d60, obscure_ccall_ret_code);
+ // ori r11,r11,lo(obscure_ccall_ret_code)
+ *code++ = OP_LO(0x616b, obscure_ccall_ret_code);
+ // mtlr r11
+ *code++ = 0x7d6803a6;
+
+ // * jump to wptr
+ // lis r11,hi(wptr)
+ *code++ = OP_HI(0x3d60, wptr);
+ // ori r11,r11,lo(wptr)
+ *code++ = OP_LO(0x616b, wptr);
+ // mtctr r11
+ *code++ = 0x7d6903a6;
+ // bctr
+ *code++ = 0x4e800420;
+
+ // Flush the Instruction cache:
+ {
+ unsigned *p = adjustor;
+ while(p < code)
+ {
+ __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
+ : : "r" (p));
+ p++;
+ }
+ __asm__ volatile ("sync\n\tisync");
+ }
+ }
+
+#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
+
+#define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
+#define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
+ {
+ /* The following code applies to all PowerPC and PowerPC64 platforms
+ whose stack layout is based on the AIX ABI.
+
+ Besides (obviously) AIX, this includes
+ Mac OS 9 and BeOS/PPC (may they rest in peace),
+ which use the 32-bit AIX ABI
+ powerpc64-linux,
+ which uses the 64-bit AIX ABI
+ and Darwin (Mac OS X),
+ which uses the same stack layout as AIX,
+ but no function descriptors.
+
+ The actual stack-frame shuffling is implemented out-of-line
+ in the function adjustorCode, in AdjustorAsm.S.
+ Here, we set up an AdjustorStub structure, which
+ is a function descriptor (on platforms that have function
+ descriptors) or a short piece of stub code (on Darwin) to call
+ adjustorCode with a pointer to the AdjustorStub struct loaded
+ into register r2.
+
+ One nice thing about this is that there is _no_ code generated at
+ runtime on the platforms that have function descriptors.
+ */
+ AdjustorStub *adjustorStub;
+ int sz = 0, extra_sz, total_sz;
+
+ // from AdjustorAsm.s
+ // not declared as a function so that AIX-style
+ // fundescs can never get in the way.
+ extern void *adjustorCode;
+
+#ifdef FUNDESCS
+ adjustorStub = stgMallocBytes(sizeof(AdjustorStub), "createAdjustor");
+#else
+ adjustorStub = stgMallocBytesRWX(sizeof(AdjustorStub));
+#endif
+ adjustor = adjustorStub;
+
+ adjustorStub->code = (void*) &adjustorCode;
+
+#ifdef FUNDESCS
+ // function descriptors are a cool idea.
+ // We don't need to generate any code at runtime.
+ adjustorStub->toc = adjustorStub;
+#else
+
+ // no function descriptors :-(
+ // We need to do things "by hand".
+#if defined(powerpc_HOST_ARCH)
+ // lis r2, hi(adjustorStub)
+ adjustorStub->lis = OP_HI(0x3c40, adjustorStub);
+ // ori r2, r2, lo(adjustorStub)
+ adjustorStub->ori = OP_LO(0x6042, adjustorStub);
+ // lwz r0, code(r2)
+ adjustorStub->lwz = OP_LO(0x8002, (char*)(&adjustorStub->code)
+ - (char*)adjustorStub);
+ // mtctr r0
+ adjustorStub->mtctr = 0x7c0903a6;
+ // bctr
+ adjustorStub->bctr = 0x4e800420;
+#else
+ barf("adjustor creation not supported on this platform");
+#endif
+
+ // Flush the Instruction cache:
+ {
+ int n = sizeof(AdjustorStub)/sizeof(unsigned);
+ unsigned *p = (unsigned*)adjustor;
+ while(n--)
+ {
+ __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
+ : : "r" (p));
+ p++;
+ }
+ __asm__ volatile ("sync\n\tisync");
+ }
+#endif
+
+ // Calculate the size of the stack frame, in words.
+ sz = totalArgumentSize(typeString);
+
+ // The first eight words of the parameter area
+ // are just "backing store" for the parameters passed in
+ // the GPRs. extra_sz is the number of words beyond those first
+ // 8 words.
+ extra_sz = sz - 8;
+ if(extra_sz < 0)
+ extra_sz = 0;
+
+ // Calculate the total size of the stack frame.
+ total_sz = (6 /* linkage area */
+ + 8 /* minimum parameter area */
+ + 2 /* two extra arguments */
+ + extra_sz)*sizeof(StgWord);
+
+ // align to 16 bytes.
+ // AIX only requires 8 bytes, but who cares?
+ total_sz = (total_sz+15) & ~0xF;
+
+ // Fill in the information that adjustorCode in AdjustorAsm.S
+ // will use to create a new stack frame with the additional args.
+ adjustorStub->hptr = hptr;
+ adjustorStub->wptr = wptr;
+ adjustorStub->negative_framesize = -total_sz;
+ adjustorStub->extrawords_plus_one = extra_sz + 1;
+ }
+
+#elif defined(ia64_HOST_ARCH)
+/*
+ Up to 8 inputs are passed in registers. We flush the last two inputs to
+ the stack, initially into the 16-byte scratch region left by the caller.
+ We then shuffle the others along by 4 (taking 2 registers for ourselves
+ to save return address and previous function state - we need to come back
+ here on the way out to restore the stack, so this is a real function
+ rather than just a trampoline).
+
+ The function descriptor we create contains the gp of the target function
+ so gp is already loaded correctly.
+
+ [MLX] alloc r16=ar.pfs,10,2,0
+ movl r17=wptr
+ [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
+ mov r41=r37 // out7 = in5 (out3)
+ mov r40=r36;; // out6 = in4 (out2)
+ [MII] st8.spill [r12]=r39 // spill in7 (out5)
+ mov.sptk b6=r17,50
+ mov r38=r34;; // out4 = in2 (out0)
+ [MII] mov r39=r35 // out5 = in3 (out1)
+ mov r37=r33 // out3 = in1 (loc1)
+ mov r36=r32 // out2 = in0 (loc0)
+ [MLX] adds r12=-24,r12 // update sp
+ movl r34=hptr;; // out0 = hptr
+ [MIB] mov r33=r16 // loc1 = ar.pfs
+ mov r32=b0 // loc0 = retaddr
+ br.call.sptk.many b0=b6;;
+
+ [MII] adds r12=-16,r12
+ mov b0=r32
+ mov.i ar.pfs=r33
+ [MFB] nop.m 0x0
+ nop.f 0x0
+ br.ret.sptk.many b0;;
+*/
+
+/* These macros distribute a long constant into the two words of an MLX bundle */
+#define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
+#define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
+#define MOVL_HIWORD(val) (BITS(val,40,23) | (BITS(val,0,7) << 36) | (BITS(val,7,9) << 50) \
+ | (BITS(val,16,5) << 55) | (BITS(val,21,1) << 44) | BITS(val,63,1) << 59)
+
+ {
+ StgStablePtr stable;
+ IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
+ StgWord64 wcode = wdesc->ip;
+ IA64FunDesc *fdesc;
+ StgWord64 *code;
+
+ /* we allocate on the Haskell heap since malloc'd memory isn't executable - argh */
+ adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8, &stable);
+
+ fdesc = (IA64FunDesc *)adjustor;
+ code = (StgWord64 *)(fdesc + 1);
+ fdesc->ip = (StgWord64)code;
+ fdesc->gp = wdesc->gp;
+
+ code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
+ code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
+ code[2] = 0x029015d818984001;
+ code[3] = 0x8401200500420094;
+ code[4] = 0x886011d8189c0001;
+ code[5] = 0x84011004c00380c0;
+ code[6] = 0x0250210046013800;
+ code[7] = 0x8401000480420084;
+ code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
+ code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
+ code[10] = 0x0200210020010811;
+ code[11] = 0x1080006800006200;
+ code[12] = 0x0000210018406000;
+ code[13] = 0x00aa021000038005;
+ code[14] = 0x000000010000001d;
+ code[15] = 0x0084000880000200;
+
+ /* save stable pointers in convenient form */
+ code[16] = (StgWord64)hptr;
+ code[17] = (StgWord64)stable;
+ }
+#else
+ barf("adjustor creation not supported on this platform");
+#endif
+ break;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ /* Have fun! */
+ return adjustor;
+}
+
+
+void
+freeHaskellFunctionPtr(void* ptr)
+{
+#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
+ if ( *(unsigned char*)ptr != 0x68 &&
+ *(unsigned char*)ptr != 0x58 ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+
+ /* Free the stable pointer first..*/
+ if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
+ freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
+ } else {
+ freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
+ }
+#elif defined(x86_TARGET_ARCH) && defined(darwin_HOST_OS)
+if ( *(unsigned char*)ptr != 0xe8 ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+ freeStablePtr(((AdjustorStub*)ptr)->hptr);
+#elif defined(x86_64_HOST_ARCH)
+ if ( *(StgWord16 *)ptr == 0x894d ) {
+ freeStablePtr(*(StgStablePtr*)(ptr+0x20));
+ } else if ( *(StgWord16 *)ptr == 0x5141 ) {
+ freeStablePtr(*(StgStablePtr*)(ptr+0x30));
+ } else {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+#elif defined(sparc_HOST_ARCH)
+ if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+
+ /* Free the stable pointer first..*/
+ freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
+#elif defined(alpha_HOST_ARCH)
+ if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+
+ /* Free the stable pointer first..*/
+ freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
+#elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
+ if ( *(StgWord*)ptr != 0x48000008 ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+ freeStablePtr(((StgStablePtr*)ptr)[1]);
+#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
+ extern void* adjustorCode;
+ if ( ((AdjustorStub*)ptr)->code != (StgFunPtr) &adjustorCode ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+ freeStablePtr(((AdjustorStub*)ptr)->hptr);
+#elif defined(ia64_HOST_ARCH)
+ IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
+ StgWord64 *code = (StgWord64 *)(fdesc+1);
+
+ if (fdesc->ip != (StgWord64)code) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+ freeStablePtr((StgStablePtr)code[16]);
+ freeStablePtr((StgStablePtr)code[17]);
+ return;
+#else
+ ASSERT(0);
+#endif
+ *((unsigned char*)ptr) = '\0';
+
+ stgFree(ptr);
+}
+
+
+/*
+ * Function: initAdjustor()
+ *
+ * Perform initialisation of adjustor thunk layer (if needed.)
+ */
+void
+initAdjustor(void)
+{
+#if defined(i386_HOST_ARCH) && defined(openbsd_HOST_OS)
+ obscure_ccall_ret_code_dyn = stgMallocBytesRWX(4);
+ obscure_ccall_ret_code_dyn[0] = ((unsigned char *)obscure_ccall_ret_code)[0];
+ obscure_ccall_ret_code_dyn[1] = ((unsigned char *)obscure_ccall_ret_code)[1];
+ obscure_ccall_ret_code_dyn[2] = ((unsigned char *)obscure_ccall_ret_code)[2];
+ obscure_ccall_ret_code_dyn[3] = ((unsigned char *)obscure_ccall_ret_code)[3];
+#endif
+}
diff --git a/rts/AdjustorAsm.S b/rts/AdjustorAsm.S
new file mode 100644
index 0000000000..cfdef68349
--- /dev/null
+++ b/rts/AdjustorAsm.S
@@ -0,0 +1,189 @@
+#include "../includes/ghcconfig.h"
+
+/* ******************************** PowerPC ******************************** */
+
+#if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
+#if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))
+ /* The following code applies, with some differences,
+ to all powerpc platforms except for powerpc32-linux,
+ whose calling convention is annoyingly complex.
+ */
+
+
+ /* The code is "almost" the same for
+ 32-bit and for 64-bit
+ */
+#if defined(powerpc64_HOST_ARCH)
+#define WS 8
+#define LOAD ld
+#define STORE std
+#else
+#define WS 4
+#define LOAD lwz
+#define STORE stw
+#endif
+
+ /* Some info about stack frame layout */
+#define LINK_SLOT (2*WS)
+#define LINKAGE_AREA_SIZE (6*WS)
+
+ /* The following defines mirror struct AdjustorStub
+ from Adjustor.c. Make sure to keep these in sync.
+ */
+#if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
+#define HEADER_WORDS 6
+#elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS) /* NOTE(review): no HEADER_WORDS defined for ppc64-darwin -- confirm this configuration is intentionally unsupported here */
+#else
+#define HEADER_WORDS 3
+#endif
+
+#define HPTR_OFF ((HEADER_WORDS )*WS)
+#define WPTR_OFF ((HEADER_WORDS + 1)*WS)
+#define FRAMESIZE_OFF ((HEADER_WORDS + 2)*WS)
+#define EXTRA_WORDS_OFF ((HEADER_WORDS + 3)*WS)
+
+ /* Darwin insists on register names, everyone else prefers
+ to use numbers. */
+#if !defined(darwin_HOST_OS)
+#define r0 0
+#define r1 1
+#define r2 2
+#define r3 3
+#define r4 4
+#define r5 5
+#define r6 6
+#define r7 7
+#define r8 8
+#define r9 9
+#define r10 10
+#define r11 11
+#define r12 12
+
+#define r30 30
+#define r31 31
+#endif
+
+
+.text
+#if LEADING_UNDERSCORE
+    .globl _adjustorCode
+_adjustorCode:
+#else
+    .globl adjustorCode
+    /* Note that we don't build a function descriptor
+       for AIX-derived ABIs here. This will happen at runtime
+       in createAdjustor().
+    */
+adjustorCode:
+#endif
+    /* On entry, r2 will point to the AdjustorStub data structure. */
+
+    /* save the link */
+    mflr    r0
+    STORE   r0, LINK_SLOT(r1)
+
+    /* set up stack frame (size taken from the AdjustorStub;
+       stwux/stdux stores the back-chain and bumps r1 in one go) */
+    LOAD    r12, FRAMESIZE_OFF(r2)
+#ifdef powerpc64_HOST_ARCH
+    stdux   r1, r1, r12
+#else
+    stwux   r1, r1, r12
+#endif
+
+    /* Save some regs so that we can use them.
+       Note that we use the "Red Zone" below the stack pointer.
+    */
+    STORE   r31, -WS(r1)
+    STORE   r30, -2*WS(r1)
+
+    /* r31 = new frame, r30 = caller's frame (new SP minus the
+       negative frame size loaded above) */
+    mr      r31, r1
+    subf    r30, r12, r31
+
+    /* Copy the caller's stack-passed argument words into the new
+       frame, shifted up by two slots to make room for the two extra
+       leading arguments introduced below.
+       NOTE(review): the CTR is loaded straight from EXTRA_WORDS_OFF and
+       the loop is entered at the bdnz, so the copy body runs one time
+       fewer than the value stored there -- presumably Adjustor.c stores
+       extra_words + 1 in this field; confirm against createAdjustor(). */
+    LOAD    r12, EXTRA_WORDS_OFF(r2)
+    mtctr   r12
+    b       2f
+1:
+    LOAD    r0, LINKAGE_AREA_SIZE + 8*WS(r30)
+    STORE   r0, LINKAGE_AREA_SIZE + 10*WS(r31)
+    addi    r30, r30, WS
+    addi    r31, r31, WS
+2:
+    bdnz    1b
+
+    /* Restore r30 and r31 now.
+    */
+    LOAD    r31, -WS(r1)
+    LOAD    r30, -2*WS(r1)
+
+    /* Shift the register arguments up by two: r3..r8 -> r5..r10, and
+       spill the old r9/r10 into the first two stack argument slots. */
+    STORE   r10, LINKAGE_AREA_SIZE + 9*WS(r1)
+    STORE   r9, LINKAGE_AREA_SIZE + 8*WS(r1)
+    mr      r10, r8
+    mr      r9, r7
+    mr      r8, r6
+    mr      r7, r5
+    mr      r6, r4
+    mr      r5, r3
+
+    /* First argument becomes the stable pointer to the Haskell closure. */
+    LOAD    r3, HPTR_OFF(r2)
+
+    /* Call the wrapper; on non-Darwin ABIs WPTR is a function
+       descriptor, so load the entry address and TOC from it. */
+    LOAD    r12, WPTR_OFF(r2)
+#if defined(darwin_HOST_OS)
+    mtctr   r12
+#else
+    LOAD    r0, 0(r12)
+    /* The function we're calling will never be a nested function,
+       so we don't load r11.
+    */
+    mtctr   r0
+    LOAD    r2, WS(r12)
+#endif
+    bctrl
+
+    /* tear down the frame via the back-chain and return */
+    LOAD    r1, 0(r1)
+    LOAD    r0, LINK_SLOT(r1)
+    mtlr    r0
+    blr
+#endif
+#endif
+
+/* ********************************* i386 ********************************** */
+
+#elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
+
+#define WS 4
+#define RETVAL_OFF 5
+#define HEADER_BYTES 8
+
+#define HPTR_OFF HEADER_BYTES
+#define WPTR_OFF (HEADER_BYTES + 1*WS)
+#define FRAMESIZE_OFF (HEADER_BYTES + 2*WS)
+#define ARGWORDS_OFF (HEADER_BYTES + 3*WS)
+
+    .globl _adjustorCode
+_adjustorCode:
+    /* The stub reaches us via a call, so the address on the stack is
+       RETVAL_OFF bytes past the start of the AdjustorStub; recover the
+       stub base into %eax (hedged: RETVAL_OFF must match the stub
+       layout built in Adjustor.c -- confirm there). */
+    popl    %eax
+    subl    $RETVAL_OFF, %eax
+
+    pushl   %ebp
+    movl    %esp, %ebp
+
+    /* carve out the new argument area, sized by the stub */
+    subl    FRAMESIZE_OFF(%eax), %esp
+
+    pushl   %esi
+    pushl   %edi
+
+    /* rep movsl: copy ARGWORDS_OFF words of caller arguments
+       (starting above the saved %ebp/return addr at 8(%ebp)) into the
+       new frame; 12(%esp) skips the two pushes plus the hptr slot. */
+    leal    8(%ebp), %esi
+    leal    12(%esp), %edi
+    movl    ARGWORDS_OFF(%eax), %ecx
+    rep
+    movsl
+
+    popl    %edi
+    popl    %esi
+
+    /* prepend the stable pointer and call the wrapper */
+    pushl   HPTR_OFF(%eax)
+    call    *WPTR_OFF(%eax)
+
+    leave
+    ret
+#endif
+
diff --git a/rts/Apply.cmm b/rts/Apply.cmm
new file mode 100644
index 0000000000..e0ca03944c
--- /dev/null
+++ b/rts/Apply.cmm
@@ -0,0 +1,268 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The University of Glasgow 2004
+ *
+ * Application-related bits.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * -------------------------------------------------------------------------- */
+
+#include "Cmm.h"
+
+/* ----------------------------------------------------------------------------
+ * Evaluate a closure and return it.
+ *
+ * There isn't an info table / return address version of stg_ap_0, because
+ * everything being returned is guaranteed evaluated, so it would be a no-op.
+ */
+
+STRING(stg_ap_0_ret_str,"stg_ap_0_ret... ")
+
+stg_ap_0_fast
+{
+    // fn is in R1, no args on the stack
+    //
+    // "Apply to zero args" just means: evaluate R1 and return it.
+    // Debug builds trace the closure and sanity-check the TSO stack
+    // before entering.
+
+    IF_DEBUG(apply,
+        foreign "C" debugBelch(stg_ap_0_ret_str) [R1];
+        foreign "C" printClosure(R1 "ptr") [R1]);
+
+    IF_DEBUG(sanity,
+        foreign "C" checkStackChunk(Sp "ptr",
+                                    CurrentTSO + TSO_OFFSET_StgTSO_stack +
+                                    WDS(TO_W_(StgTSO_stack_size(CurrentTSO))) "ptr") [R1]);
+
+    ENTER();
+}
+
+/* -----------------------------------------------------------------------------
+ Entry Code for a PAP.
+
+ This entry code is *only* called by one of the stg_ap functions.
+ On entry: Sp points to the remaining arguments on the stack. If
+ the stack check fails, we can just push the PAP on the stack and
+ return to the scheduler.
+
+ On entry: R1 points to the PAP. The rest of the function's
+ arguments (apart from those that are already in the PAP) are on the
+ stack, starting at Sp(0). R2 contains an info table which
+ describes these arguments, which is used in the event that the
+ stack check in the entry code below fails. The info table is
+ currently one of the stg_ap_*_ret family, as this code is always
+ entered from those functions.
+
+ The idea is to copy the chunk of stack from the PAP object onto the
+ stack / into registers, and enter the function.
+ -------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_PAP,/*special layout*/0,0,PAP,"PAP","PAP")
+{ foreign "C" barf("PAP object entered!"); }  // PAPs are only applied via stg_PAP_apply, never entered directly
+
+stg_PAP_apply
+{
+  // Apply a PAP (R1) to the additional arguments currently on the
+  // stack; R2 holds an stg_ap_*_ret info table describing those args,
+  // used only if the stack check fails.  See the comment block above.
+  W_ Words;
+  W_ pap;
+
+  pap = R1;
+
+  Words = TO_W_(StgPAP_n_args(pap));
+
+  //
+  // Check for stack overflow and bump the stack pointer.
+  // We have a hand-rolled stack check fragment here, because none of
+  // the canned ones suit this situation.
+  //
+  if ((Sp - WDS(Words)) < SpLim) {
+      // there is a return address in R2 in the event of a
+      // stack check failure.  The various stg_apply functions arrange
+      // this before calling stg_PAP_entry.
+      Sp_adj(-1);
+      Sp(0) = R2;
+      jump stg_gc_unpt_r1;
+  }
+  Sp_adj(-Words);
+
+  // profiling
+  TICK_ENT_PAP();
+  LDV_ENTER(pap);
+  // Enter PAP cost centre
+  ENTER_CCS_PAP_CL(pap);
+
+  R1 = StgPAP_fun(pap);
+
+  // Reload the stack: copy the PAP's saved argument chunk (payload)
+  // back onto the stack below the args already there.
+  W_ i;
+  W_ p;
+  p = pap + SIZEOF_StgHeader + OFFSET_StgPAP_payload;
+  i = 0;
+for:
+  if (i < Words) {
+    Sp(i) = W_[p];
+    p = p + WDS(1);
+    i = i + 1;
+    goto for;
+  }
+
+  // Off we go!
+  TICK_ENT_VIA_NODE();
+
+#ifdef NO_ARG_REGS
+  jump %GET_ENTRY(R1);
+#else
+      // Dispatch on the function's calling convention: generic arg
+      // patterns go via the slow-apply entry, BCOs bounce to the
+      // interpreter, and known small patterns use the canned table.
+      W_ info;
+      info = %GET_FUN_INFO(R1);
+      W_ type;
+      type = TO_W_(StgFunInfoExtra_fun_type(info));
+      if (type == ARG_GEN) {
+	  jump StgFunInfoExtra_slow_apply(info);
+      }
+      if (type == ARG_GEN_BIG) {
+	  jump StgFunInfoExtra_slow_apply(info);
+      }
+      if (type == ARG_BCO) {
+	  Sp_adj(-2);
+	  Sp(1) = R1;
+	  Sp(0) = stg_apply_interp_info;
+	  jump stg_yield_to_interpreter;
+      }
+      jump W_[stg_ap_stack_entries +
+		WDS(TO_W_(StgFunInfoExtra_fun_type(info)))];
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Entry Code for an AP (a PAP with arity zero).
+
+ The entry code is very similar to a PAP, except there are no
+ further arguments on the stack to worry about, so the stack check
+ is simpler. We must also push an update frame on the stack before
+ applying the function.
+ -------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_AP,/*special layout*/0,0,AP,"AP","AP")
+{
+  // An AP is a thunk: push an update frame so the result overwrites
+  // this closure, reload its saved arguments, and enter the function.
+  W_ Words;
+  W_ ap;
+
+  ap = R1;
+
+  Words = TO_W_(StgAP_n_args(ap));
+
+  /*
+   * Check for stack overflow.  IMPORTANT: use a _NP check here,
+   * because if the check fails, we might end up blackholing this very
+   * closure, in which case we must enter the blackhole on return rather
+   * than continuing to evaluate the now-defunct closure.
+   */
+  STK_CHK_NP(WDS(Words) + SIZEOF_StgUpdateFrame);
+
+  PUSH_UPD_FRAME(Sp - SIZEOF_StgUpdateFrame, R1);
+  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(Words);
+
+  TICK_ENT_AP();
+  LDV_ENTER(ap);
+
+  // Enter PAP cost centre
+  ENTER_CCS_PAP_CL(ap);   // ToDo: ENTER_CC_AP_CL
+
+  R1 = StgAP_fun(ap);
+
+  // Reload the stack: the AP's payload holds the argument chunk.
+  W_ i;
+  W_ p;
+  p = ap + SIZEOF_StgHeader + OFFSET_StgAP_payload;
+  i = 0;
+for:
+  if (i < Words) {
+    Sp(i) = W_[p];
+    p = p + WDS(1);
+    i = i + 1;
+    goto for;
+  }
+
+  // Off we go!
+  TICK_ENT_VIA_NODE();
+
+#ifdef NO_ARG_REGS
+  jump %GET_ENTRY(R1);
+#else
+      // Same calling-convention dispatch as stg_PAP_apply above.
+      W_ info;
+      info = %GET_FUN_INFO(R1);
+      W_ type;
+      type = TO_W_(StgFunInfoExtra_fun_type(info));
+      if (type == ARG_GEN) {
+	  jump StgFunInfoExtra_slow_apply(info);
+      }
+      if (type == ARG_GEN_BIG) {
+	  jump StgFunInfoExtra_slow_apply(info);
+      }
+      if (type == ARG_BCO) {
+	  Sp_adj(-2);
+	  Sp(1) = R1;
+	  Sp(0) = stg_apply_interp_info;
+	  jump stg_yield_to_interpreter;
+      }
+      jump W_[stg_ap_stack_entries +
+		WDS(TO_W_(StgFunInfoExtra_fun_type(info)))];
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Entry Code for an AP_STACK.
+
+ Very similar to a PAP and AP. The layout is the same as PAP
+ and AP, except that the payload is a chunk of stack instead of
+ being described by the function's info table. Like an AP,
+ there are no further arguments on the stack to worry about.
+ However, the function closure (ap->fun) does not necessarily point
+ directly to a function, so we have to enter it using stg_ap_0.
+ -------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK")
+{
+  // Like stg_AP, but the payload is a saved chunk of stack rather
+  // than an argument list, and the function closure may not be a
+  // function proper -- so finish with ENTER() (stg_ap_0 semantics)
+  // instead of the calling-convention dispatch.
+  W_ Words;
+  W_ ap;
+
+  ap = R1;
+
+  Words = StgAP_STACK_size(ap);
+
+  /*
+   * Check for stack overflow.  IMPORTANT: use a _NP check here,
+   * because if the check fails, we might end up blackholing this very
+   * closure, in which case we must enter the blackhole on return rather
+   * than continuing to evaluate the now-defunct closure.
+   */
+  STK_CHK_NP(WDS(Words) + SIZEOF_StgUpdateFrame);
+
+  PUSH_UPD_FRAME(Sp - SIZEOF_StgUpdateFrame, R1);
+  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(Words);
+
+  TICK_ENT_AP();
+  LDV_ENTER(ap);
+
+  // Enter PAP cost centre
+  ENTER_CCS_PAP_CL(ap);   // ToDo: ENTER_CC_AP_CL
+
+  R1 = StgAP_STACK_fun(ap);
+
+  // Reload the stack: replay the saved stack chunk.
+  W_ i;
+  W_ p;
+  p = ap + SIZEOF_StgHeader + OFFSET_StgAP_STACK_payload;
+  i = 0;
+for:
+  if (i < Words) {
+    Sp(i) = W_[p];
+    p = p + WDS(1);
+    i = i + 1;
+    goto for;
+  }
+
+  // Off we go!
+  TICK_ENT_VIA_NODE();
+
+  ENTER();
+}
diff --git a/rts/Apply.h b/rts/Apply.h
new file mode 100644
index 0000000000..76e36cb9fb
--- /dev/null
+++ b/rts/Apply.h
@@ -0,0 +1,29 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The University of Glasgow 2002-2004
+ *
+ * Declarations for things defined in AutoApply.cmm
+ *
+ * -------------------------------------------------------------------------- */
+
+#ifndef APPLY_H
+#define APPLY_H
+
+// canned slow entry points, indexed by arg type (ARG_P, ARG_PP, etc.)
+#ifdef IN_STG_CODE
+extern StgWord stg_ap_stack_entries[];
+#else
+extern StgFun *stg_ap_stack_entries[];
+#endif
+
+// canned register save code for heap check failure in a function
+#ifdef IN_STG_CODE
+extern StgWord stg_stack_save_entries[];
+#else
+extern StgFun *stg_stack_save_entries[];
+#endif
+
+// canned bitmap for each arg type
+extern StgWord stg_arg_bitmaps[];
+
+#endif /* APPLY_H */
diff --git a/rts/Arena.c b/rts/Arena.c
new file mode 100644
index 0000000000..76ac23cf88
--- /dev/null
+++ b/rts/Arena.c
@@ -0,0 +1,120 @@
+/* -----------------------------------------------------------------------------
+ (c) The University of Glasgow 2001
+
+ Arena allocation. Arenas provide fast memory allocation at the
+ expense of fine-grained recycling of storage: memory may be
+ only be returned to the system by freeing the entire arena, it
+ isn't possible to return individual objects within an arena.
+
+ Do not assume that sequentially allocated objects will be adjacent
+ in memory.
+
+ Quirks: this allocator makes use of the RTS block allocator. If
+ the current block doesn't have enough room for the requested
+ object, then a new block is allocated. This means that allocating
+ large objects will tend to result in wasted space at the end of
+ each block. In the worst case, half of the allocated space is
+ wasted. This allocator is therefore best suited to situations in
+ which most allocations are small.
+ -------------------------------------------------------------------------- */
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "BlockAlloc.h"
+#include "Arena.h"
+
+#include <stdlib.h>
+
+// Each arena struct is allocated using malloc().
+struct _Arena {
+ bdescr *current;
+ StgWord *free; // ptr to next free byte in current block
+ StgWord *lim; // limit (== last free byte + 1)
+};
+
+// We like to keep track of how many blocks we've allocated for
+// Storage.c:memInventory().
+static long arena_blocks = 0;
+
+// Begin a new arena.  Allocates the Arena struct with malloc and seeds
+// it with one block from the RTS block allocator; arena_blocks is
+// bumped so Storage.c's memInventory() can account for it.
+Arena *
+newArena( void )
+{
+    Arena *arena;
+
+    arena = stgMallocBytes(sizeof(Arena), "newArena");
+    arena->current = allocBlock();
+    arena->current->link = NULL;
+    arena->free = arena->current->start;
+    arena->lim = arena->current->start + BLOCK_SIZE_W;
+    arena_blocks++;
+
+    return arena;
+}
+
+// The minimum alignment of an allocated block.
+#define MIN_ALIGN 8
+
+/* 'n' is assumed to be a power of 2 */
+#define ROUNDUP(x,n) (((x)+((n)-1))&(~((n)-1)))
+#define B_TO_W(x) ((x) / sizeof(W_))
+
+// Allocate some memory in an arena.
+//
+// Returns a MIN_ALIGN-aligned pointer to 'size' bytes that live until
+// arenaFree() is called on the whole arena.  If the request doesn't fit
+// in the current block a fresh block group is chained on; the tail of
+// the old current block is simply wasted (see header comment).
+//
+// Fix: the fit test was 'free + size_w < lim', which rejected an
+// allocation that exactly filled the current block and wastefully
+// spilled it into a brand-new block.  'lim' points one past the last
+// usable word (see newArena / the refill branch below), so '<=' is the
+// correct comparison.
+void *
+arenaAlloc( Arena *arena, size_t size )
+{
+    void *p;
+    nat size_w;
+    nat req_blocks;
+    bdescr *bd;
+
+    // round up to nearest alignment chunk.
+    size = ROUNDUP(size,MIN_ALIGN);
+
+    // size of allocated block in words.
+    size_w = B_TO_W(size);
+
+    if ( arena->free + size_w <= arena->lim ) {
+	// enough room in the current block...
+	p = arena->free;
+	arena->free += size_w;
+	return p;
+    } else {
+	// allocate a fresh block...
+	req_blocks =  (lnat)BLOCK_ROUND_UP(size) / BLOCK_SIZE;
+	bd = allocGroup(req_blocks);
+	arena_blocks += req_blocks;
+
+	bd->gen_no  = 0;
+	bd->step    = NULL;
+	bd->flags   = 0;
+	bd->free    = bd->start;
+	bd->link    = arena->current;
+	arena->current = bd;
+	arena->free = bd->free + size_w;
+	arena->lim = bd->free + bd->blocks * BLOCK_SIZE_W;
+	return bd->start;
+    }
+}
+
+// Free an entire arena: return every block group on the arena's chain
+// to the block allocator (adjusting the arena_blocks inventory), then
+// free the Arena struct itself.  Individual arenaAlloc() results cannot
+// be freed separately.
+void
+arenaFree( Arena *arena )
+{
+    bdescr *bd, *next;
+
+    for (bd = arena->current; bd != NULL; bd = next) {
+	next = bd->link;
+	arena_blocks -= bd->blocks;
+	ASSERT(arena_blocks >= 0);
+	freeGroup(bd);
+    }
+    stgFree(arena);
+}
+
+// Total number of blocks currently held by all live arenas; used by
+// Storage.c:memInventory() for the memory accounting sanity check.
+unsigned long
+arenaBlocks( void )
+{
+    return arena_blocks;
+}
+
diff --git a/rts/Arena.h b/rts/Arena.h
new file mode 100644
index 0000000000..7a2989e543
--- /dev/null
+++ b/rts/Arena.h
@@ -0,0 +1,25 @@
+/* -----------------------------------------------------------------------------
+ (c) The University of Glasgow 2001
+
+ Arena allocation interface.
+ -------------------------------------------------------------------------- */
+
+#ifndef ARENA_H
+#define ARENA_H
+
+// Abstract type of arenas
+typedef struct _Arena Arena;
+
+// Start a new arena
+extern Arena * newArena ( void );
+
+// Allocate memory in an arena
+extern void * arenaAlloc ( Arena *, size_t );
+
+// Free an entire arena
+extern void arenaFree ( Arena * );
+
+// For internal use only:
+extern unsigned long arenaBlocks( void );
+
+#endif /* ARENA_H */
diff --git a/rts/AutoApply.h b/rts/AutoApply.h
new file mode 100644
index 0000000000..bbec1224ff
--- /dev/null
+++ b/rts/AutoApply.h
@@ -0,0 +1,80 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The University of Glasgow 2002-2004
+ *
+ * Helper bits for the generic apply code (AutoApply.hc)
+ *
+ * -------------------------------------------------------------------------- */
+
+#ifndef AUTOAPPLY_H
+#define AUTOAPPLY_H
+
+// Build a new PAP: function is in R1
+// ret addr and m arguments taking up n words are on the stack.
+// NB. x is a dummy argument attached to the 'for' label so that
+// BUILD_PAP can be used multiple times in the same function.
+//
+// On exit: R1 points at the freshly allocated PAP holding the n
+// argument words, the ret addr slot plus the n words are popped, and
+// we return through the code now on the stack top.  'arity' (the
+// function's arity) is assumed to be in scope at the expansion site;
+// f is the label to retry from after a heap-check failure.
+#define BUILD_PAP(m,n,f,x)				\
+    W_ pap;						\
+    W_ size;						\
+    W_ i;						\
+    size = SIZEOF_StgPAP + WDS(n);			\
+    HP_CHK_NP_ASSIGN_SP0(size,f);			\
+    TICK_ALLOC_HEAP_NOCTR(BYTES_TO_WDS(size));		\
+    TICK_ALLOC_PAP(n+1 /* +1 for the FUN */, 0);	\
+    pap = Hp + WDS(1) - size;				\
+    SET_HDR(pap, stg_PAP_info, W_[CCCS]);		\
+    StgPAP_arity(pap) = HALF_W_(arity - m);		\
+    StgPAP_fun(pap) = R1;				\
+    StgPAP_n_args(pap) = HALF_W_(n);			\
+    i = 0;						\
+  for##x:						\
+    if (i < n) {					\
+	StgPAP_payload(pap,i) = Sp(1+i);		\
+	i = i + 1;					\
+	goto for##x;					\
+    }							\
+    R1 = pap;						\
+    Sp_adj(1 + n);					\
+    jump %ENTRY_CODE(Sp(0));
+
+// Copy the old PAP, build a new one with the extra arg(s)
+// ret addr and m arguments taking up n words are on the stack.
+// NB. x is a dummy argument attached to the 'for' label so that
+// NEW_PAP can be used multiple times in the same function.
+//
+// The old PAP (in R1) is left untouched -- PAPs are immutable and may
+// be shared -- so its payload is copied into the new PAP and the n new
+// argument words are appended after it.
+#define NEW_PAP(m,n,f,x)					\
+     W_ pap;							\
+     W_ new_pap;						\
+     W_ size;							\
+     W_ i;							\
+     pap = R1;							\
+     size = SIZEOF_StgPAP + WDS(TO_W_(StgPAP_n_args(pap))) + WDS(n); \
+     HP_CHK_NP_ASSIGN_SP0(size,f);				\
+     TICK_ALLOC_HEAP_NOCTR(BYTES_TO_WDS(size));			\
+     TICK_ALLOC_PAP(n+1 /* +1 for the FUN */, 0);		\
+     new_pap = Hp + WDS(1) - size;				\
+     SET_HDR(new_pap, stg_PAP_info, W_[CCCS]);			\
+     StgPAP_arity(new_pap) = HALF_W_(arity - m);		\
+     W_ n_args;							\
+     n_args = TO_W_(StgPAP_n_args(pap));			\
+     StgPAP_n_args(new_pap) = HALF_W_(n_args + n);		\
+     StgPAP_fun(new_pap) = StgPAP_fun(pap);			\
+     i = 0;							\
+   for1##x:							\
+     if (i < n_args) {						\
+	 StgPAP_payload(new_pap,i) = StgPAP_payload(pap,i);	\
+	 i = i + 1;						\
+	 goto for1##x;						\
+     }								\
+     i = 0;							\
+   for2##x:							\
+     if (i < n) {						\
+	 StgPAP_payload(new_pap,n_args+i) = Sp(1+i);		\
+	 i = i + 1;						\
+	 goto for2##x;						\
+     }								\
+     R1 = new_pap;						\
+     Sp_adj(n+1);						\
+     jump %ENTRY_CODE(Sp(0));
+
+#endif /* AUTOAPPLY_H */
+
diff --git a/rts/AwaitEvent.h b/rts/AwaitEvent.h
new file mode 100644
index 0000000000..e03cb4444e
--- /dev/null
+++ b/rts/AwaitEvent.h
@@ -0,0 +1,24 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2005
+ *
+ * The awaitEvent() interface, for the non-threaded RTS
+ *
+ * -------------------------------------------------------------------------*/
+
+#ifndef AWAITEVENT_H
+#define AWAITEVENT_H
+
+#if !defined(THREADED_RTS)
+/* awaitEvent(rtsBool wait)
+ *
+ * Checks for blocked threads that need to be woken.
+ *
+ * Called from STG : NO
+ * Locks assumed : sched_mutex
+ */
+void awaitEvent(rtsBool wait); /* In posix/Select.c or
+ * win32/AwaitEvent.c */
+#endif
+
+#endif /* AWAITEVENT_H */
diff --git a/rts/BlockAlloc.c b/rts/BlockAlloc.c
new file mode 100644
index 0000000000..5e0e321947
--- /dev/null
+++ b/rts/BlockAlloc.c
@@ -0,0 +1,391 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2006
+ *
+ * The block allocator and free list manager.
+ *
+ * This is the architecture independent part of the block allocator.
+ * It requires only the following support from the operating system:
+ *
+ * void *getMBlock();
+ *
+ * returns the address of an MBLOCK_SIZE region of memory, aligned on
+ * an MBLOCK_SIZE boundary. There is no requirement for successive
+ * calls to getMBlock to return strictly increasing addresses.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "BlockAlloc.h"
+#include "MBlock.h"
+#include "Storage.h"
+
+#include <string.h>
+
+static void initMBlock(void *mblock);
+static bdescr *allocMegaGroup(nat mblocks);
+static void freeMegaGroup(bdescr *bd);
+
+// In THREADED_RTS mode, the free list is protected by sm_mutex.
+static bdescr *free_list = NULL;
+
+/* -----------------------------------------------------------------------------
+ Initialisation
+ -------------------------------------------------------------------------- */
+
+// One-time initialisation of the block allocator.  Nothing to do:
+// free_list is statically NULL and blocks are fetched lazily from
+// getMBlock() on first demand.
+void initBlockAllocator(void)
+{
+    // The free list starts off NULL
+}
+
+/* -----------------------------------------------------------------------------
+ Allocation
+ -------------------------------------------------------------------------- */
+
+// Initialise the n block descriptors of a group starting at 'head':
+// the head carries the group's size and free pointer, while every
+// non-head descriptor is zeroed and back-linked to the head (the
+// invariant checkWellFormedGroup() verifies).
+STATIC_INLINE void
+initGroup(nat n, bdescr *head)
+{
+  bdescr *bd;
+  nat i;
+
+  if (n != 0) {
+    head->blocks = n;
+    head->free   = head->start;
+    head->link = NULL;
+    for (i=1, bd = head+1; i < n; i++, bd++) {
+      bd->free = 0;
+      bd->blocks = 0;
+      bd->link = head;
+    }
+  }
+}
+
+// Allocate a group of n contiguous blocks (first-fit over the
+// address-ordered free list).  Caller must hold the SM lock.
+// Requests larger than a megablock are delegated to allocMegaGroup().
+bdescr *
+allocGroup(nat n)
+{
+    void *mblock;
+    bdescr *bd, **last;
+
+    ASSERT_SM_LOCK();
+    ASSERT(n != 0);
+
+    if (n > BLOCKS_PER_MBLOCK) {
+	return allocMegaGroup(BLOCKS_TO_MBLOCKS(n));
+    }
+
+    last = &free_list;
+    for (bd = free_list; bd != NULL; bd = bd->link) {
+	if (bd->blocks == n) {	        /* exactly the right size! */
+	    *last = bd->link;
+	    /* no initialisation necessary - this is already a
+	     * self-contained block group. */
+	    bd->free = bd->start;	/* block isn't free now */
+	    bd->link = NULL;
+	    return bd;
+	}
+	if (bd->blocks > n) {	        /* block too big... */
+	    bd->blocks -= n;	        /* take a chunk off the *end*,
+	                                   so the group left on the free
+	                                   list needs no relinking */
+	    bd += bd->blocks;
+	    initGroup(n, bd);		/* initialise it */
+	    return bd;
+	}
+	last = &bd->link;
+    }
+
+    /* free list exhausted: take a fresh megablock and put the unused
+       remainder back on the free list */
+    mblock = getMBlock();	/* get a new megablock */
+    initMBlock(mblock);		/* initialise the start fields */
+    bd = FIRST_BDESCR(mblock);
+    initGroup(n,bd);		/* we know the group will fit */
+    if (n < BLOCKS_PER_MBLOCK) {
+	initGroup(BLOCKS_PER_MBLOCK-n, bd+n);
+	freeGroup(bd+n);      	/* add the rest on to the free list */
+    }
+    return bd;
+}
+
+// As allocGroup(), but acquires/releases the SM lock around the call.
+bdescr *
+allocGroup_lock(nat n)
+{
+    bdescr *bd;
+    ACQUIRE_SM_LOCK;
+    bd = allocGroup(n);
+    RELEASE_SM_LOCK;
+    return bd;
+}
+
+// Allocate a single block (group of one); SM lock must be held.
+bdescr *
+allocBlock(void)
+{
+    return allocGroup(1);
+}
+
+// As allocBlock(), but acquires/releases the SM lock around the call.
+bdescr *
+allocBlock_lock(void)
+{
+    bdescr *bd;
+    ACQUIRE_SM_LOCK;
+    bd = allocBlock();
+    RELEASE_SM_LOCK;
+    return bd;
+}
+
+/* -----------------------------------------------------------------------------
+ Any request larger than BLOCKS_PER_MBLOCK needs a megablock group.
+ First, search the free list for enough contiguous megablocks to
+ fulfill the request - if we don't have enough, we need to
+ allocate some new ones.
+
+ A megablock group looks just like a normal block group, except that
+ the blocks field in the head will be larger than BLOCKS_PER_MBLOCK.
+
+ Note that any objects placed in this group must start in the first
+ megablock, since the other blocks don't have block descriptors.
+ -------------------------------------------------------------------------- */
+
+// Allocate n contiguous megablocks (see the comment block above).
+// First scans the address-ordered free list for a run of n whole,
+// address-adjacent megablocks; if no such run exists, n brand-new
+// mblocks are taken from the OS instead (any partial matches on the
+// free list are left there).
+static bdescr *
+allocMegaGroup(nat n)
+{
+    nat mbs_found;
+    bdescr *bd, *last, *grp_start, *grp_prev;
+
+    mbs_found = 0;
+    grp_start = NULL;
+    grp_prev = NULL;
+    last = NULL;
+    for (bd = free_list; bd != NULL; bd = bd->link) {
+
+	if (bd->blocks == BLOCKS_PER_MBLOCK) {	/* whole megablock found */
+
+	    /* is it the first one we've found or a non-contiguous megablock? */
+	    if (grp_start == NULL ||
+		bd->start != last->start + MBLOCK_SIZE/sizeof(W_)) {
+		grp_start = bd;
+		grp_prev  = last;
+		mbs_found = 1;
+	    } else {
+		mbs_found++;
+	    }
+
+	    if (mbs_found == n) {	/* found enough contig megablocks? */
+		break;
+	    }
+	}
+
+	else {			/* only a partial megablock, start again */
+	    grp_start = NULL;
+	}
+
+	last = bd;
+    }
+
+    /* found all the megablocks we need on the free list
+     */
+    if (mbs_found == n) {
+	/* remove the megablocks from the free list: grp_prev is the
+	   node just before the run, bd is the run's last node */
+	if (grp_prev == NULL) {	/* bd now points to the last mblock */
+	    free_list = bd->link;
+	} else {
+	    grp_prev->link = bd->link;
+	}
+    }
+
+    /* the free list wasn't sufficient, allocate all new mblocks.
+     */
+    else {
+	void *mblock = getMBlocks(n);
+	initMBlock(mblock);	/* only need to init the 1st one */
+	grp_start = FIRST_BDESCR(mblock);
+    }
+
+    /* set up the megablock group: only the first mblock has usable
+       descriptors, so the head records the full size via
+       MBLOCK_GROUP_BLOCKS(n) */
+    initGroup(BLOCKS_PER_MBLOCK, grp_start);
+    grp_start->blocks = MBLOCK_GROUP_BLOCKS(n);
+    return grp_start;
+}
+
+/* -----------------------------------------------------------------------------
+ De-Allocation
+ -------------------------------------------------------------------------- */
+
+/* coalesce the group p with p->link if possible.
+ *
+ * Returns p->link if no coalescing was done, otherwise returns a
+ * pointer to the newly enlarged group p.
+ */
+
+// Merge group p with its free-list successor q when they are
+// address-adjacent: p absorbs q's blocks and every one of q's
+// descriptors is re-initialised as a non-head descriptor pointing
+// back at p (restoring the well-formed-group invariant).
+STATIC_INLINE bdescr *
+coalesce(bdescr *p)
+{
+    bdescr *bd, *q;
+    nat i, blocks;
+
+    q = p->link;
+    if (q != NULL && p->start + p->blocks * BLOCK_SIZE_W == q->start) {
+	/* can coalesce */
+	p->blocks += q->blocks;
+	p->link    = q->link;
+	blocks = q->blocks;
+	for (i = 0, bd = q; i < blocks; bd++, i++) {
+	    bd->free = 0;
+	    bd->blocks = 0;
+	    bd->link = p;
+	}
+	return p;
+    }
+    return q;
+}
+
+// Return a block group to the free list, keeping the list sorted by
+// address and coalescing with both neighbours where possible.
+// Caller must hold the SM lock.  Megablock groups take a separate path.
+void
+freeGroup(bdescr *p)
+{
+  bdescr *bd, *last;
+
+  ASSERT_SM_LOCK();
+
+  /* are we dealing with a megablock group? */
+  if (p->blocks > BLOCKS_PER_MBLOCK) {
+    freeMegaGroup(p);
+    return;
+  }
+
+
+  p->free = (void *)-1;  /* indicates that this block is free */
+  p->step = NULL;
+  p->gen_no = 0;
+  /* fill the block group with garbage if sanity checking is on */
+  IF_DEBUG(sanity,memset(p->start, 0xaa, p->blocks * BLOCK_SIZE));
+
+  /* find correct place in free list to place new group */
+  last = NULL;
+  for (bd = free_list; bd != NULL && bd->start < p->start;
+       bd = bd->link) {
+    last = bd;
+  }
+
+  /* now, last = previous group (or NULL) */
+  if (last == NULL) {
+    p->link = free_list;
+    free_list = p;
+  } else {
+    /* coalesce with previous group if possible */
+    p->link = last->link;
+    last->link = p;
+    p = coalesce(last);   /* p now heads the (possibly merged) group */
+  }
+
+  /* coalesce with next group if possible */
+  coalesce(p);
+  IF_DEBUG(sanity, checkFreeListSanity());
+}
+
+// As freeGroup(), but acquires/releases the SM lock around the call.
+void
+freeGroup_lock(bdescr *p)
+{
+    ACQUIRE_SM_LOCK;
+    freeGroup(p);
+    RELEASE_SM_LOCK;
+}
+
+// Free a megablock group by splitting it back into its constituent
+// megablocks and handing each one to freeGroup() as an ordinary
+// full-mblock group.  Mblocks are never returned to the OS.
+// NOTE: arithmetic on the void* q relies on the GCC extension that
+// sizeof(void) == 1.
+static void
+freeMegaGroup(bdescr *p)
+{
+  nat n;
+  void *q = p;
+
+  /* head->blocks holds MBLOCK_GROUP_BLOCKS(n); invert it to recover
+     the megablock count (the +1 accounts for the first, partial-size
+     accounting of the head mblock) */
+  n = ((bdescr *)q)->blocks * BLOCK_SIZE / MBLOCK_SIZE + 1;
+  for (; n > 0; q += MBLOCK_SIZE, n--) {
+    initMBlock(MBLOCK_ROUND_DOWN(q));
+    initGroup(BLOCKS_PER_MBLOCK, (bdescr *)q);
+    freeGroup((bdescr *)q);
+  }
+}
+
+// Free a whole chain of block groups linked through bd->link.
+// The next pointer is saved before each freeGroup() call because
+// freeGroup() overwrites bd->link when threading the free list.
+void
+freeChain(bdescr *bd)
+{
+  bdescr *next_bd;
+  while (bd != NULL) {
+    next_bd = bd->link;
+    freeGroup(bd);
+    bd = next_bd;
+  }
+}
+
+// As freeChain(), but acquires/releases the SM lock around the call.
+void
+freeChain_lock(bdescr *bd)
+{
+    ACQUIRE_SM_LOCK;
+    freeChain(bd);
+    RELEASE_SM_LOCK;
+}
+
+// Set up the block-descriptor area of a fresh megablock: give every
+// usable block's descriptor its 'start' pointer.  All other descriptor
+// fields are left for initGroup() to fill in later.
+static void
+initMBlock(void *mblock)
+{
+  bdescr *bd;
+  void *block;
+
+  /* the first few Bdescr's in a block are unused, so we don't want to
+   * put them all on the free list.
+   */
+  block = FIRST_BLOCK(mblock);
+  bd    = FIRST_BDESCR(mblock);
+
+  /* Initialise the start field of each block descriptor
+   */
+  for (; block <= LAST_BLOCK(mblock); bd += 1, block += BLOCK_SIZE) {
+    bd->start = block;
+  }
+}
+
+/* -----------------------------------------------------------------------------
+ Debugging
+ -------------------------------------------------------------------------- */
+
+#ifdef DEBUG
+// Debug check: every non-head descriptor of a group must be zeroed
+// and back-linked to the head, exactly as initGroup()/coalesce()
+// leave them.
+static void
+checkWellFormedGroup( bdescr *bd )
+{
+    nat i;
+
+    for (i = 1; i < bd->blocks; i++) {
+	ASSERT(bd[i].blocks == 0);
+	ASSERT(bd[i].free   == 0);
+	ASSERT(bd[i].link   == bd);
+    }
+}
+
+// Debug check over the whole free list: groups are non-empty and
+// well-formed, the list is strictly address-ordered, and no two
+// adjacent entries could have been coalesced.
+// NOTE(review): "%d" is used for bd->blocks, which is an unsigned
+// 'nat' -- harmless for realistic sizes but strictly the wrong
+// format specifier.
+void
+checkFreeListSanity(void)
+{
+    bdescr *bd;
+
+    for (bd = free_list; bd != NULL; bd = bd->link) {
+	IF_DEBUG(block_alloc,
+		 debugBelch("group at 0x%p, length %d blocks\n",
+			    bd->start, bd->blocks));
+	ASSERT(bd->blocks > 0);
+	checkWellFormedGroup(bd);
+	if (bd->link != NULL) {
+	    /* make sure we're fully coalesced */
+	    ASSERT(bd->start + bd->blocks * BLOCK_SIZE_W != bd->link->start);
+	    ASSERT(bd->start < bd->link->start);
+	}
+    }
+}
+
+// Count the blocks currently on the free list (for memInventory()).
+// NOTE(review): the total is accumulated in an lnat but returned as a
+// nat, so it would truncate on a 64-bit platform with a pathologically
+// large free list -- presumably not reachable in practice; confirm.
+nat /* BLOCKS */
+countFreeList(void)
+{
+  bdescr *bd;
+  lnat total_blocks = 0;
+
+  for (bd = free_list; bd != NULL; bd = bd->link) {
+    total_blocks += bd->blocks;
+  }
+  return total_blocks;
+}
diff --git a/rts/BlockAlloc.h b/rts/BlockAlloc.h
new file mode 100644
index 0000000000..1472ac6f76
--- /dev/null
+++ b/rts/BlockAlloc.h
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-1999
+ *
+ * Block Allocator Interface
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef BLOCK_ALLOC_H
+#define BLOCK_ALLOC_H
+
+/* Debugging -------------------------------------------------------------- */
+
+#ifdef DEBUG
+extern void checkFreeListSanity(void);
+nat countFreeList(void);
+#endif
+
+#endif /* BLOCK_ALLOC_H */
diff --git a/rts/Capability.c b/rts/Capability.c
new file mode 100644
index 0000000000..51a42ef468
--- /dev/null
+++ b/rts/Capability.c
@@ -0,0 +1,668 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2003-2006
+ *
+ * Capabilities
+ *
+ * A Capability represents the token required to execute STG code,
+ * and all the state an OS thread/task needs to run Haskell code:
+ * its STG registers, a pointer to its TSO, a nursery etc. During
+ * STG execution, a pointer to the capability is kept in a
+ * register (BaseReg; actually it is a pointer to cap->r).
+ *
+ * Only in a THREADED_RTS build will there be multiple capabilities,
+ * for non-threaded builds there is only one global capability, namely
+ * MainCapability.
+ *
+ * --------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "STM.h"
+#include "OSThreads.h"
+#include "Capability.h"
+#include "Schedule.h"
+#include "Sparks.h"
+
+// one global capability, this is the Capability for non-threaded
+// builds, and for +RTS -N1
+Capability MainCapability;
+
+nat n_capabilities;
+Capability *capabilities = NULL;
+
+// Holds the Capability which last became free. This is used so that
+// an in-call has a chance of quickly finding a free Capability.
+// Maintaining a global free list of Capabilities would require global
+// locking, so we don't do that.
+Capability *last_free_capability;
+
+#if defined(THREADED_RTS)
+// Is there some global condition, independent of any particular
+// Capability, that a worker should wake up to deal with?
+STATIC_INLINE rtsBool
+globalWorkToDo (void)
+{
+    return blackholes_need_checking
+	|| sched_state >= SCHED_INTERRUPTING
+	;
+}
+#endif
+
+#if defined(THREADED_RTS)
+// Does the given Capability have any work that this particular Task
+// is allowed to run?  A bound task may only run its own thread; a
+// worker may run any unbound thread or service a global condition.
+STATIC_INLINE rtsBool
+anyWorkForMe( Capability *cap, Task *task )
+{
+    if (task->tso != NULL) {
+	// A bound task only runs if its thread is on the run queue of
+	// the capability on which it was woken up. Otherwise, we
+	// can't be sure that we have the right capability: the thread
+	// might be woken up on some other capability, and task->cap
+	// could change under our feet.
+	return !emptyRunQueue(cap) && cap->run_queue_hd->bound == task;
+    } else {
+	// A vanilla worker task runs if either there is a lightweight
+	// thread at the head of the run queue, or the run queue is
+	// empty and (there are sparks to execute, or there is some
+	// other global condition to check, such as threads blocked on
+	// blackholes).
+	if (emptyRunQueue(cap)) {
+	    return !emptySparkPoolCap(cap)
+		|| !emptyWakeupQueue(cap)
+		|| globalWorkToDo();
+	} else
+	    return cap->run_queue_hd->bound == NULL;
+    }
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Manage the returning_tasks lists.
+ *
+ * These functions require cap->lock
+ * -------------------------------------------------------------------------- */
+
+#if defined(THREADED_RTS)
+// Append a Task to the tail of cap->returning_tasks.  The queue is
+// singly-linked through task->return_link.  Requires cap->lock.
+STATIC_INLINE void
+newReturningTask (Capability *cap, Task *task)
+{
+    ASSERT_LOCK_HELD(&cap->lock);
+    ASSERT(task->return_link == NULL);
+    if (cap->returning_tasks_hd) {
+	ASSERT(cap->returning_tasks_tl->return_link == NULL);
+	cap->returning_tasks_tl->return_link = task;
+    } else {
+	cap->returning_tasks_hd = task;
+    }
+    cap->returning_tasks_tl = task;
+}
+
+// Remove and return the Task at the head of cap->returning_tasks.
+// The queue must be non-empty.  Requires cap->lock.
+STATIC_INLINE Task *
+popReturningTask (Capability *cap)
+{
+    ASSERT_LOCK_HELD(&cap->lock);
+    Task *task;
+    task = cap->returning_tasks_hd;
+    ASSERT(task);
+    cap->returning_tasks_hd = task->return_link;
+    if (!cap->returning_tasks_hd) {
+	cap->returning_tasks_tl = NULL;
+    }
+    task->return_link = NULL;
+    return task;
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Initialisation
+ *
+ * The Capability is initially marked not free.
+ * ------------------------------------------------------------------------- */
+
+// Initialise a single Capability numbered i: empty run queue, no
+// owning Task, one mutable list per GC generation, and empty STM
+// free lists.
+static void
+initCapability( Capability *cap, nat i )
+{
+    nat g;
+
+    cap->no = i;
+    cap->in_haskell = rtsFalse;
+
+    cap->run_queue_hd = END_TSO_QUEUE;
+    cap->run_queue_tl = END_TSO_QUEUE;
+
+#if defined(THREADED_RTS)
+    initMutex(&cap->lock);
+    cap->running_task = NULL; // indicates cap is free
+    cap->spare_workers = NULL;
+    cap->suspended_ccalling_tasks = NULL;
+    cap->returning_tasks_hd = NULL;
+    cap->returning_tasks_tl = NULL;
+    cap->wakeup_queue_hd = END_TSO_QUEUE;
+    cap->wakeup_queue_tl = END_TSO_QUEUE;
+#endif
+
+    cap->f.stgGCEnter1 = (F_)__stg_gc_enter_1;
+    cap->f.stgGCFun = (F_)__stg_gc_fun;
+
+    // one mutable list per generation (see recordMutableCap())
+    cap->mut_lists = stgMallocBytes(sizeof(bdescr *) *
+				     RtsFlags.GcFlags.generations,
+				     "initCapability");
+
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+	cap->mut_lists[g] = NULL;
+    }
+
+    cap->free_tvar_wait_queues = END_STM_WAIT_QUEUE;
+    cap->free_trec_chunks = END_STM_CHUNK_LIST;
+    cap->free_trec_headers = NO_TREC;
+    cap->transaction_tokens = 0;
+}
+
+/* ---------------------------------------------------------------------------
+ * Function: initCapabilities()
+ *
+ * Purpose: set up the Capability handling. For the THREADED_RTS build,
+ * we keep a table of them, the size of which is
+ * controlled by the user via the RTS flag -N.
+ *
+ * ------------------------------------------------------------------------- */
+void
+initCapabilities( void )
+{
+#if defined(THREADED_RTS)
+    nat i;
+
+#ifndef REG_Base
+    // We can't support multiple CPUs if BaseReg is not a register
+    if (RtsFlags.ParFlags.nNodes > 1) {
+	errorBelch("warning: multiple CPUs not supported in this build, reverting to 1");
+	RtsFlags.ParFlags.nNodes = 1;
+    }
+#endif
+
+    // the number of capabilities is set by the +RTS -N flag
+    n_capabilities = RtsFlags.ParFlags.nNodes;
+
+    if (n_capabilities == 1) {
+	capabilities = &MainCapability;
+	// THREADED_RTS must work on builds that don't have a mutable
+	// BaseReg (eg. unregisterised), so in this case
+	// capabilities[0] must coincide with &MainCapability.
+    } else {
+	capabilities = stgMallocBytes(n_capabilities * sizeof(Capability),
+				      "initCapabilities");
+    }
+
+    for (i = 0; i < n_capabilities; i++) {
+	initCapability(&capabilities[i], i);
+    }
+
+    IF_DEBUG(scheduler, sched_belch("allocated %d capabilities",
+				    n_capabilities));
+
+#else /* !THREADED_RTS */
+
+    n_capabilities = 1;
+    capabilities = &MainCapability;
+    initCapability(&MainCapability, 0);
+
+#endif
+
+    // There are no free capabilities to begin with. We will start
+    // a worker Task to each Capability, which will quickly put the
+    // Capability on the free list when it finds nothing to do.
+    last_free_capability = &capabilities[0];
+}
+
+/* ----------------------------------------------------------------------------
+ * Give a Capability to a Task. The task must currently be sleeping
+ * on its condition variable.
+ *
+ * Requires cap->lock (modifies cap->running_task).
+ *
+ * When migrating a Task, the migrater must take task->lock before
+ * modifying task->cap, to synchronise with the waking up Task.
+ * Additionally, the migrater should own the Capability (when
+ * migrating the run queue), or cap->lock (when migrating
+ * returning_workers).
+ *
+ * ------------------------------------------------------------------------- */
+
+#if defined(THREADED_RTS)
+// Hand the (locked) Capability cap to Task task: set the task's
+// sticky wakeup flag and signal its condition variable.
+STATIC_INLINE void
+giveCapabilityToTask (Capability *cap USED_IF_DEBUG, Task *task)
+{
+    ASSERT_LOCK_HELD(&cap->lock);
+    ASSERT(task->cap == cap);
+    IF_DEBUG(scheduler,
+	     sched_belch("passing capability %d to %s %p",
+			 cap->no, task->tso ? "bound task" : "worker",
+			 (void *)task->id));
+    ACQUIRE_LOCK(&task->lock);
+    task->wakeup = rtsTrue;
+    // the wakeup flag is needed because signalCondition() doesn't
+    // flag the condition if the thread is already running, but we want
+    // it to be sticky.
+    signalCondition(&task->cond);
+    RELEASE_LOCK(&task->lock);
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Function: releaseCapability(Capability*)
+ *
+ * Purpose: Letting go of a capability. Causes a
+ * 'returning worker' thread or a 'waiting worker'
+ * to wake up, in that order.
+ * ------------------------------------------------------------------------- */
+
+#if defined(THREADED_RTS)
+// Release the Capability; cap->lock must be held.  In priority order:
+// wake a task returning from a foreign call, wake the bound task of
+// the thread at the head of the run queue, start a fresh worker if we
+// have none, wake a spare worker if there is any work to do, and
+// otherwise mark the Capability as the last free one.
+void
+releaseCapability_ (Capability* cap)
+{
+    Task *task;
+
+    task = cap->running_task;
+
+    ASSERT_PARTIAL_CAPABILITY_INVARIANTS(cap,task);
+
+    cap->running_task = NULL;
+
+    // Check to see whether a worker thread can be given
+    // the go-ahead to return the result of an external call..
+    if (cap->returning_tasks_hd != NULL) {
+	giveCapabilityToTask(cap,cap->returning_tasks_hd);
+	// The Task pops itself from the queue (see waitForReturnCapability())
+	return;
+    }
+
+    // If the next thread on the run queue is a bound thread,
+    // give this Capability to the appropriate Task.
+    if (!emptyRunQueue(cap) && cap->run_queue_hd->bound) {
+	// Make sure we're not about to try to wake ourselves up
+	ASSERT(task != cap->run_queue_hd->bound);
+	task = cap->run_queue_hd->bound;
+	giveCapabilityToTask(cap,task);
+	return;
+    }
+
+    if (!cap->spare_workers) {
+	// Create a worker thread if we don't have one. If the system
+	// is interrupted, we only create a worker task if there
+	// are threads that need to be completed. If the system is
+	// shutting down, we never create a new worker.
+	if (sched_state < SCHED_SHUTTING_DOWN || !emptyRunQueue(cap)) {
+	    IF_DEBUG(scheduler,
+		     sched_belch("starting new worker on capability %d", cap->no));
+	    startWorkerTask(cap, workerStart);
+	    return;
+	}
+    }
+
+    // If we have an unbound thread on the run queue, or if there's
+    // anything else to do, give the Capability to a worker thread.
+    if (!emptyRunQueue(cap) || !emptyWakeupQueue(cap)
+	  || !emptySparkPoolCap(cap) || globalWorkToDo()) {
+	if (cap->spare_workers) {
+	    giveCapabilityToTask(cap,cap->spare_workers);
+	    // The worker Task pops itself from the queue;
+	    return;
+	}
+    }
+
+    last_free_capability = cap;
+    IF_DEBUG(scheduler, sched_belch("freeing capability %d", cap->no));
+}
+
+// Locked wrapper around releaseCapability_().
+void
+releaseCapability (Capability* cap USED_IF_THREADS)
+{
+    ACQUIRE_LOCK(&cap->lock);
+    releaseCapability_(cap);
+    RELEASE_LOCK(&cap->lock);
+}
+
+// Like releaseCapability(), but additionally pushes the current Task
+// (if it is an eligible worker) onto the Capability's spare_workers
+// list before letting go.
+static void
+releaseCapabilityAndQueueWorker (Capability* cap USED_IF_THREADS)
+{
+    Task *task;
+
+    ACQUIRE_LOCK(&cap->lock);
+
+    task = cap->running_task;
+
+    // If the current task is a worker, save it on the spare_workers
+    // list of this Capability. A worker can mark itself as stopped,
+    // in which case it is not replaced on the spare_worker queue.
+    // This happens when the system is shutting down (see
+    // Schedule.c:workerStart()).
+    // Also, be careful to check that this task hasn't just exited
+    // Haskell to do a foreign call (task->suspended_tso).
+    if (!isBoundTask(task) && !task->stopped && !task->suspended_tso) {
+	task->next = cap->spare_workers;
+	cap->spare_workers = task;
+    }
+    // Bound tasks just float around attached to their TSOs.
+
+    releaseCapability_(cap);
+
+    RELEASE_LOCK(&cap->lock);
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * waitForReturnCapability( Task *task )
+ *
+ * Purpose: when an OS thread returns from an external call,
+ * it calls waitForReturnCapability() (via Schedule.resumeThread())
+ * to wait for permission to enter the RTS & communicate the
+ * result of the external call back to the Haskell thread that
+ * made it.
+ *
+ * ------------------------------------------------------------------------- */
+void
+waitForReturnCapability (Capability **pCap, Task *task)
+{
+#if !defined(THREADED_RTS)
+
+    MainCapability.running_task = task;
+    task->cap = &MainCapability;
+    *pCap = &MainCapability;
+
+#else
+    Capability *cap = *pCap;
+
+    if (cap == NULL) {
+	// Try last_free_capability first
+	cap = last_free_capability;
+	// BUGFIX: search for a free capability only when
+	// last_free_capability is owned by another Task; the
+	// condition was previously inverted ("!cap->running_task"),
+	// which skipped the search exactly when it was needed and
+	// searched pointlessly when the capability was already free.
+	if (cap->running_task) {
+	    nat i;
+	    // otherwise, search for a free capability
+	    for (i = 0; i < n_capabilities; i++) {
+		cap = &capabilities[i];
+		if (!cap->running_task) {
+		    break;
+		}
+	    }
+	    // Can't find a free one, use last_free_capability.
+	    cap = last_free_capability;
+	}
+
+	// record the Capability as the one this Task is now associated with.
+	task->cap = cap;
+
+    } else {
+	ASSERT(task->cap == cap);
+    }
+
+    ACQUIRE_LOCK(&cap->lock);
+
+    IF_DEBUG(scheduler,
+	     sched_belch("returning; I want capability %d", cap->no));
+
+    if (!cap->running_task) {
+	// It's free; just grab it
+	cap->running_task = task;
+	RELEASE_LOCK(&cap->lock);
+    } else {
+	// The capability is busy: queue ourselves on the
+	// returning_tasks list and sleep until it is handed to us.
+	newReturningTask(cap,task);
+	RELEASE_LOCK(&cap->lock);
+
+	for (;;) {
+	    ACQUIRE_LOCK(&task->lock);
+	    // task->lock held, cap->lock not held
+	    if (!task->wakeup) waitCondition(&task->cond, &task->lock);
+	    cap = task->cap;
+	    task->wakeup = rtsFalse;
+	    RELEASE_LOCK(&task->lock);
+
+	    // now check whether we should wake up...
+	    ACQUIRE_LOCK(&cap->lock);
+	    if (cap->running_task == NULL) {
+		if (cap->returning_tasks_hd != task) {
+		    // not our turn: pass the capability on to the
+		    // task at the head of the queue and wait again
+		    giveCapabilityToTask(cap,cap->returning_tasks_hd);
+		    RELEASE_LOCK(&cap->lock);
+		    continue;
+		}
+		cap->running_task = task;
+		popReturningTask(cap);
+		RELEASE_LOCK(&cap->lock);
+		break;
+	    }
+	    RELEASE_LOCK(&cap->lock);
+	}
+
+    }
+
+    ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+
+    IF_DEBUG(scheduler,
+	     sched_belch("returning; got capability %d", cap->no));
+
+    *pCap = cap;
+#endif
+}
+
+#if defined(THREADED_RTS)
+/* ----------------------------------------------------------------------------
+ * yieldCapability
+ * ------------------------------------------------------------------------- */
+
+// Give up the Capability when a returning task is waiting for it or
+// there is no work this Task may run; queue ourselves as a spare
+// worker and block until the Capability is handed back to us.
+void
+yieldCapability (Capability** pCap, Task *task)
+{
+    Capability *cap = *pCap;
+
+    // The fast path has no locking, if we don't enter this while loop
+
+    while ( cap->returning_tasks_hd != NULL || !anyWorkForMe(cap,task) ) {
+	IF_DEBUG(scheduler, sched_belch("giving up capability %d", cap->no));
+
+	// We must now release the capability and wait to be woken up
+	// again.
+	task->wakeup = rtsFalse;
+	releaseCapabilityAndQueueWorker(cap);
+
+	for (;;) {
+	    ACQUIRE_LOCK(&task->lock);
+	    // task->lock held, cap->lock not held
+	    if (!task->wakeup) waitCondition(&task->cond, &task->lock);
+	    cap = task->cap;
+	    task->wakeup = rtsFalse;
+	    RELEASE_LOCK(&task->lock);
+
+	    IF_DEBUG(scheduler, sched_belch("woken up on capability %d", cap->no));
+	    ACQUIRE_LOCK(&cap->lock);
+	    if (cap->running_task != NULL) {
+		IF_DEBUG(scheduler, sched_belch("capability %d is owned by another task", cap->no));
+		RELEASE_LOCK(&cap->lock);
+		continue;
+	    }
+
+	    if (task->tso == NULL) {
+		ASSERT(cap->spare_workers != NULL);
+		// if we're not at the front of the queue, release it
+		// again. This is unlikely to happen.
+		if (cap->spare_workers != task) {
+		    giveCapabilityToTask(cap,cap->spare_workers);
+		    RELEASE_LOCK(&cap->lock);
+		    continue;
+		}
+		// unlink ourselves from the spare_workers list
+		cap->spare_workers = task->next;
+		task->next = NULL;
+	    }
+	    cap->running_task = task;
+	    RELEASE_LOCK(&cap->lock);
+	    break;
+	}
+
+	IF_DEBUG(scheduler, sched_belch("got capability %d", cap->no));
+	ASSERT(cap->running_task == task);
+    }
+
+    *pCap = cap;
+
+    ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+
+    return;
+}
+
+/* ----------------------------------------------------------------------------
+ * Wake up a thread on a Capability.
+ *
+ * This is used when the current Task is running on a Capability and
+ * wishes to wake up a thread on a different Capability.
+ * ------------------------------------------------------------------------- */
+
+void
+wakeupThreadOnCapability (Capability *cap, StgTSO *tso)
+{
+    // the caller must already have pointed tso->cap at this Capability
+    ASSERT(tso->cap == cap);
+    ASSERT(tso->bound ? tso->bound->cap == cap : 1);
+
+    ACQUIRE_LOCK(&cap->lock);
+    if (cap->running_task == NULL) {
+	// nobody is running this Capability, we can add our thread
+	// directly onto the run queue and start up a Task to run it.
+	appendToRunQueue(cap,tso);
+
+	// start it up
+	cap->running_task = myTask(); // precond for releaseCapability_()
+	releaseCapability_(cap);
+    } else {
+	appendToWakeupQueue(cap,tso);
+	// someone is running on this Capability, so it cannot be
+	// freed without first checking the wakeup queue (see
+	// releaseCapability_).
+    }
+    RELEASE_LOCK(&cap->lock);
+}
+
+/* ----------------------------------------------------------------------------
+ * prodCapabilities
+ *
+ * Used to indicate that the interrupted flag is now set, or some
+ * other global condition that might require waking up a Task on each
+ * Capability.
+ * ------------------------------------------------------------------------- */
+
+// Wake a spare worker on each free Capability (all == rtsTrue), or
+// stop after the first free Capability with a spare worker has been
+// prodded (all == rtsFalse).
+static void
+prodCapabilities(rtsBool all)
+{
+    nat i;
+    Capability *cap;
+    Task *task;
+
+    for (i=0; i < n_capabilities; i++) {
+	cap = &capabilities[i];
+	ACQUIRE_LOCK(&cap->lock);
+	if (!cap->running_task) {
+	    if (cap->spare_workers) {
+		task = cap->spare_workers;
+		ASSERT(!task->stopped);
+		giveCapabilityToTask(cap,task);
+		if (!all) {
+		    RELEASE_LOCK(&cap->lock);
+		    return;
+		}
+	    }
+	}
+	RELEASE_LOCK(&cap->lock);
+    }
+    return;
+}
+
+// Prod every Capability (see prodCapabilities above).
+void
+prodAllCapabilities (void)
+{
+    prodCapabilities(rtsTrue);
+}
+
+/* ----------------------------------------------------------------------------
+ * prodOneCapability
+ *
+ * Like prodAllCapabilities, but we only require a single Task to wake
+ * up in order to service some global event, such as checking for
+ * deadlock after some idle time has passed.
+ * ------------------------------------------------------------------------- */
+
+// Prod a single Capability: one woken worker is enough to service a
+// global event (see the comment block above).
+void
+prodOneCapability (void)
+{
+    prodCapabilities(rtsFalse);
+}
+
+/* ----------------------------------------------------------------------------
+ * shutdownCapability
+ *
+ * At shutdown time, we want to let everything exit as cleanly as
+ * possible. For each capability, we let its run queue drain, and
+ * allow the workers to stop.
+ *
+ * This function should be called when interrupted and
+ * shutting_down_scheduler = rtsTrue, thus any worker that wakes up
+ * will exit the scheduler and call taskStop(), and any bound thread
+ * that wakes up will return to its caller. Runnable threads are
+ * killed.
+ *
+ * ------------------------------------------------------------------------- */
+
+void
+shutdownCapability (Capability *cap, Task *task)
+{
+    nat i;
+
+    ASSERT(sched_state == SCHED_SHUTTING_DOWN);
+
+    task->cap = cap;
+
+    // Bounded retry loop: keep waking workers and yielding until the
+    // run queue and the spare_workers list have both drained (giving
+    // up after 50 attempts).
+    for (i = 0; i < 50; i++) {
+	IF_DEBUG(scheduler, sched_belch("shutting down capability %d, attempt %d", cap->no, i));
+	ACQUIRE_LOCK(&cap->lock);
+	if (cap->running_task) {
+	    RELEASE_LOCK(&cap->lock);
+	    IF_DEBUG(scheduler, sched_belch("not owner, yielding"));
+	    yieldThread();
+	    continue;
+	}
+	cap->running_task = task;
+	if (!emptyRunQueue(cap) || cap->spare_workers) {
+	    IF_DEBUG(scheduler, sched_belch("runnable threads or workers still alive, yielding"));
+	    releaseCapability_(cap); // this will wake up a worker
+	    RELEASE_LOCK(&cap->lock);
+	    yieldThread();
+	    continue;
+	}
+	IF_DEBUG(scheduler, sched_belch("capability %d is stopped.", cap->no));
+	RELEASE_LOCK(&cap->lock);
+	break;
+    }
+    // we now have the Capability, its run queue and spare workers
+    // list are both empty.
+}
+
+/* ----------------------------------------------------------------------------
+ * tryGrabCapability
+ *
+ * Attempt to gain control of a Capability if it is free.
+ *
+ * ------------------------------------------------------------------------- */
+
+rtsBool
+tryGrabCapability (Capability *cap, Task *task)
+{
+    // unlocked test first so we avoid taking the lock when the
+    // Capability is obviously busy; re-checked under the lock below
+    if (cap->running_task != NULL) return rtsFalse;
+    ACQUIRE_LOCK(&cap->lock);
+    if (cap->running_task != NULL) {
+	RELEASE_LOCK(&cap->lock);
+	return rtsFalse;
+    }
+    task->cap = cap;
+    cap->running_task = task;
+    RELEASE_LOCK(&cap->lock);
+    return rtsTrue;
+}
+
+
+#endif /* THREADED_RTS */
+
+
diff --git a/rts/Capability.h b/rts/Capability.h
new file mode 100644
index 0000000000..a2551d0cc5
--- /dev/null
+++ b/rts/Capability.h
@@ -0,0 +1,250 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001-2006
+ *
+ * Capabilities
+ *
+ * The notion of a capability is used when operating in multi-threaded
+ * environments (which the THREADED_RTS build of the RTS does), to
+ * hold all the state an OS thread/task needs to run Haskell code:
+ * its STG registers, a pointer to its TSO, a nursery etc. During
+ * STG execution, a pointer to the capability is kept in a
+ * register (BaseReg).
+ *
+ * Only in a THREADED_RTS build will there be multiple capabilities,
+ * in the non-threaded builds there is one global capability, namely
+ * MainCapability.
+ *
+ * This header file contains the functions for working with capabilities.
+ * (the main, and only, consumer of this interface is the scheduler).
+ *
+ * --------------------------------------------------------------------------*/
+
+#ifndef CAPABILITY_H
+#define CAPABILITY_H
+
+#include "RtsFlags.h"
+#include "Task.h"
+
+struct Capability_ {
+    // State required by the STG virtual machine when running Haskell
+    // code. During STG execution, the BaseReg register always points
+    // to the StgRegTable of the current Capability (&cap->r).
+    StgFunTable f; // must precede r: regTableToCapability() relies on
+    StgRegTable r; // this layout to recover the Capability pointer
+
+    nat no; // capability number.
+
+    // The Task currently holding this Capability. This task has
+    // exclusive access to the contents of this Capability (apart from
+    // returning_tasks_hd/returning_tasks_tl).
+    // Locks required: cap->lock.
+    Task *running_task;
+
+    // true if this Capability is running Haskell code, used for
+    // catching unsafe call-ins.
+    rtsBool in_haskell;
+
+    // The run queue. The Task owning this Capability has exclusive
+    // access to its run queue, so can wake up threads without
+    // taking a lock, and the common path through the scheduler is
+    // also lock-free.
+    StgTSO *run_queue_hd;
+    StgTSO *run_queue_tl;
+
+    // Tasks currently making safe foreign calls. Doubly-linked.
+    // When returning, a task first acquires the Capability before
+    // removing itself from this list, so that the GC can find all
+    // the suspended TSOs easily. Hence, when migrating a Task from
+    // the returning_tasks list, we must also migrate its entry from
+    // this list.
+    Task *suspended_ccalling_tasks;
+
+    // One mutable list per generation, so we don't need to take any
+    // locks when updating an old-generation thunk. These
+    // mini-mut-lists are moved onto the respective gen->mut_list at
+    // each GC.
+    bdescr **mut_lists;
+
+#if defined(THREADED_RTS)
+    // Worker Tasks waiting in the wings. Singly-linked.
+    Task *spare_workers;
+
+    // This lock protects running_task, returning_tasks_{hd,tl}, wakeup_queue.
+    Mutex lock;
+
+    // Tasks waiting to return from a foreign call, or waiting to make
+    // a new call-in using this Capability (NULL if empty).
+    // NB. this field needs to be modified by tasks other than the
+    // running_task, so it requires cap->lock to modify. A task can
+    // check whether it is NULL without taking the lock, however.
+    Task *returning_tasks_hd; // Singly-linked, with head/tail
+    Task *returning_tasks_tl;
+
+    // A list of threads to append to this Capability's run queue at
+    // the earliest opportunity. These are threads that have been
+    // woken up by another Capability.
+    StgTSO *wakeup_queue_hd;
+    StgTSO *wakeup_queue_tl;
+#endif
+
+    // Per-capability STM-related data (free lists of STM structures,
+    // initialised in initCapability())
+    StgTVarWaitQueue *free_tvar_wait_queues;
+    StgTRecChunk *free_trec_chunks;
+    StgTRecHeader *free_trec_headers;
+    nat transaction_tokens; // NOTE(review): see STM.c for semantics -- confirm
+}; // typedef Capability, defined in RtsAPI.h
+
+
+#if defined(THREADED_RTS)
+#define ASSERT_TASK_ID(task) ASSERT(task->id == osThreadId())
+#else
+#define ASSERT_TASK_ID(task) /*empty*/
+#endif
+
+// These properties should be true when a Task is holding a Capability
+#define ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task) \
+ ASSERT(cap->running_task != NULL && cap->running_task == task); \
+ ASSERT(task->cap == cap); \
+ ASSERT_PARTIAL_CAPABILITY_INVARIANTS(cap,task)
+
+// Sometimes a Task holds a Capability, but the Task is not associated
+// with that Capability (ie. task->cap != cap). This happens when
+// (a) a Task holds multiple Capabilities, and (b) when the current
+// Task is bound, its thread has just blocked, and it may have been
+// moved to another Capability.
+#define ASSERT_PARTIAL_CAPABILITY_INVARIANTS(cap,task) \
+ ASSERT(cap->run_queue_hd == END_TSO_QUEUE ? \
+ cap->run_queue_tl == END_TSO_QUEUE : 1); \
+ ASSERT(myTask() == task); \
+ ASSERT_TASK_ID(task);
+
+// Converts a *StgRegTable into a *Capability.
+//
+INLINE_HEADER Capability *
+regTableToCapability (StgRegTable *reg)
+{
+    // relies on the StgRegTable (r) immediately following the
+    // StgFunTable (f) at the start of struct Capability_
+    return (Capability *)((void *)((unsigned char*)reg - sizeof(StgFunTable)));
+}
+
+// Initialise the available capabilities.
+//
+void initCapabilities (void);
+
+// Release a capability. This is called by a Task that is exiting
+// Haskell to make a foreign call, or in various other cases when we
+// want to relinquish a Capability that we currently hold.
+//
+// ASSUMES: cap->running_task is the current Task.
+//
+#if defined(THREADED_RTS)
+void releaseCapability (Capability* cap);
+void releaseCapability_ (Capability* cap); // assumes cap->lock is held
+#else
+// releaseCapability() is empty in non-threaded RTS
+INLINE_HEADER void releaseCapability (Capability* cap STG_UNUSED) {};
+INLINE_HEADER void releaseCapability_ (Capability* cap STG_UNUSED) {};
+#endif
+
+#if !IN_STG_CODE
+// one global capability
+extern Capability MainCapability;
+#endif
+
+// Array of all the capabilities
+//
+extern nat n_capabilities;
+extern Capability *capabilities;
+
+// The Capability that was last free. Used as a good guess for where
+// to assign new threads.
+//
+extern Capability *last_free_capability;
+
+// Acquires a capability at a return point. If *cap is non-NULL, then
+// this is taken as a preference for the Capability we wish to
+// acquire.
+//
+// OS threads waiting in this function get priority over those waiting
+// in waitForCapability().
+//
+// On return, *cap is non-NULL, and points to the Capability acquired.
+//
+void waitForReturnCapability (Capability **cap/*in/out*/, Task *task);
+
+INLINE_HEADER void recordMutableCap (StgClosure *p, Capability *cap, nat gen);
+
+#if defined(THREADED_RTS)
+
+// Gives up the current capability IFF there is a higher-priority
+// thread waiting for it. This happens in one of two ways:
+//
+// (a) we are passing the capability to another OS thread, so
+// that it can run a bound Haskell thread, or
+//
+// (b) there is an OS thread waiting to return from a foreign call
+//
+// On return: *pCap is NULL if the capability was released. The
+// current task should then re-acquire it using waitForCapability().
+//
+void yieldCapability (Capability** pCap, Task *task);
+
+// Acquires a capability for doing some work.
+//
+// On return: pCap points to the capability.
+//
+void waitForCapability (Task *task, Mutex *mutex, Capability **pCap);
+
+// Wakes up a thread on a Capability (probably a different Capability
+// from the one held by the current Task).
+//
+void wakeupThreadOnCapability (Capability *cap, StgTSO *tso);
+
+// Wakes up a worker thread on just one Capability, used when we
+// need to service some global event.
+//
+void prodOneCapability (void);
+
+// Similar to prodOneCapability(), but prods all of them.
+//
+void prodAllCapabilities (void);
+
+// Waits for a capability to drain of runnable threads and workers,
+// and then acquires it. Used at shutdown time.
+//
+void shutdownCapability (Capability *cap, Task *task);
+
+// Attempt to gain control of a Capability if it is free.
+//
+rtsBool tryGrabCapability (Capability *cap, Task *task);
+
+#else // !THREADED_RTS
+
+// Grab a capability. (Only in the non-threaded RTS; in the threaded
+// RTS one of the waitFor*Capability() functions must be used).
+//
+extern void grabCapability (Capability **pCap);
+
+#endif /* !THREADED_RTS */
+
+/* -----------------------------------------------------------------------------
+ * INLINE functions... private below here
+ * -------------------------------------------------------------------------- */
+
+// Record closure p on cap's mutable list for generation gen,
+// allocating and chaining on a fresh block when the current head
+// block is full.
+INLINE_HEADER void
+recordMutableCap (StgClosure *p, Capability *cap, nat gen)
+{
+    bdescr *bd;
+
+    bd = cap->mut_lists[gen];
+    if (bd->free >= bd->start + BLOCK_SIZE_W) {
+	bdescr *new_bd;
+	new_bd = allocBlock_lock();
+	new_bd->link = bd;
+	bd = new_bd;
+	cap->mut_lists[gen] = bd;
+    }
+    *bd->free++ = (StgWord)p;
+}
+
+#endif /* CAPABILITY_H */
diff --git a/rts/ClosureFlags.c b/rts/ClosureFlags.c
new file mode 100644
index 0000000000..5545693362
--- /dev/null
+++ b/rts/ClosureFlags.c
@@ -0,0 +1,107 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-1999
+ *
+ * Closure type flags
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+
+StgWord16 closure_flags[] = {
+
+/*
+ * These *must* be in the same order as the closure types in
+ * ClosureTypes.h.
+ */
+
+/* ToDo: some of these flags seem to be duplicated.
+ * - NS is the same as HNF, and the negation of THU
+ * (however, we set NS for indirections, which is probably the
+ * right thing to do, since we never get indirections pointing
+ * to thunks.)
+ */
+
+/* 0 1 2 3 4 5 6 7 */
+/* HNF BTM NS STA THU MUT UPT SRT */
+
+/* NOTE(review): the _IND flag used below is not listed in the column
+ * key above -- see the flag definitions in the RTS headers. */
+
+/* INVALID_OBJECT = */ ( 0 ),
+/* CONSTR = */ (_HNF| _NS ),
+/* CONSTR_1_0 = */ (_HNF| _NS ),
+/* CONSTR_0_1 = */ (_HNF| _NS ),
+/* CONSTR_2_0 = */ (_HNF| _NS ),
+/* CONSTR_1_1 = */ (_HNF| _NS ),
+/* CONSTR_0_2 = */ (_HNF| _NS ),
+/* CONSTR_INTLIKE = */ (_HNF| _NS|_STA ),
+/* CONSTR_CHARLIKE = */ (_HNF| _NS|_STA ),
+/* CONSTR_STATIC = */ (_HNF| _NS|_STA ),
+/* CONSTR_NOCAF_STATIC = */ (_HNF| _NS|_STA ),
+/* FUN = */ (_HNF| _NS| _SRT ),
+/* FUN_1_0 = */ (_HNF| _NS| _SRT ),
+/* FUN_0_1 = */ (_HNF| _NS| _SRT ),
+/* FUN_2_0 = */ (_HNF| _NS| _SRT ),
+/* FUN_1_1 = */ (_HNF| _NS| _SRT ),
+/* FUN_0_2 = */ (_HNF| _NS| _SRT ),
+/* FUN_STATIC = */ (_HNF| _NS|_STA| _SRT ),
+/* THUNK = */ ( _BTM| _THU| _SRT ),
+/* THUNK_1_0 = */ ( _BTM| _THU| _SRT ),
+/* THUNK_0_1 = */ ( _BTM| _THU| _SRT ),
+/* THUNK_2_0 = */ ( _BTM| _THU| _SRT ),
+/* THUNK_1_1 = */ ( _BTM| _THU| _SRT ),
+/* THUNK_0_2 = */ ( _BTM| _THU| _SRT ),
+/* THUNK_STATIC = */ ( _BTM| _STA|_THU| _SRT ),
+/* THUNK_SELECTOR = */ ( _BTM| _THU| _SRT ),
+/* BCO = */ (_HNF| _NS ),
+/* AP = */ ( _THU ),
+/* PAP = */ (_HNF| _NS ),
+/* AP_STACK = */ ( _THU ),
+/* IND = */ ( _NS| _IND ),
+/* IND_OLDGEN = */ ( _NS| _IND ),
+/* IND_PERM = */ ( _NS| _IND ),
+/* IND_OLDGEN_PERM = */ ( _NS| _IND ),
+/* IND_STATIC = */ ( _NS|_STA| _IND ),
+/* RET_BCO = */ ( _BTM ),
+/* RET_SMALL = */ ( _BTM| _SRT ),
+/* RET_VEC_SMALL = */ ( _BTM| _SRT ),
+/* RET_BIG = */ ( _SRT ),
+/* RET_VEC_BIG = */ ( _SRT ),
+/* RET_DYN = */ ( _SRT ),
+/* RET_FUN = */ ( 0 ),
+/* UPDATE_FRAME = */ ( _BTM ),
+/* CATCH_FRAME = */ ( _BTM ),
+/* STOP_FRAME = */ ( _BTM ),
+/* CAF_BLACKHOLE = */ ( _BTM|_NS| _UPT ),
+/* BLACKHOLE = */ ( _NS| _UPT ),
+/* SE_BLACKHOLE = */ ( _NS| _UPT ),
+/* SE_CAF_BLACKHOLE = */ ( _NS| _UPT ),
+/* MVAR = */ (_HNF| _NS| _MUT|_UPT ),
+/* ARR_WORDS = */ (_HNF| _NS| _UPT ),
+/* MUT_ARR_PTRS_CLEAN = */ (_HNF| _NS| _MUT|_UPT ),
+/* MUT_ARR_PTRS_DIRTY = */ (_HNF| _NS| _MUT|_UPT ),
+/* MUT_ARR_PTRS_FROZEN0 = */ (_HNF| _NS| _MUT|_UPT ),
+/* MUT_ARR_PTRS_FROZEN = */ (_HNF| _NS| _UPT ),
+/* MUT_VAR_CLEAN = */ (_HNF| _NS| _MUT|_UPT ),
+/* MUT_VAR_DIRTY = */ (_HNF| _NS| _MUT|_UPT ),
+/* WEAK = */ (_HNF| _NS| _UPT ),
+/* STABLE_NAME = */ (_HNF| _NS| _UPT ),
+/* TSO = */ (_HNF| _NS| _MUT|_UPT ),
+/* BLOCKED_FETCH = */ (_HNF| _NS| _MUT|_UPT ),
+/* FETCH_ME = */ (_HNF| _NS| _MUT|_UPT ),
+/* FETCH_ME_BQ = */ ( _NS| _MUT|_UPT ),
+/* RBH = */ ( _NS| _MUT|_UPT ),
+/* EVACUATED = */ ( 0 ),
+/* REMOTE_REF = */ (_HNF| _NS| _UPT ),
+/* TVAR_WAIT_QUEUE = */ ( _NS| _MUT|_UPT ),
+/* TVAR = */ (_HNF| _NS| _MUT|_UPT ),
+/* TREC_CHUNK = */ ( _NS| _MUT|_UPT ),
+/* TREC_HEADER = */ ( _NS| _MUT|_UPT ),
+/* ATOMICALLY_FRAME = */ ( _BTM ),
+/* CATCH_RETRY_FRAME = */ ( _BTM ),
+/* CATCH_STM_FRAME = */ ( _BTM )
+};
+
+/* compile-time guard: the table above must track ClosureTypes.h */
+#if N_CLOSURE_TYPES != 73
+#error Closure types changed: update ClosureFlags.c!
+#endif
+
diff --git a/rts/Disassembler.c b/rts/Disassembler.c
new file mode 100644
index 0000000000..b084a29b89
--- /dev/null
+++ b/rts/Disassembler.c
@@ -0,0 +1,281 @@
+/* -----------------------------------------------------------------------------
+ * Bytecode disassembler
+ *
+ * Copyright (c) 1994-2002.
+ *
+ * $RCSfile: Disassembler.c,v $
+ * $Revision: 1.29 $
+ * $Date: 2004/09/03 15:28:19 $
+ * ---------------------------------------------------------------------------*/
+
+#ifdef DEBUG
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsAPI.h"
+#include "RtsUtils.h"
+#include "Closures.h"
+#include "TSO.h"
+#include "Schedule.h"
+
+#include "Bytecodes.h"
+#include "Printer.h"
+#include "Disassembler.h"
+#include "Interpreter.h"
+
+/* --------------------------------------------------------------------------
+ * Disassembler
+ * ------------------------------------------------------------------------*/
+
+int
+disInstr ( StgBCO *bco, int pc )
+{
+ int i;
+
+ StgWord16* instrs = (StgWord16*)(bco->instrs->payload);
+
+ StgArrWords* literal_arr = bco->literals;
+ StgWord* literals = (StgWord*)(&literal_arr->payload[0]);
+
+ StgMutArrPtrs* ptrs_arr = bco->ptrs;
+ StgPtr* ptrs = (StgPtr*)(&ptrs_arr->payload[0]);
+
+ StgArrWords* itbls_arr = bco->itbls;
+ StgInfoTable** itbls = (StgInfoTable**)(&itbls_arr->payload[0]);
+
+ switch (instrs[pc++]) {
+ case bci_SWIZZLE:
+ debugBelch("SWIZZLE stkoff %d by %d\n",
+ instrs[pc], (signed int)instrs[pc+1]);
+ pc += 2; break;
+ case bci_CCALL:
+ debugBelch("CCALL marshaller at 0x%x\n",
+ literals[instrs[pc]] );
+ pc += 1; break;
+ case bci_STKCHECK:
+ debugBelch("STKCHECK %d\n", instrs[pc] );
+ pc += 1; break;
+ case bci_PUSH_L:
+ debugBelch("PUSH_L %d\n", instrs[pc] );
+ pc += 1; break;
+ case bci_PUSH_LL:
+ debugBelch("PUSH_LL %d %d\n", instrs[pc], instrs[pc+1] );
+ pc += 2; break;
+ case bci_PUSH_LLL:
+ debugBelch("PUSH_LLL %d %d %d\n", instrs[pc], instrs[pc+1],
+ instrs[pc+2] );
+ pc += 3; break;
+ case bci_PUSH_G:
+ debugBelch("PUSH_G " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n" );
+ pc += 1; break;
+
+ case bci_PUSH_ALTS:
+ debugBelch("PUSH_ALTS " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_P:
+ debugBelch("PUSH_ALTS_P " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_N:
+ debugBelch("PUSH_ALTS_N " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_F:
+ debugBelch("PUSH_ALTS_F " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_D:
+ debugBelch("PUSH_ALTS_D " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_L:
+ debugBelch("PUSH_ALTS_L " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_V:
+ debugBelch("PUSH_ALTS_V " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+
+ case bci_PUSH_UBX:
+ debugBelch("PUSH_UBX ");
+ for (i = 0; i < instrs[pc+1]; i++)
+ debugBelch("0x%x ", literals[i + instrs[pc]] );
+ debugBelch("\n");
+ pc += 2; break;
+ case bci_PUSH_APPLY_N:
+ debugBelch("PUSH_APPLY_N\n");
+ break;
+ case bci_PUSH_APPLY_V:
+ debugBelch("PUSH_APPLY_V\n");
+ break;
+ case bci_PUSH_APPLY_F:
+ debugBelch("PUSH_APPLY_F\n");
+ break;
+ case bci_PUSH_APPLY_D:
+ debugBelch("PUSH_APPLY_D\n");
+ break;
+ case bci_PUSH_APPLY_L:
+ debugBelch("PUSH_APPLY_L\n");
+ break;
+ case bci_PUSH_APPLY_P:
+ debugBelch("PUSH_APPLY_P\n");
+ break;
+ case bci_PUSH_APPLY_PP:
+ debugBelch("PUSH_APPLY_PP\n");
+ break;
+ case bci_PUSH_APPLY_PPP:
+ debugBelch("PUSH_APPLY_PPP\n");
+ break;
+ case bci_PUSH_APPLY_PPPP:
+ debugBelch("PUSH_APPLY_PPPP\n");
+ break;
+ case bci_PUSH_APPLY_PPPPP:
+ debugBelch("PUSH_APPLY_PPPPP\n");
+ break;
+ case bci_PUSH_APPLY_PPPPPP:
+ debugBelch("PUSH_APPLY_PPPPPP\n");
+ break;
+ case bci_SLIDE:
+ debugBelch("SLIDE %d down by %d\n", instrs[pc], instrs[pc+1] );
+ pc += 2; break;
+ case bci_ALLOC_AP:
+ debugBelch("ALLOC_AP %d words\n", instrs[pc] );
+ pc += 1; break;
+ case bci_ALLOC_PAP:
+ debugBelch("ALLOC_PAP %d words, %d arity\n",
+ instrs[pc], instrs[pc+1] );
+ pc += 2; break;
+ case bci_MKAP:
+ debugBelch("MKAP %d words, %d stkoff\n", instrs[pc+1],
+ instrs[pc] );
+ pc += 2; break;
+ case bci_UNPACK:
+ debugBelch("UNPACK %d\n", instrs[pc] );
+ pc += 1; break;
+ case bci_PACK:
+ debugBelch("PACK %d words with itbl ", instrs[pc+1] );
+ printPtr( (StgPtr)itbls[instrs[pc]] );
+ debugBelch("\n");
+ pc += 2; break;
+
+ case bci_TESTLT_I:
+ debugBelch("TESTLT_I %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+ case bci_TESTEQ_I:
+ debugBelch("TESTEQ_I %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+
+ case bci_TESTLT_F:
+ debugBelch("TESTLT_F %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+ case bci_TESTEQ_F:
+ debugBelch("TESTEQ_F %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+
+ case bci_TESTLT_D:
+ debugBelch("TESTLT_D %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+ case bci_TESTEQ_D:
+ debugBelch("TESTEQ_D %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+
+ case bci_TESTLT_P:
+ debugBelch("TESTLT_P %d, fail to %d\n", instrs[pc],
+ instrs[pc+1]);
+ pc += 2; break;
+ case bci_TESTEQ_P:
+ debugBelch("TESTEQ_P %d, fail to %d\n", instrs[pc],
+ instrs[pc+1]);
+ pc += 2; break;
+ case bci_CASEFAIL:
+ debugBelch("CASEFAIL\n" );
+ break;
+ case bci_JMP:
+ debugBelch("JMP to %d\n", instrs[pc]);
+ pc += 1; break;
+
+ case bci_ENTER:
+ debugBelch("ENTER\n");
+ break;
+
+ case bci_RETURN:
+ debugBelch("RETURN\n" );
+ break;
+ case bci_RETURN_P:
+ debugBelch("RETURN_P\n" );
+ break;
+ case bci_RETURN_N:
+ debugBelch("RETURN_N\n" );
+ break;
+ case bci_RETURN_F:
+ debugBelch("RETURN_F\n" );
+ break;
+ case bci_RETURN_D:
+ debugBelch("RETURN_D\n" );
+ break;
+ case bci_RETURN_L:
+ debugBelch("RETURN_L\n" );
+ break;
+ case bci_RETURN_V:
+ debugBelch("RETURN_V\n" );
+ break;
+
+ default:
+ barf("disInstr: unknown opcode");
+ }
+ return pc;
+}
+
+
+/* Something of a kludge .. how do we know where the end of the insn
+ array is, since it isn't recorded anywhere? Answer: the first
+ short is the number of bytecodes which follow it.
+ See ByteCodeGen.linkBCO.insns_arr for construction ...
+*/
+void disassemble( StgBCO *bco )
+{
+ nat i, j;
+ StgWord16* instrs = (StgWord16*)(bco->instrs->payload);
+ StgMutArrPtrs* ptrs = bco->ptrs;
+ nat nbcs = (int)instrs[0];
+ nat pc = 1;
+
+ debugBelch("BCO\n" );
+ pc = 1;
+ while (pc <= nbcs) {
+ debugBelch("\t%2d: ", pc );
+ pc = disInstr ( bco, pc );
+ }
+
+ debugBelch("INSTRS:\n " );
+ j = 16;
+ for (i = 0; i < nbcs; i++) {
+ debugBelch("%3d ", (int)instrs[i] );
+ j--;
+ if (j == 0) { j = 16; debugBelch("\n "); };
+ }
+ debugBelch("\n");
+
+ debugBelch("PTRS:\n " );
+ j = 8;
+ for (i = 0; i < ptrs->ptrs; i++) {
+ debugBelch("%8p ", ptrs->payload[i] );
+ j--;
+ if (j == 0) { j = 8; debugBelch("\n "); };
+ }
+ debugBelch("\n");
+
+ debugBelch("\n");
+ ASSERT(pc == nbcs+1);
+}
+
+#endif /* DEBUG */
diff --git a/rts/Disassembler.h b/rts/Disassembler.h
new file mode 100644
index 0000000000..2851097117
--- /dev/null
+++ b/rts/Disassembler.h
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Prototypes for functions in Disassembler.c
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef DISASSEMBLER_H
+#define DISASSEMBLER_H
+
+#ifdef DEBUG
+
+extern int disInstr ( StgBCO *bco, int pc );
+extern void disassemble( StgBCO *bco );
+
+#endif
+
+#endif /* DISASSEMBLER_H */
diff --git a/rts/Exception.cmm b/rts/Exception.cmm
new file mode 100644
index 0000000000..b5c29626b2
--- /dev/null
+++ b/rts/Exception.cmm
@@ -0,0 +1,446 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Exception support
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/* -----------------------------------------------------------------------------
+ Exception Primitives
+
+ A thread can request that asynchronous exceptions not be delivered
+ ("blocked") for the duration of an I/O computation. The primitive
+
+ blockAsyncExceptions# :: IO a -> IO a
+
+ is used for this purpose. During a blocked section, asynchronous
+ exceptions may be unblocked again temporarily:
+
+ unblockAsyncExceptions# :: IO a -> IO a
+
+ Furthermore, asynchronous exceptions are blocked automatically during
+ the execution of an exception handler. Both of these primitives
+ leave a continuation on the stack which reverts to the previous
+ state (blocked or unblocked) on exit.
+
+ A thread which wants to raise an exception in another thread (using
+ killThread#) must block until the target thread is ready to receive
+ it. The action of unblocking exceptions in a thread will release all
+ the threads waiting to deliver exceptions to that thread.
+
+ NB. there's a bug in here. If a thread is inside an
+ unsafePerformIO, and inside blockAsyncExceptions# (there is an
+ unblockAsyncExceptions_ret on the stack), and it is blocked in an
+ interruptible operation, and it receives an exception, then the
+ unsafePerformIO thunk will be updated with a stack object
+ containing the unblockAsyncExceptions_ret frame. Later, when
+ someone else evaluates this thunk, the blocked exception state is
+ not restored, and the result is that unblockAsyncExceptions_ret
+ will attempt to unblock exceptions in the current thread, but it'll
+ find that the CurrentTSO->blocked_exceptions is NULL. Hence, we
+ work around this by checking for NULL in awakenBlockedQueue().
+
+ -------------------------------------------------------------------------- */
+
+INFO_TABLE_RET( stg_unblockAsyncExceptionszh_ret,
+ 0/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+ // Not true: see comments above
+ // ASSERT(StgTSO_blocked_exceptions(CurrentTSO) != NULL);
+#if defined(GRAN) || defined(PAR)
+ foreign "C" awakenBlockedQueue(MyCapability() "ptr", StgTSO_blocked_exceptions(CurrentTSO) "ptr",
+ NULL "ptr");
+#else
+ foreign "C" awakenBlockedQueue(MyCapability() "ptr", StgTSO_blocked_exceptions(CurrentTSO) "ptr");
+#endif
+ StgTSO_blocked_exceptions(CurrentTSO) = NULL;
+#ifdef REG_R1
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(0));
+#else
+ Sp(1) = Sp(0);
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(1));
+#endif
+}
+
+INFO_TABLE_RET( stg_blockAsyncExceptionszh_ret,
+ 0/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+ // Not true: see comments above
+ // ASSERT(StgTSO_blocked_exceptions(CurrentTSO) == NULL);
+ StgTSO_blocked_exceptions(CurrentTSO) = END_TSO_QUEUE;
+#ifdef REG_R1
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(0));
+#else
+ Sp(1) = Sp(0);
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(1));
+#endif
+}
+
+blockAsyncExceptionszh_fast
+{
+ /* Args: R1 :: IO a */
+ STK_CHK_GEN( WDS(2)/* worst case */, R1_PTR, blockAsyncExceptionszh_fast);
+
+ if (StgTSO_blocked_exceptions(CurrentTSO) == NULL) {
+ StgTSO_blocked_exceptions(CurrentTSO) = END_TSO_QUEUE;
+ /* avoid growing the stack unnecessarily */
+ if (Sp(0) == stg_blockAsyncExceptionszh_ret_info) {
+ Sp_adj(1);
+ } else {
+ Sp_adj(-1);
+ Sp(0) = stg_unblockAsyncExceptionszh_ret_info;
+ }
+ }
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_v();
+ jump stg_ap_v_fast;
+}
+
+unblockAsyncExceptionszh_fast
+{
+ /* Args: R1 :: IO a */
+ STK_CHK_GEN( WDS(2), R1_PTR, unblockAsyncExceptionszh_fast);
+
+ if (StgTSO_blocked_exceptions(CurrentTSO) != NULL) {
+#if defined(GRAN) || defined(PAR)
+ foreign "C" awakenBlockedQueue(MyCapability() "ptr", StgTSO_blocked_exceptions(CurrentTSO) "ptr",
+ StgTSO_block_info(CurrentTSO) "ptr");
+#else
+ foreign "C" awakenBlockedQueue(MyCapability() "ptr", StgTSO_blocked_exceptions(CurrentTSO) "ptr");
+#endif
+ StgTSO_blocked_exceptions(CurrentTSO) = NULL;
+
+ /* avoid growing the stack unnecessarily */
+ if (Sp(0) == stg_unblockAsyncExceptionszh_ret_info) {
+ Sp_adj(1);
+ } else {
+ Sp_adj(-1);
+ Sp(0) = stg_blockAsyncExceptionszh_ret_info;
+ }
+ }
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_v();
+ jump stg_ap_v_fast;
+}
+
+
+#define interruptible(what_next) \
+ ( what_next == BlockedOnMVar \
+ || what_next == BlockedOnException \
+ || what_next == BlockedOnRead \
+ || what_next == BlockedOnWrite \
+ || what_next == BlockedOnDelay \
+ || what_next == BlockedOnDoProc)
+
+killThreadzh_fast
+{
+ /* args: R1 = TSO to kill, R2 = Exception */
+
+ W_ why_blocked;
+
+ /* This thread may have been relocated.
+ * (see Schedule.c:threadStackOverflow)
+ */
+ while:
+ if (StgTSO_what_next(R1) == ThreadRelocated::I16) {
+ R1 = StgTSO_link(R1);
+ goto while;
+ }
+
+ /* Determine whether this thread is interruptible or not */
+
+ /* If the target thread is currently blocking async exceptions,
+ * we'll have to block until it's ready to accept them. The
+ * exception is interruptible threads - ie. those that are blocked
+ * on some resource.
+ */
+ why_blocked = TO_W_(StgTSO_why_blocked(R1));
+ if (StgTSO_blocked_exceptions(R1) != NULL && !interruptible(why_blocked))
+ {
+ StgTSO_link(CurrentTSO) = StgTSO_blocked_exceptions(R1);
+ StgTSO_blocked_exceptions(R1) = CurrentTSO;
+
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnException::I16;
+ StgTSO_block_info(CurrentTSO) = R1;
+
+ BLOCK( R1_PTR & R2_PTR, killThreadzh_fast );
+ }
+
+ /* Killed threads turn into zombies, which might be garbage
+ * collected at a later date. That's why we don't have to
+ * explicitly remove them from any queues they might be on.
+ */
+
+ /* We might have killed ourselves. In which case, better be *very*
+ * careful. If the exception killed us, then return to the scheduler.
+ * If the exception went to a catch frame, we'll just continue from
+ * the handler.
+ */
+ if (R1 == CurrentTSO) {
+ SAVE_THREAD_STATE();
+ foreign "C" raiseAsync(MyCapability() "ptr", R1 "ptr", R2 "ptr");
+ if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) {
+ R1 = ThreadFinished;
+ jump StgReturn;
+ } else {
+ LOAD_THREAD_STATE();
+ ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
+ jump %ENTRY_CODE(Sp(0));
+ }
+ } else {
+ foreign "C" raiseAsync(MyCapability() "ptr", R1 "ptr", R2 "ptr");
+ }
+
+ jump %ENTRY_CODE(Sp(0));
+}
+
+/* -----------------------------------------------------------------------------
+ Catch frames
+ -------------------------------------------------------------------------- */
+
+#ifdef REG_R1
+#define CATCH_FRAME_ENTRY_TEMPLATE(label,ret) \
+ label \
+ { \
+ Sp = Sp + SIZEOF_StgCatchFrame; \
+ jump ret; \
+ }
+#else
+#define CATCH_FRAME_ENTRY_TEMPLATE(label,ret) \
+ label \
+ { \
+ W_ rval; \
+ rval = Sp(0); \
+ Sp = Sp + SIZEOF_StgCatchFrame; \
+ Sp(0) = rval; \
+ jump ret; \
+ }
+#endif
+
+#ifdef REG_R1
+#define SP_OFF 0
+#else
+#define SP_OFF 1
+#endif
+
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_0_ret,%RET_VEC(Sp(SP_OFF),0))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_1_ret,%RET_VEC(Sp(SP_OFF),1))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_2_ret,%RET_VEC(Sp(SP_OFF),2))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_3_ret,%RET_VEC(Sp(SP_OFF),3))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_4_ret,%RET_VEC(Sp(SP_OFF),4))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_5_ret,%RET_VEC(Sp(SP_OFF),5))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_6_ret,%RET_VEC(Sp(SP_OFF),6))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_7_ret,%RET_VEC(Sp(SP_OFF),7))
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_catch_frame too.
+#endif
+
+#if defined(PROFILING)
+#define CATCH_FRAME_BITMAP 7
+#define CATCH_FRAME_WORDS 4
+#else
+#define CATCH_FRAME_BITMAP 1
+#define CATCH_FRAME_WORDS 2
+#endif
+
+/* Catch frames are very similar to update frames, but when entering
+ * one we just pop the frame off the stack and perform the correct
+ * kind of return to the activation record underneath us on the stack.
+ */
+
+INFO_TABLE_RET(stg_catch_frame,
+ CATCH_FRAME_WORDS, CATCH_FRAME_BITMAP,
+ CATCH_FRAME,
+ stg_catch_frame_0_ret,
+ stg_catch_frame_1_ret,
+ stg_catch_frame_2_ret,
+ stg_catch_frame_3_ret,
+ stg_catch_frame_4_ret,
+ stg_catch_frame_5_ret,
+ stg_catch_frame_6_ret,
+ stg_catch_frame_7_ret)
+CATCH_FRAME_ENTRY_TEMPLATE(,%ENTRY_CODE(Sp(SP_OFF)))
+
+/* -----------------------------------------------------------------------------
+ * The catch infotable
+ *
+ * This should be exactly the same as would be generated by this STG code
+ *
+ * catch = {x,h} \n {} -> catch#{x,h}
+ *
+ * It is used in deleteThread when reverting blackholes.
+ * -------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_catch,2,0,FUN,"catch","catch")
+{
+ R2 = StgClosure_payload(R1,1); /* h */
+ R1 = StgClosure_payload(R1,0); /* x */
+ jump catchzh_fast;
+}
+
+catchzh_fast
+{
+ /* args: R1 = m :: IO a, R2 = handler :: Exception -> IO a */
+ STK_CHK_GEN(SIZEOF_StgCatchFrame + WDS(1), R1_PTR & R2_PTR, catchzh_fast);
+
+ /* Set up the catch frame */
+ Sp = Sp - SIZEOF_StgCatchFrame;
+ SET_HDR(Sp,stg_catch_frame_info,W_[CCCS]);
+
+ StgCatchFrame_handler(Sp) = R2;
+ StgCatchFrame_exceptions_blocked(Sp) =
+ (StgTSO_blocked_exceptions(CurrentTSO) != NULL);
+ TICK_CATCHF_PUSHED();
+
+ /* Apply R1 to the realworld token */
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_v();
+ jump stg_ap_v_fast;
+}
+
+/* -----------------------------------------------------------------------------
+ * The raise infotable
+ *
+ * This should be exactly the same as would be generated by this STG code
+ *
+ * raise = {err} \n {} -> raise#{err}
+ *
+ * It is used in raisezh_fast to update thunks on the update list
+ * -------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_raise,1,0,THUNK_1_0,"raise","raise")
+{
+ R1 = StgThunk_payload(R1,0);
+ jump raisezh_fast;
+}
+
+raisezh_fast
+{
+ W_ handler;
+ W_ raise_closure;
+ W_ frame_type;
+ /* args : R1 :: Exception */
+
+
+#if defined(PROFILING)
+ /* Debugging tool: on raising an exception, show where we are. */
+
+ /* ToDo: currently this is a hack. Would be much better if
+ * the info was only displayed for an *uncaught* exception.
+ */
+ if (RtsFlags_ProfFlags_showCCSOnException(RtsFlags)) {
+ foreign "C" fprintCCS_stderr(W_[CCCS] "ptr");
+ }
+#endif
+
+retry_pop_stack:
+ StgTSO_sp(CurrentTSO) = Sp;
+ frame_type = foreign "C" raiseExceptionHelper(BaseReg "ptr", CurrentTSO "ptr", R1 "ptr");
+ Sp = StgTSO_sp(CurrentTSO);
+ if (frame_type == ATOMICALLY_FRAME) {
+ /* The exception has reached the edge of a memory transaction. Check that
+ * the transaction is valid. If not then perhaps the exception should
+ * not have been thrown: re-run the transaction */
+ W_ trec;
+ W_ r;
+ trec = StgTSO_trec(CurrentTSO);
+ r = foreign "C" stmValidateNestOfTransactions(trec "ptr");
+ foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr");
+ StgTSO_trec(CurrentTSO) = NO_TREC;
+ if (r) {
+ // Transaction was valid: continue searching for a catch frame
+ Sp = Sp + SIZEOF_StgAtomicallyFrame;
+ goto retry_pop_stack;
+ } else {
+ // Transaction was not valid: we retry the exception (otherwise continue
+ // with a further call to raiseExceptionHelper)
+ "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr");
+ StgTSO_trec(CurrentTSO) = trec;
+ R1 = StgAtomicallyFrame_code(Sp);
+ jump stg_ap_v_fast;
+ }
+ }
+
+ if (frame_type == STOP_FRAME) {
+ /*
+ * We've stripped the entire stack, the thread is now dead.
+ * We will leave the stack in a GC'able state, see the stg_stop_thread
+ * entry code in StgStartup.cmm.
+ */
+ Sp = CurrentTSO + TSO_OFFSET_StgTSO_stack
+ + WDS(TO_W_(StgTSO_stack_size(CurrentTSO))) - WDS(2);
+ Sp(1) = R1; /* save the exception */
+ Sp(0) = stg_enter_info; /* so that GC can traverse this stack */
+ StgTSO_what_next(CurrentTSO) = ThreadKilled::I16;
+ SAVE_THREAD_STATE(); /* inline! */
+
+ /* The return code goes in BaseReg->rRet, and BaseReg is returned in R1 */
+ StgRegTable_rRet(BaseReg) = ThreadFinished;
+ R1 = BaseReg;
+
+ jump StgReturn;
+ }
+
+ /* Ok, Sp points to the enclosing CATCH_FRAME or CATCH_STM_FRAME. Pop everything
+ * down to and including this frame, update Su, push R1, and enter the handler.
+ */
+ if (frame_type == CATCH_FRAME) {
+ handler = StgCatchFrame_handler(Sp);
+ } else {
+ handler = StgCatchSTMFrame_handler(Sp);
+ }
+
+ /* Restore the blocked/unblocked state for asynchronous exceptions
+ * at the CATCH_FRAME.
+ *
+ * If exceptions were unblocked, arrange that they are unblocked
+ * again after executing the handler by pushing an
+ * unblockAsyncExceptions_ret stack frame.
+ */
+ W_ frame;
+ frame = Sp;
+ if (frame_type == CATCH_FRAME) {
+ Sp = Sp + SIZEOF_StgCatchFrame;
+ if (StgCatchFrame_exceptions_blocked(frame) == 0) {
+ Sp_adj(-1);
+ Sp(0) = stg_unblockAsyncExceptionszh_ret_info;
+ }
+ } else {
+ Sp = Sp + SIZEOF_StgCatchSTMFrame;
+ }
+
+    /* Ensure that async exceptions are blocked when running the handler.
+ */
+ if (StgTSO_blocked_exceptions(CurrentTSO) == NULL) {
+ StgTSO_blocked_exceptions(CurrentTSO) = END_TSO_QUEUE;
+ }
+
+ /* Call the handler, passing the exception value and a realworld
+ * token as arguments.
+ */
+ Sp_adj(-1);
+ Sp(0) = R1;
+ R1 = handler;
+ Sp_adj(-1);
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_pv();
+ jump RET_LBL(stg_ap_pv);
+}
+
+raiseIOzh_fast
+{
+ /* Args :: R1 :: Exception */
+ jump raisezh_fast;
+}
diff --git a/rts/Exception.h b/rts/Exception.h
new file mode 100644
index 0000000000..f7832f4045
--- /dev/null
+++ b/rts/Exception.h
@@ -0,0 +1,40 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Exception support
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef EXCEPTION_H
+#define EXCEPTION_H
+
+extern const StgRetInfoTable stg_blockAsyncExceptionszh_ret_info;
+extern const StgRetInfoTable stg_unblockAsyncExceptionszh_ret_info;
+
+/* Determine whether a thread is interruptible (ie. blocked
+ * indefinitely). Interruptible threads can be sent an exception with
+ * killThread# even if they have async exceptions blocked.
+ */
+STATIC_INLINE int
+interruptible(StgTSO *t)
+{
+ switch (t->why_blocked) {
+ case BlockedOnMVar:
+ case BlockedOnException:
+ case BlockedOnRead:
+ case BlockedOnWrite:
+#if defined(mingw32_HOST_OS)
+ case BlockedOnDoProc:
+#endif
+ case BlockedOnDelay:
+ return 1;
+	// NB. Threads blocked on foreign calls (BlockedOnCCall) are
+ // *not* interruptible. We can't send these threads an exception.
+ default:
+ return 0;
+ }
+}
+
+#endif /* EXCEPTION_H */
+
diff --git a/rts/FrontPanel.c b/rts/FrontPanel.c
new file mode 100644
index 0000000000..579b75bab3
--- /dev/null
+++ b/rts/FrontPanel.c
@@ -0,0 +1,802 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2000
+ *
+ * RTS GTK Front Panel
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef RTS_GTK_FRONTPANEL
+
+/* Alas, not Posix. */
+/* #include "PosixSource.h" */
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "MBlock.h"
+#include "FrontPanel.h"
+#include "Storage.h"
+#include "Stats.h"
+#include "RtsFlags.h"
+#include "Schedule.h"
+
+#include <gtk/gtk.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "VisSupport.h"
+#include "VisWindow.h"
+
+static GtkWidget *window, *map_drawing_area, *gen_drawing_area;
+static GtkWidget *res_drawing_area;
+static GtkWidget *continue_but, *stop_but, *quit_but;
+static GtkWidget *statusbar;
+static GtkWidget *live_label, *allocated_label;
+static GtkWidget *footprint_label, *alloc_rate_label;
+static GtkWidget *map_ruler, *gen_ruler;
+static GtkWidget *res_vruler, *res_hruler;
+static GtkWidget *running_label, *b_read_label, *b_write_label, *total_label;
+static GtkWidget *b_mvar_label, *b_bh_label, *b_throwto_label, *sleeping_label;
+
+static guint status_context_id;
+
+gboolean continue_now = FALSE, stop_now = FALSE, quit = FALSE;
+UpdateMode update_mode = Continuous;
+
+static GdkPixmap *map_pixmap = NULL;
+static GdkPixmap *gen_pixmap = NULL;
+static GdkPixmap *res_pixmap = NULL;
+
+#define N_GENS 10
+
+static GdkColor
+ bdescr_color = { 0, 0xffff, 0, 0 }, /* red */
+ free_color = { 0, 0, 0, 0xffff }, /* blue */
+ gen_colors[N_GENS] = {
+ { 0, 0, 0xffff, 0 },
+ { 0, 0, 0xf000, 0 },
+ { 0, 0, 0xe000, 0 },
+ { 0, 0, 0xd000, 0 },
+ { 0, 0, 0xc000, 0 },
+ { 0, 0, 0xb000, 0 },
+ { 0, 0, 0xa000, 0 },
+ { 0, 0, 0x9000, 0 },
+ { 0, 0, 0x8000, 0 },
+ { 0, 0, 0x7000, 0 }
+ };
+
+GdkGC *my_gc = NULL;
+
+static void *mem_start = (void *) 0x50000000;
+
+static void colorBlock( void *addr, GdkColor *color,
+ nat block_width, nat block_height,
+ nat blocks_per_line );
+
+static void residencyCensus( void );
+static void updateResidencyGraph( void );
+static void updateThreadsPanel( void );
+
+/* Some code pinched from examples/scribble-simple in the GTK+
+ * distribution.
+ */
+
+/* Create a new backing pixmap of the appropriate size */
+static gint
+configure_event( GtkWidget *widget, GdkEventConfigure *event STG_UNUSED,
+ GdkPixmap **pixmap )
+{
+ if (*pixmap)
+ gdk_pixmap_unref(*pixmap);
+
+ *pixmap = gdk_pixmap_new(widget->window,
+ widget->allocation.width,
+ widget->allocation.height,
+ -1);
+
+ gdk_draw_rectangle (*pixmap,
+ widget->style->white_gc,
+ TRUE,
+ 0, 0,
+ widget->allocation.width,
+ widget->allocation.height);
+
+ debugBelch("configure!\n");
+ updateFrontPanel();
+ return TRUE;
+}
+
+/* Redraw the screen from the backing pixmap */
+static gint
+expose_event( GtkWidget *widget, GdkEventExpose *event, GdkPixmap **pixmap )
+{
+ gdk_draw_pixmap(widget->window,
+ widget->style->fg_gc[GTK_WIDGET_STATE (widget)],
+ *pixmap,
+ event->area.x, event->area.y,
+ event->area.x, event->area.y,
+ event->area.width, event->area.height);
+
+ return FALSE;
+}
+
+void
+initFrontPanel( void )
+{
+ GdkColormap *colormap;
+ GtkWidget *gen_hbox;
+
+ gtk_init( &prog_argc, &prog_argv );
+
+ window = create_GHC_Front_Panel();
+ map_drawing_area = lookup_widget(window, "memmap");
+ gen_drawing_area = lookup_widget(window, "generations");
+ res_drawing_area = lookup_widget(window, "res_drawingarea");
+ stop_but = lookup_widget(window, "stop_but");
+ continue_but = lookup_widget(window, "continue_but");
+ quit_but = lookup_widget(window, "quit_but");
+ statusbar = lookup_widget(window, "statusbar");
+ live_label = lookup_widget(window, "live_label");
+ footprint_label = lookup_widget(window, "footprint_label");
+ allocated_label = lookup_widget(window, "allocated_label");
+ alloc_rate_label = lookup_widget(window, "alloc_rate_label");
+ gen_hbox = lookup_widget(window, "gen_hbox");
+ gen_ruler = lookup_widget(window, "gen_ruler");
+ map_ruler = lookup_widget(window, "map_ruler");
+ res_vruler = lookup_widget(window, "res_vruler");
+ res_hruler = lookup_widget(window, "res_hruler");
+ running_label = lookup_widget(window, "running_label");
+ b_read_label = lookup_widget(window, "blockread_label");
+ b_write_label = lookup_widget(window, "blockwrite_label");
+ b_mvar_label = lookup_widget(window, "blockmvar_label");
+ b_bh_label = lookup_widget(window, "blockbh_label");
+ b_throwto_label = lookup_widget(window, "blockthrowto_label");
+ sleeping_label = lookup_widget(window, "sleeping_label");
+ total_label = lookup_widget(window, "total_label");
+
+ status_context_id =
+ gtk_statusbar_get_context_id( GTK_STATUSBAR(statusbar), "context" );
+
+ /* hook up some signals for the mem map drawing area */
+ gtk_signal_connect (GTK_OBJECT(map_drawing_area), "expose_event",
+ (GtkSignalFunc)expose_event, &map_pixmap);
+ gtk_signal_connect (GTK_OBJECT(map_drawing_area), "configure_event",
+ (GtkSignalFunc)configure_event, &map_pixmap);
+
+ gtk_widget_set_events(map_drawing_area, GDK_EXPOSURE_MASK);
+
+ /* hook up some signals for the gen drawing area */
+ gtk_signal_connect (GTK_OBJECT(gen_drawing_area), "expose_event",
+ (GtkSignalFunc)expose_event, &gen_pixmap);
+ gtk_signal_connect (GTK_OBJECT(gen_drawing_area), "configure_event",
+ (GtkSignalFunc)configure_event, &gen_pixmap);
+
+ gtk_widget_set_events(gen_drawing_area, GDK_EXPOSURE_MASK);
+
+ /* hook up some signals for the res drawing area */
+ gtk_signal_connect (GTK_OBJECT(res_drawing_area), "expose_event",
+ (GtkSignalFunc)expose_event, &res_pixmap);
+ gtk_signal_connect (GTK_OBJECT(res_drawing_area), "configure_event",
+ (GtkSignalFunc)configure_event, &res_pixmap);
+
+ gtk_widget_set_events(res_drawing_area, GDK_EXPOSURE_MASK);
+
+ /* allocate our colors */
+ colormap = gdk_colormap_get_system();
+ gdk_colormap_alloc_color(colormap, &bdescr_color, TRUE, TRUE);
+ gdk_colormap_alloc_color(colormap, &free_color, TRUE, TRUE);
+
+ {
+ gboolean success[N_GENS];
+ gdk_colormap_alloc_colors(colormap, gen_colors, N_GENS, TRUE,
+ TRUE, success);
+ if (!success) { barf("can't allocate colors"); }
+ }
+
+ /* set the labels on the generation histogram */
+ {
+ char buf[64];
+ nat g, s;
+ GtkWidget *label;
+
+ for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for(s = 0; s < generations[g].n_steps; s++) {
+ g_snprintf( buf, 64, "%d.%d", g, s );
+ label = gtk_label_new( buf );
+ gtk_box_pack_start( GTK_BOX(gen_hbox), label,
+ TRUE, TRUE, 5 );
+ gtk_widget_show(label);
+ }
+ }
+ }
+
+ gtk_widget_show(window);
+
+ /* wait for the user to press "Continue" before getting going... */
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id,
+ "Program start");
+ gtk_widget_set_sensitive( stop_but, FALSE );
+ continue_now = FALSE;
+ while (continue_now == FALSE) {
+ gtk_main_iteration();
+ }
+ gtk_statusbar_pop( GTK_STATUSBAR(statusbar), status_context_id );
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id,
+ "Running");
+
+ gtk_widget_set_sensitive( continue_but, FALSE );
+ gtk_widget_set_sensitive( stop_but, TRUE );
+ gtk_widget_set_sensitive( quit_but, FALSE );
+
+ while (gtk_events_pending()) {
+ gtk_main_iteration();
+ }
+}
+
+void
+stopFrontPanel( void )
+{
+ gtk_widget_set_sensitive( quit_but, TRUE );
+ gtk_widget_set_sensitive( continue_but, FALSE );
+ gtk_widget_set_sensitive( stop_but, FALSE );
+
+ updateFrontPanel();
+
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id,
+ "Program finished");
+
+ quit = FALSE;
+ while (quit == FALSE) {
+ gtk_main_iteration();
+ }
+}
+
+static void
+waitForContinue( void )
+{
+ gtk_widget_set_sensitive( continue_but, TRUE );
+ gtk_widget_set_sensitive( stop_but, FALSE );
+ stop_now = FALSE;
+ continue_now = FALSE;
+ while (continue_now == FALSE) {
+ gtk_main_iteration();
+ }
+ gtk_widget_set_sensitive( continue_but, FALSE );
+ gtk_widget_set_sensitive( stop_but, TRUE );
+}
+
+void
+updateFrontPanelBeforeGC( nat N )
+{
+ char buf[1000];
+
+ updateFrontPanel();
+
+ if (update_mode == BeforeGC
+ || update_mode == BeforeAfterGC
+ || stop_now == TRUE) {
+ g_snprintf( buf, 1000, "Stopped (before GC, generation %d)", N );
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id, buf );
+ waitForContinue();
+ gtk_statusbar_pop( GTK_STATUSBAR(statusbar), status_context_id );
+ }
+
+ g_snprintf( buf, 1000, "Garbage collecting (generation %d)", N );
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id, buf);
+
+ while (gtk_events_pending()) {
+ gtk_main_iteration();
+ }
+}
+
+static void
+numLabel( GtkWidget *lbl, nat n )
+{
+ char buf[64];
+ g_snprintf(buf, 64, "%d", n);
+ gtk_label_set_text( GTK_LABEL(lbl), buf );
+}
+
+/* updateFrontPanelAfterGC
+ *
+ * GC hook, called after collecting generation N; 'live' is the number
+ * of live words.  Takes a residency census on a major GC, optionally
+ * pauses for the user, then refreshes the live / allocated / footprint /
+ * allocation-rate labels (all converted from words to megabytes).
+ */
+void
+updateFrontPanelAfterGC( nat N, lnat live )
+{
+ char buf[1000];
+
+ gtk_statusbar_pop( GTK_STATUSBAR(statusbar), status_context_id );
+
+ /* is a major GC? */
+ if (N == RtsFlags.GcFlags.generations-1) {
+ residencyCensus();
+ }
+
+ updateFrontPanel();
+
+ if (update_mode == AfterGC
+ || update_mode == BeforeAfterGC
+ || stop_now == TRUE) {
+ /* 'nat' is unsigned: %u, not %d */
+ snprintf( buf, 1000, "Stopped (after GC, generation %u)", N );
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id, buf );
+ waitForContinue();
+ gtk_statusbar_pop( GTK_STATUSBAR(statusbar), status_context_id );
+ }
+
+ {
+ /* despite the name, this is the number of words per megabyte */
+ double words_to_megs = (1024 * 1024) / sizeof(W_);
+ double time = mut_user_time();
+
+ snprintf( buf, 1000, "%.2f", (double)live / words_to_megs );
+ gtk_label_set_text( GTK_LABEL(live_label), buf );
+
+ snprintf( buf, 1000, "%.2f", (double)total_allocated / words_to_megs );
+ gtk_label_set_text( GTK_LABEL(allocated_label), buf );
+
+ snprintf( buf, 1000, "%.2f",
+ (double)(mblocks_allocated * MBLOCK_SIZE_W) / words_to_megs );
+ gtk_label_set_text( GTK_LABEL(footprint_label), buf );
+
+ /* avoid dividing by zero before any mutator time has elapsed */
+ if ( time == 0.0 )
+ snprintf( buf, 1000, "%.2f", time );
+ else
+ snprintf( buf, 1000, "%.2f",
+ (double)(total_allocated / words_to_megs) / time );
+ gtk_label_set_text( GTK_LABEL(alloc_rate_label), buf );
+ }
+
+ /* flush pending GTK events so the new figures appear immediately */
+ while (gtk_events_pending()) {
+ gtk_main_iteration();
+ }
+}
+
+/* updateFrontPanel
+ *
+ * Redraw the whole panel: the thread-state counts, the block map (one
+ * cell per BLOCK_SIZE block of each allocated megablock), the
+ * per-generation/step block histogram, and the residency graph.
+ * Finishes by draining pending GTK events without blocking.
+ */
+void
+updateFrontPanel( void )
+{
+ void *m, *a;
+ bdescr *bd; /* only used by the disabled (#if 0) block-colouring code */
+
+ updateThreadsPanel();
+
+ /* lazily create the GC (graphics context) on first use */
+ if (my_gc == NULL) {
+ my_gc = gdk_gc_new( window->window );
+ }
+
+ if (map_pixmap != NULL) {
+ nat height, width, blocks_per_line,
+ block_height, block_width, mblock_height;
+
+ height = map_drawing_area->allocation.height;
+ width = map_drawing_area->allocation.width;
+
+ /* NOTE(review): assumes mblocks_allocated > 0 (divisor below) */
+ mblock_height = height / mblocks_allocated;
+ /* widen the rows (more blocks per line) until each row of block
+ * cells is at least one pixel tall */
+ blocks_per_line = 16;
+ block_height = mblock_height /
+ ((MBLOCK_SIZE/BLOCK_SIZE) / blocks_per_line);
+ while (block_height == 0) {
+ blocks_per_line *= 2;
+ block_height = mblock_height /
+ ((MBLOCK_SIZE/BLOCK_SIZE) / blocks_per_line);
+ }
+ block_width = width / blocks_per_line;
+
+ /* clear the map to the background colour */
+ gdk_draw_rectangle (map_pixmap,
+ map_drawing_area->style->bg_gc[GTK_STATE_NORMAL],
+ TRUE,
+ 0, 0,
+ map_drawing_area->allocation.width,
+ map_drawing_area->allocation.height);
+
+ /* walk each allocated megablock; note the cast-as-lvalue
+ * increment ((char *)m += MBLOCK_SIZE) is a GCC extension,
+ * not standard C */
+ for ( m = mem_start;
+ (char *)m < (char *)mem_start +
+ (mblocks_allocated * MBLOCK_SIZE);
+ (char *)m += MBLOCK_SIZE ) {
+
+ /* color the bdescr area first */
+ for (a = m; a < FIRST_BLOCK(m); (char *)a += BLOCK_SIZE) {
+ colorBlock( a, &bdescr_color,
+ block_width, block_height, blocks_per_line );
+ }
+
+#if 0 /* Segfaults because bd appears to be bogus but != NULL. stolz, 2003-06-24 */
+ /* color each block */
+ for (; a <= LAST_BLOCK(m); (char *)a += BLOCK_SIZE) {
+ bd = Bdescr((P_)a);
+ ASSERT(bd->start == a);
+ if (bd->flags & BF_FREE) {
+ colorBlock( a, &free_color,
+ block_width, block_height, blocks_per_line );
+ } else {
+ colorBlock( a, &gen_colors[bd->gen_no],
+ block_width, block_height, blocks_per_line );
+ }
+ }
+#endif
+ }
+
+
+ /* set the ruler so it reflects the rounded cell sizes actually
+ * drawn above (blocks_per_line is still in scope here) */
+ {
+ nat height = map_drawing_area->allocation.height,
+ block_height, mblock_height;
+
+ block_height = (height / mblocks_allocated) /
+ ((MBLOCK_SIZE/BLOCK_SIZE) / blocks_per_line);
+ if (block_height < 1) block_height = 1;
+ mblock_height = block_height *
+ ((MBLOCK_SIZE/BLOCK_SIZE) / blocks_per_line);
+
+ gtk_ruler_set_range( GTK_RULER(map_ruler), 0,
+ (double)(height * mblocks_allocated) /
+ (double)((mblock_height * mblocks_allocated)),
+ 0,
+ (double)(height * mblocks_allocated) /
+ (double)((mblock_height * mblocks_allocated))
+ );
+ }
+
+ gtk_widget_draw( map_drawing_area, NULL );
+ }
+
+ if (gen_pixmap != NULL) {
+
+ GdkRectangle rect;
+ nat g, s, columns, column, max_blocks, height_blocks,
+ width, height;
+
+ /* clear the histogram */
+ gdk_draw_rectangle (gen_pixmap,
+ gen_drawing_area->style->white_gc,
+ TRUE,
+ 0, 0,
+ gen_drawing_area->allocation.width,
+ gen_drawing_area->allocation.height);
+
+ height = gen_drawing_area->allocation.height;
+ width = gen_drawing_area->allocation.width;
+
+ /* one column per step; find the tallest step in blocks */
+ columns = 0; max_blocks = 0;
+ for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ columns += generations[g].n_steps;
+ for(s = 0; s < generations[g].n_steps; s++) {
+ if (generations[g].steps[s].n_blocks > max_blocks) {
+ max_blocks = generations[g].steps[s].n_blocks;
+ }
+ }
+ }
+
+ /* find a reasonable height value larger than max_blocks */
+ {
+ /* smallest power of two strictly greater than max_blocks;
+ * NB: clobbers max_blocks */
+ nat n = 0;
+ while (max_blocks != 0) {
+ max_blocks >>= 1; n++;
+ }
+ height_blocks = 1 << n;
+ }
+
+ /* draw one bar per step, coloured by generation, height
+ * proportional to n_blocks / height_blocks */
+ column = 0;
+ for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for(s = 0; s < generations[g].n_steps; s++, column++) {
+ gdk_gc_set_foreground(my_gc, &gen_colors[g]);
+
+ rect.x = column * (width / columns);
+
+ if (generations[g].steps[s].n_blocks == 0)
+ rect.y = height;
+ else
+ rect.y = height -
+ (height * generations[g].steps[s].n_blocks
+ / height_blocks);
+
+ rect.width = (width / columns);
+ rect.height = height - rect.y;
+
+ gdk_draw_rectangle( gen_pixmap, my_gc, TRUE/*filled*/,
+ rect.x, rect.y, rect.width,
+ rect.height );
+ }
+ }
+
+ /* ruler in megabytes (height_blocks * BLOCK_SIZE bytes at the top) */
+ gtk_ruler_set_range( GTK_RULER(gen_ruler),
+ height_blocks * BLOCK_SIZE / (1024 * 1024),
+ 0, 0,
+ height_blocks * BLOCK_SIZE / (1024 * 1024)
+ );
+
+ gtk_widget_draw( gen_drawing_area, NULL );
+ }
+
+ if (res_pixmap != NULL) {
+ updateResidencyGraph();
+ }
+
+ /* process whatever GTK has queued, but don't block */
+ while (gtk_events_pending()) {
+ gtk_main_iteration_do(FALSE/*don't block*/);
+ }
+}
+
+/* colorBlock
+ *
+ * Fill the cell for the block containing 'addr' in the memory-map
+ * pixmap with 'color'.  The map is laid out row-major in address order
+ * from mem_start, 'blocks_per_line' cells per row, each cell
+ * block_width x block_height pixels.
+ */
+static void
+colorBlock( void *addr, GdkColor *color,
+ nat block_width, nat block_height, nat blocks_per_line )
+{
+ GdkRectangle rect;
+ nat block_no;
+
+ gdk_gc_set_foreground(my_gc, color);
+
+ /* linear block index of 'addr' within the mapped region */
+ block_no = ((char *)addr - (char *)mem_start) / BLOCK_SIZE;
+
+ rect.x = (block_no % blocks_per_line) * block_width;
+ rect.y = block_no / blocks_per_line * block_height;
+ rect.width = block_width;
+ rect.height = block_height;
+ gdk_draw_rectangle( map_pixmap, my_gc, TRUE/*filled*/,
+ rect.x, rect.y, rect.width, rect.height );
+}
+
+/* updateThreadsPanel
+ *
+ * Walk the global all_threads list and refresh the per-state thread
+ * count labels (running / blocked on read, write, MVar, throwTo, black
+ * hole / sleeping, plus a total), skipping killed or completed threads.
+ */
+static void
+updateThreadsPanel( void )
+{
+ nat running = 0,
+ b_read = 0,
+ b_write = 0,
+ b_mvar = 0,
+ b_throwto = 0,
+ b_bh = 0,
+ sleeping = 0,
+ total = 0;
+
+ StgTSO *t;
+
+ for (t = all_threads; t != END_TSO_QUEUE; t = t->global_link) {
+ switch (t->what_next) {
+ case ThreadKilled: break;
+ case ThreadComplete: break;
+ default:
+ /* NOTE(review): the inner switch has no default case, so a
+ * thread blocked for any other reason is counted in no
+ * bucket and is therefore also missing from 'total'. */
+ switch (t->why_blocked) {
+ case BlockedOnRead: b_read++; break;
+ case BlockedOnWrite: b_write++; break;
+ case BlockedOnDelay: sleeping++; break;
+ case BlockedOnMVar: b_mvar++; break;
+ case BlockedOnException: b_throwto++; break;
+ case BlockedOnBlackHole: b_bh++; break;
+ case NotBlocked: running++; break;
+ }
+ }
+ }
+ total = running + b_read + b_write + b_mvar + b_throwto + b_bh + sleeping;
+ numLabel(running_label, running);
+ numLabel(b_read_label, b_read);
+ numLabel(b_write_label, b_write);
+ numLabel(b_mvar_label, b_mvar);
+ numLabel(b_bh_label, b_bh);
+ numLabel(b_throwto_label, b_throwto);
+ numLabel(sleeping_label, sleeping);
+ numLabel(total_label, total);
+}
+
+/* Residency profiling: one census per major GC, up to N_SLICES of them,
+ * bucketing live words by closure category. */
+typedef enum { Thunk, Fun, Constr, BlackHole,
+ Array, Thread, Other, N_Cats } ClosureCategory;
+
+#define N_SLICES 100
+
+static nat *res_prof[N_SLICES]; /* per-slice word counts, indexed by ClosureCategory */
+static double res_time[N_SLICES]; /* mutator time at which each census was taken */
+static nat next_slice = 0; /* next free slot in res_prof/res_time */
+
+/* residencyCensus
+ *
+ * Record one residency slice: walk every block of every generation/step
+ * (skipping the nursery when multi-generational) and accumulate each
+ * closure's size in words into res_prof[slice], bucketed by
+ * ClosureCategory; the current mutator time goes in res_time[slice].
+ * Called from updateFrontPanelAfterGC() on a major GC.
+ */
+static void
+residencyCensus( void )
+{
+ nat slice = next_slice++, *prof;
+ bdescr *bd = NULL; /* was uninitialised: the one-generation branch below
+ * is FIXME'd out, so 'bd' was read unset in that
+ * configuration (undefined behaviour).  NULL makes
+ * the block walk a safe no-op instead. */
+ nat g, s, size, type;
+ StgPtr p;
+ StgInfoTable *info;
+
+ if (slice >= N_SLICES) {
+ barf("too many slices");
+ }
+ res_prof[slice] = stgMallocBytes(N_Cats * sizeof(nat), "residencyCensus");
+ prof = res_prof[slice];
+ memset(prof, 0, N_Cats * sizeof(nat));
+
+ res_time[slice] = mut_user_time();
+
+ for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for(s = 0; s < generations[g].n_steps; s++) {
+
+ /* skip over g0s0 if multi-generational */
+ if (RtsFlags.GcFlags.generations > 1 &&
+ g == 0 && s == 0) continue;
+
+ if (RtsFlags.GcFlags.generations == 1) {
+/* bd = generations[g].steps[s].to_blocks; FIXME to_blocks does not exist */
+ } else {
+ bd = generations[g].steps[s].blocks;
+ }
+
+ for (; bd != NULL; bd = bd->link) {
+
+ p = bd->start;
+
+ /* walk the closures in this block; 'size' is in words */
+ while (p < bd->free) {
+ info = get_itbl((StgClosure *)p);
+ type = Other;
+
+ switch (info->type) {
+
+ case CONSTR:
+ case BCO:
+ /* a dead weak pointer masquerades as a constructor */
+ if (((StgClosure *)p)->header.info == &stg_DEAD_WEAK_info) {
+ size = sizeofW(StgWeak);
+ type = Other;
+ break;
+ }
+ /* else, fall through... */
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ case CONSTR_2_0:
+ size = sizeW_fromITBL(info);
+ type = Constr;
+ break;
+
+ case FUN_1_0:
+ case FUN_0_1:
+ size = sizeofW(StgHeader) + 1;
+ goto fun;
+ case FUN_1_1:
+ case FUN_0_2:
+ case FUN_2_0:
+ case FUN:
+ size = sizeW_fromITBL(info);
+ fun:
+ type = Fun;
+ break;
+
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_SELECTOR:
+ size = sizeofW(StgHeader) + 2;
+ goto thunk;
+ case THUNK_1_1:
+ case THUNK_0_2:
+ case THUNK_2_0:
+ case THUNK:
+ size = sizeW_fromITBL(info);
+ thunk:
+ type = Thunk;
+ break;
+
+ case CAF_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case SE_BLACKHOLE:
+ case BLACKHOLE:
+/* case BLACKHOLE_BQ: FIXME: case does not exist */
+ size = sizeW_fromITBL(info);
+ type = BlackHole;
+ break;
+
+ case AP:
+ size = pap_sizeW((StgPAP *)p);
+ type = Thunk;
+ break;
+
+ case PAP:
+ size = pap_sizeW((StgPAP *)p);
+ type = Fun;
+ break;
+
+ case ARR_WORDS:
+ size = arr_words_sizeW(stgCast(StgArrWords*,p));
+ type = Array;
+ break;
+
+ case MUT_ARR_PTRS:
+ case MUT_ARR_PTRS_FROZEN:
+ size = mut_arr_ptrs_sizeW((StgMutArrPtrs *)p);
+ type = Array;
+ break;
+
+ case TSO:
+ size = tso_sizeW((StgTSO *)p);
+ type = Thread;
+ break;
+
+ case WEAK:
+ case STABLE_NAME:
+ case MVAR:
+ case MUT_VAR:
+/* case MUT_CONS: FIXME: case does not exist */
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ size = sizeW_fromITBL(info);
+ type = Other;
+ break;
+
+ default:
+ barf("updateResidencyGraph: strange closure "
+ "%d", info->type );
+ }
+
+ prof[type] += size;
+ p += size;
+ }
+ }
+ }
+ }
+
+}
+
+/* updateResidencyGraph
+ *
+ * Redraw the residency-over-time graph from the slices recorded by
+ * residencyCensus().  The x axis is mutator time and the y axis total
+ * residency; each axis scale is doubled until the data fits.  The curve
+ * is drawn as a fan of filled quadrilaterals between successive slices.
+ * (Removed the unused local 'prev_total'; it was assigned but never
+ * read.)
+ */
+static void
+updateResidencyGraph( void )
+{
+ nat total, i, max_res;
+ double time;
+ double time_scale = 1;
+ nat last_slice = next_slice-1;
+ double res_scale = 1; /* in words; grown in powers of two below
+ * (the old comment said megabytes, which was
+ * wrong — max_res is a word count) */
+ nat *prof;
+ nat width, height;
+ GdkPoint points[4];
+
+ /* clear to the background colour */
+ gdk_draw_rectangle (res_pixmap,
+ res_drawing_area->style->bg_gc[GTK_STATE_NORMAL],
+ TRUE,
+ 0, 0,
+ res_drawing_area->allocation.width,
+ res_drawing_area->allocation.height);
+
+ if (next_slice == 0) return;
+
+ /* grow the time axis until it covers the latest census */
+ time = res_time[last_slice];
+ while (time > time_scale) {
+ time_scale *= 2;
+ }
+
+ /* find the peak residency, then grow the y axis to cover it.
+ * NOTE(review): prof[Thread] is not included in the total here or
+ * below — TSOs are apparently excluded from the graph; confirm. */
+ max_res = 0;
+ for (i = 0; i < next_slice; i++) {
+ prof = res_prof[i];
+ total = prof[Thunk] + prof[Fun] + prof[Constr] +
+ prof[BlackHole] + prof[Array] + prof[Other];
+ if (total > max_res) {
+ max_res = total;
+ }
+ }
+ while (max_res > res_scale) {
+ res_scale *= 2;
+ }
+
+ height = res_drawing_area->allocation.height;
+ width = res_drawing_area->allocation.width;
+
+ /* points[0]/[1] track the bottom edge, [2]/[3] the curve; the
+ * quad for each slice spans from the previous slice's x */
+ points[0].x = 0;
+ points[0].y = height;
+ points[1].y = height;
+ points[3].x = 0;
+ points[3].y = height;
+
+ gdk_gc_set_foreground(my_gc, &free_color);
+
+ for (i = 0; i < next_slice; i++) {
+ prof = res_prof[i];
+ total = prof[Thunk] + prof[Fun] + prof[Constr] +
+ prof[BlackHole] + prof[Array] + prof[Other];
+ points[1].x = width * res_time[i] / time_scale;
+ points[2].x = points[1].x;
+ points[2].y = height - ((height * total) / res_scale);
+ gdk_draw_polygon(res_pixmap, my_gc, TRUE/*filled*/, points, 4);
+ points[3] = points[2];
+ points[0] = points[1];
+ }
+
+ /* vertical ruler is labelled in megabytes (res_scale is in words) */
+ gtk_ruler_set_range( GTK_RULER(res_vruler),
+ res_scale / ((1024*1024)/sizeof(W_)),
+ 0, 0,
+ res_scale / ((1024*1024)/sizeof(W_)) );
+
+ gtk_ruler_set_range( GTK_RULER(res_hruler),
+ 0, time_scale, 0, time_scale );
+
+
+ gtk_widget_draw( res_drawing_area, NULL );
+}
+
+#endif /* RTS_GTK_FRONTPANEL */
diff --git a/rts/FrontPanel.h b/rts/FrontPanel.h
new file mode 100644
index 0000000000..de3b741657
--- /dev/null
+++ b/rts/FrontPanel.h
@@ -0,0 +1,35 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2000-2005
+ *
+ * RTS GTK Front Panel
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef FRONTPANEL_H
+#define FRONTPANEL_H
+
+#ifdef RTS_GTK_FRONTPANEL
+
+#include "Rts.h" /* needed because this file gets included by
+ * auto-generated code */
+
+void initFrontPanel( void ); /* presumably creates the window — see FrontPanel.c */
+void stopFrontPanel( void ); /* final update; blocks until the user quits */
+void updateFrontPanelBeforeGC( nat N ); /* GC hook: before collecting generation N */
+void updateFrontPanelAfterGC( nat N, lnat live ); /* GC hook: after GC; 'live' in words */
+void updateFrontPanel( void ); /* redraw all panel widgets */
+
+
+/* --------- PRIVATE ----------------------------------------- */
+
+#include <gdk/gdktypes.h>
+
+typedef enum { BeforeGC, AfterGC, BeforeAfterGC, Continuous } UpdateMode;
+extern UpdateMode update_mode; /* when the panel pauses for the user */
+extern gboolean continue_now, stop_now, quit; /* set by button callbacks */
+
+#endif /* RTS_GTK_FRONTPANEL */
+
+#endif /* FRONTPANEL_H */
+
diff --git a/rts/GC.c b/rts/GC.c
new file mode 100644
index 0000000000..a13cd33afa
--- /dev/null
+++ b/rts/GC.c
@@ -0,0 +1,4719 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2003
+ *
+ * Generational garbage collector
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Apply.h"
+#include "OSThreads.h"
+#include "Storage.h"
+#include "LdvProfile.h"
+#include "Updates.h"
+#include "Stats.h"
+#include "Schedule.h"
+#include "Sanity.h"
+#include "BlockAlloc.h"
+#include "MBlock.h"
+#include "ProfHeap.h"
+#include "SchedAPI.h"
+#include "Weak.h"
+#include "Prelude.h"
+#include "ParTicky.h" // ToDo: move into Rts.h
+#include "GCCompact.h"
+#include "RtsSignals.h"
+#include "STM.h"
+#if defined(GRAN) || defined(PAR)
+# include "GranSimRts.h"
+# include "ParallelRts.h"
+# include "FetchMe.h"
+# if defined(DEBUG)
+# include "Printer.h"
+# include "ParallelDebug.h"
+# endif
+#endif
+#include "HsFFI.h"
+#include "Linker.h"
+#if defined(RTS_GTK_FRONTPANEL)
+#include "FrontPanel.h"
+#endif
+
+#include "RetainerProfile.h"
+
+#include <string.h>
+
+// Turn off inlining when debugging - it obfuscates things
+#ifdef DEBUG
+# undef STATIC_INLINE
+# define STATIC_INLINE static
+#endif
+
+/* STATIC OBJECT LIST.
+ *
+ * During GC:
+ * We maintain a linked list of static objects that are still live.
+ * The requirements for this list are:
+ *
+ * - we need to scan the list while adding to it, in order to
+ * scavenge all the static objects (in the same way that
+ * breadth-first scavenging works for dynamic objects).
+ *
+ * - we need to be able to tell whether an object is already on
+ * the list, to break loops.
+ *
+ * Each static object has a "static link field", which we use for
+ * linking objects on to the list. We use a stack-type list, consing
+ * objects on the front as they are added (this means that the
+ * scavenge phase is depth-first, not breadth-first, but that
+ * shouldn't matter).
+ *
+ * A separate list is kept for objects that have been scavenged
+ * already - this is so that we can zero all the marks afterwards.
+ *
+ * An object is on the list if its static link field is non-zero; this
+ * means that we have to mark the end of the list with '1', not NULL.
+ *
+ * Extra notes for generational GC:
+ *
+ * Each generation has a static object list associated with it. When
+ * collecting generations up to N, we treat the static object lists
+ * from generations > N as roots.
+ *
+ * We build up a static object list while collecting generations 0..N,
+ * which is then appended to the static object list of generation N+1.
+ */
+static StgClosure* static_objects; // live static objects
+StgClosure* scavenged_static_objects; // static objects scavenged so far
+
+/* N is the oldest generation being collected, where the generations
+ * are numbered starting at 0. A major GC (indicated by the major_gc
+ * flag) is when we're collecting all generations. We only attempt to
+ * deal with static objects and GC CAFs when doing a major GC.
+ */
+static nat N;
+static rtsBool major_gc;
+
+/* Youngest generation that objects should be evacuated to in
+ * evacuate(). (Logically an argument to evacuate, but it's static
+ * a lot of the time so we optimise it into a global variable).
+ */
+static nat evac_gen;
+
+/* Whether to do eager promotion or not.
+ */
+static rtsBool eager_promotion;
+
+/* Weak pointers
+ */
+StgWeak *old_weak_ptr_list; // also pending finaliser list
+
+/* Which stage of processing various kinds of weak pointer are we at?
+ * (see traverse_weak_ptr_list() below for discussion).
+ */
+typedef enum { WeakPtrs, WeakThreads, WeakDone } WeakStage;
+static WeakStage weak_stage;
+
+/* List of all threads during GC
+ */
+static StgTSO *old_all_threads;
+StgTSO *resurrected_threads;
+
+/* Flag indicating failure to evacuate an object to the desired
+ * generation.
+ */
+static rtsBool failed_to_evac;
+
+/* Saved nursery (used for 2-space collector only)
+ */
+static bdescr *saved_nursery;
+static nat saved_n_blocks;
+
+/* Data used for allocation area sizing.
+ */
+static lnat new_blocks; // blocks allocated during this GC
+static lnat new_scavd_blocks; // ditto, but depth-first blocks
+static lnat g0s0_pcnt_kept = 30; // percentage of g0s0 live at last minor GC
+
+/* Used to avoid long recursion due to selector thunks
+ */
+static lnat thunk_selector_depth = 0;
+#define MAX_THUNK_SELECTOR_DEPTH 8 // recursion bound for eval_thunk_selector()
+
+/* Mut-list stats: counts of entries seen on the mutable lists, by kind */
+#ifdef DEBUG
+static nat
+ mutlist_MUTVARS,
+ mutlist_MUTARRS,
+ mutlist_OTHERS;
+#endif
+
+/* -----------------------------------------------------------------------------
+ Static function declarations
+ -------------------------------------------------------------------------- */
+
+static bdescr * gc_alloc_block ( step *stp );
+static void mark_root ( StgClosure **root );
+
+// Use a register argument for evacuate, if available.
+#if __GNUC__ >= 2
+#define REGPARM1 __attribute__((regparm(1)))
+#else
+#define REGPARM1
+#endif
+
+REGPARM1 static StgClosure * evacuate (StgClosure *q);
+
+static void zero_static_object_list ( StgClosure* first_static );
+
+static rtsBool traverse_weak_ptr_list ( void );
+static void mark_weak_ptr_list ( StgWeak **list );
+
+static StgClosure * eval_thunk_selector ( nat field, StgSelector * p );
+
+
+static void scavenge ( step * );
+static void scavenge_mark_stack ( void );
+static void scavenge_stack ( StgPtr p, StgPtr stack_end );
+static rtsBool scavenge_one ( StgPtr p );
+static void scavenge_large ( step * );
+static void scavenge_static ( void );
+static void scavenge_mutable_list ( generation *g );
+
+static void scavenge_large_bitmap ( StgPtr p,
+ StgLargeBitmap *large_bitmap,
+ nat size );
+
+#if 0 && defined(DEBUG)
+static void gcCAFs ( void );
+#endif
+
+/* -----------------------------------------------------------------------------
+ inline functions etc. for dealing with the mark bitmap & stack.
+ -------------------------------------------------------------------------- */
+
+#define MARK_STACK_BLOCKS 4
+
+static bdescr *mark_stack_bdescr; /* block group holding the mark stack */
+static StgPtr *mark_stack; /* base of the stack */
+static StgPtr *mark_sp; /* next free slot; the stack grows upwards */
+static StgPtr *mark_splim; /* one past the last usable slot */
+
+// Flag and pointers used for falling back to a linear scan when the
+// mark stack overflows.
+static rtsBool mark_stack_overflowed;
+static bdescr *oldgen_scan_bd;
+static StgPtr oldgen_scan;
+
+/* true iff the mark stack holds no entries */
+STATIC_INLINE rtsBool
+mark_stack_empty(void)
+{
+ return mark_sp == mark_stack;
+}
+
+/* true iff a further push would run past mark_splim */
+STATIC_INLINE rtsBool
+mark_stack_full(void)
+{
+ return mark_sp >= mark_splim;
+}
+
+/* discard all entries, resetting the stack to empty */
+STATIC_INLINE void
+reset_mark_stack(void)
+{
+ mark_sp = mark_stack;
+}
+
+/* push p; no bounds check — caller must test mark_stack_full() first */
+STATIC_INLINE void
+push_mark_stack(StgPtr p)
+{
+ *mark_sp++ = p;
+}
+
+/* pop the most recently pushed pointer; stack must be non-empty */
+STATIC_INLINE StgPtr
+pop_mark_stack(void)
+{
+ return *--mark_sp;
+}
+
+/* -----------------------------------------------------------------------------
+ Allocate a new to-space block in the given step.
+ -------------------------------------------------------------------------- */
+
+/* Allocate a fresh block, flag it, chain it onto the end of the step's
+ * to-space chain, and make it the current allocation block (hp/hpLim).
+ * Also bumps new_blocks, used later for allocation-area sizing. */
+static bdescr *
+gc_alloc_block(step *stp)
+{
+ bdescr *bd = allocBlock();
+ bd->gen_no = stp->gen_no;
+ bd->step = stp;
+ bd->link = NULL;
+
+ // blocks in to-space in generations up to and including N
+ // get the BF_EVACUATED flag.
+ if (stp->gen_no <= N) {
+ bd->flags = BF_EVACUATED;
+ } else {
+ bd->flags = 0;
+ }
+
+ // Start a new to-space block, chain it on after the previous one.
+ if (stp->hp_bd != NULL) {
+ stp->hp_bd->free = stp->hp;
+ stp->hp_bd->link = bd;
+ }
+
+ stp->hp_bd = bd;
+ stp->hp = bd->start;
+ stp->hpLim = stp->hp + BLOCK_SIZE_W;
+
+ stp->n_blocks++;
+ new_blocks++;
+
+ return bd;
+}
+
+/* Like gc_alloc_block, but for the "already scavenged" area: the new
+ * block goes on the *front* of stp->blocks (so the scavenging sweep
+ * will not traverse it) and the scavd_hp/scavd_hpLim pointers are
+ * advanced instead of hp/hpLim.  Bumps new_scavd_blocks. */
+static bdescr *
+gc_alloc_scavd_block(step *stp)
+{
+ bdescr *bd = allocBlock();
+ bd->gen_no = stp->gen_no;
+ bd->step = stp;
+
+ // blocks in to-space in generations up to and including N
+ // get the BF_EVACUATED flag.
+ if (stp->gen_no <= N) {
+ bd->flags = BF_EVACUATED;
+ } else {
+ bd->flags = 0;
+ }
+
+ bd->link = stp->blocks;
+ stp->blocks = bd;
+
+ /* close off the previous scavd block, if any */
+ if (stp->scavd_hp != NULL) {
+ Bdescr(stp->scavd_hp)->free = stp->scavd_hp;
+ }
+ stp->scavd_hp = bd->start;
+ stp->scavd_hpLim = stp->scavd_hp + BLOCK_SIZE_W;
+
+ stp->n_blocks++;
+ new_scavd_blocks++;
+
+ return bd;
+}
+
+/* -----------------------------------------------------------------------------
+ GarbageCollect
+
+ Rough outline of the algorithm: for garbage collecting generation N
+ (and all younger generations):
+
+ - follow all pointers in the root set. the root set includes all
+ mutable objects in all generations (mutable_list).
+
+ - for each pointer, evacuate the object it points to into either
+
+ + to-space of the step given by step->to, which is the next
+ highest step in this generation or the first step in the next
+ generation if this is the last step.
+
+ + to-space of generations[evac_gen]->steps[0], if evac_gen != 0.
+ When we evacuate an object we attempt to evacuate
+ everything it points to into the same generation - this is
+ achieved by setting evac_gen to the desired generation. If
+ we can't do this, then an entry in the mut list has to
+ be made for the cross-generation pointer.
+
+ + if the object is already in a generation > N, then leave
+ it alone.
+
+ - repeatedly scavenge to-space from each step in each generation
+ being collected until no more objects can be evacuated.
+
+ - free from-space in each step, and set from-space = to-space.
+
+ Locks held: all capabilities are held throughout GarbageCollect().
+
+ -------------------------------------------------------------------------- */
+
+void
+GarbageCollect ( void (*get_roots)(evac_fn), rtsBool force_major_gc )
+{
+ bdescr *bd;
+ step *stp;
+ lnat live, allocated, copied = 0, scavd_copied = 0;
+ lnat oldgen_saved_blocks = 0;
+ nat g, s, i;
+
+ ACQUIRE_SM_LOCK;
+
+#ifdef PROFILING
+ CostCentreStack *prev_CCS;
+#endif
+
+#if defined(DEBUG) && defined(GRAN)
+ IF_DEBUG(gc, debugBelch("@@ Starting garbage collection at %ld (%lx)\n",
+ Now, Now));
+#endif
+
+#if defined(RTS_USER_SIGNALS)
+ // block signals
+ blockUserSignals();
+#endif
+
+ // tell the STM to discard any cached closures its hoping to re-use
+ stmPreGCHook();
+
+ // tell the stats department that we've started a GC
+ stat_startGC();
+
+#ifdef DEBUG
+ // check for memory leaks if DEBUG is on
+ memInventory();
+#endif
+
+#ifdef DEBUG
+ mutlist_MUTVARS = 0;
+ mutlist_MUTARRS = 0;
+ mutlist_OTHERS = 0;
+#endif
+
+ // Init stats and print par specific (timing) info
+ PAR_TICKY_PAR_START();
+
+ // attribute any costs to CCS_GC
+#ifdef PROFILING
+ prev_CCS = CCCS;
+ CCCS = CCS_GC;
+#endif
+
+ /* Approximate how much we allocated.
+ * Todo: only when generating stats?
+ */
+ allocated = calcAllocated();
+
+ /* Figure out which generation to collect
+ */
+ if (force_major_gc) {
+ N = RtsFlags.GcFlags.generations - 1;
+ major_gc = rtsTrue;
+ } else {
+ N = 0;
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ if (generations[g].steps[0].n_blocks +
+ generations[g].steps[0].n_large_blocks
+ >= generations[g].max_blocks) {
+ N = g;
+ }
+ }
+ major_gc = (N == RtsFlags.GcFlags.generations-1);
+ }
+
+#ifdef RTS_GTK_FRONTPANEL
+ if (RtsFlags.GcFlags.frontpanel) {
+ updateFrontPanelBeforeGC(N);
+ }
+#endif
+
+ // check stack sanity *before* GC (ToDo: check all threads)
+#if defined(GRAN)
+ // ToDo!: check sanity IF_DEBUG(sanity, checkTSOsSanity());
+#endif
+ IF_DEBUG(sanity, checkFreeListSanity());
+
+ /* Initialise the static object lists
+ */
+ static_objects = END_OF_STATIC_LIST;
+ scavenged_static_objects = END_OF_STATIC_LIST;
+
+ /* Save the nursery if we're doing a two-space collection.
+ * g0s0->blocks will be used for to-space, so we need to get the
+ * nursery out of the way.
+ */
+ if (RtsFlags.GcFlags.generations == 1) {
+ saved_nursery = g0s0->blocks;
+ saved_n_blocks = g0s0->n_blocks;
+ g0s0->blocks = NULL;
+ g0s0->n_blocks = 0;
+ }
+
+ /* Keep a count of how many new blocks we allocated during this GC
+ * (used for resizing the allocation area, later).
+ */
+ new_blocks = 0;
+ new_scavd_blocks = 0;
+
+ // Initialise to-space in all the generations/steps that we're
+ // collecting.
+ //
+ for (g = 0; g <= N; g++) {
+
+ // throw away the mutable list. Invariant: the mutable list
+ // always has at least one block; this means we can avoid a check for
+ // NULL in recordMutable().
+ if (g != 0) {
+ freeChain(generations[g].mut_list);
+ generations[g].mut_list = allocBlock();
+ for (i = 0; i < n_capabilities; i++) {
+ freeChain(capabilities[i].mut_lists[g]);
+ capabilities[i].mut_lists[g] = allocBlock();
+ }
+ }
+
+ for (s = 0; s < generations[g].n_steps; s++) {
+
+ // generation 0, step 0 doesn't need to-space
+ if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) {
+ continue;
+ }
+
+ stp = &generations[g].steps[s];
+ ASSERT(stp->gen_no == g);
+
+ // start a new to-space for this step.
+ stp->old_blocks = stp->blocks;
+ stp->n_old_blocks = stp->n_blocks;
+
+ // allocate the first to-space block; extra blocks will be
+ // chained on as necessary.
+ stp->hp_bd = NULL;
+ bd = gc_alloc_block(stp);
+ stp->blocks = bd;
+ stp->n_blocks = 1;
+ stp->scan = bd->start;
+ stp->scan_bd = bd;
+
+ // allocate a block for "already scavenged" objects. This goes
+ // on the front of the stp->blocks list, so it won't be
+ // traversed by the scavenging sweep.
+ gc_alloc_scavd_block(stp);
+
+ // initialise the large object queues.
+ stp->new_large_objects = NULL;
+ stp->scavenged_large_objects = NULL;
+ stp->n_scavenged_large_blocks = 0;
+
+ // mark the large objects as not evacuated yet
+ for (bd = stp->large_objects; bd; bd = bd->link) {
+ bd->flags &= ~BF_EVACUATED;
+ }
+
+ // for a compacted step, we need to allocate the bitmap
+ if (stp->is_compacted) {
+ nat bitmap_size; // in bytes
+ bdescr *bitmap_bdescr;
+ StgWord *bitmap;
+
+ bitmap_size = stp->n_old_blocks * BLOCK_SIZE / (sizeof(W_)*BITS_PER_BYTE);
+
+ if (bitmap_size > 0) {
+ bitmap_bdescr = allocGroup((lnat)BLOCK_ROUND_UP(bitmap_size)
+ / BLOCK_SIZE);
+ stp->bitmap = bitmap_bdescr;
+ bitmap = bitmap_bdescr->start;
+
+ IF_DEBUG(gc, debugBelch("bitmap_size: %d, bitmap: %p",
+ bitmap_size, bitmap););
+
+ // don't forget to fill it with zeros!
+ memset(bitmap, 0, bitmap_size);
+
+ // For each block in this step, point to its bitmap from the
+ // block descriptor.
+ for (bd=stp->old_blocks; bd != NULL; bd = bd->link) {
+ bd->u.bitmap = bitmap;
+ bitmap += BLOCK_SIZE_W / (sizeof(W_)*BITS_PER_BYTE);
+
+ // Also at this point we set the BF_COMPACTED flag
+ // for this block. The invariant is that
+ // BF_COMPACTED is always unset, except during GC
+ // when it is set on those blocks which will be
+ // compacted.
+ bd->flags |= BF_COMPACTED;
+ }
+ }
+ }
+ }
+ }
+
+ /* make sure the older generations have at least one block to
+ * allocate into (this makes things easier for copy(), see below).
+ */
+ for (g = N+1; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ stp = &generations[g].steps[s];
+ if (stp->hp_bd == NULL) {
+ ASSERT(stp->blocks == NULL);
+ bd = gc_alloc_block(stp);
+ stp->blocks = bd;
+ stp->n_blocks = 1;
+ }
+ if (stp->scavd_hp == NULL) {
+ gc_alloc_scavd_block(stp);
+ stp->n_blocks++;
+ }
+ /* Set the scan pointer for older generations: remember we
+ * still have to scavenge objects that have been promoted. */
+ stp->scan = stp->hp;
+ stp->scan_bd = stp->hp_bd;
+ stp->new_large_objects = NULL;
+ stp->scavenged_large_objects = NULL;
+ stp->n_scavenged_large_blocks = 0;
+ }
+
+ /* Move the private mutable lists from each capability onto the
+ * main mutable list for the generation.
+ */
+ for (i = 0; i < n_capabilities; i++) {
+ for (bd = capabilities[i].mut_lists[g];
+ bd->link != NULL; bd = bd->link) {
+ /* nothing */
+ }
+ bd->link = generations[g].mut_list;
+ generations[g].mut_list = capabilities[i].mut_lists[g];
+ capabilities[i].mut_lists[g] = allocBlock();
+ }
+ }
+
+ /* Allocate a mark stack if we're doing a major collection.
+ */
+ if (major_gc) {
+ mark_stack_bdescr = allocGroup(MARK_STACK_BLOCKS);
+ mark_stack = (StgPtr *)mark_stack_bdescr->start;
+ mark_sp = mark_stack;
+ mark_splim = mark_stack + (MARK_STACK_BLOCKS * BLOCK_SIZE_W);
+ } else {
+ mark_stack_bdescr = NULL;
+ }
+
+ eager_promotion = rtsTrue; // for now
+
+ /* -----------------------------------------------------------------------
+ * follow all the roots that we know about:
+ * - mutable lists from each generation > N
+ * we want to *scavenge* these roots, not evacuate them: they're not
+ * going to move in this GC.
+ * Also: do them in reverse generation order. This is because we
+ * often want to promote objects that are pointed to by older
+ * generations early, so we don't have to repeatedly copy them.
+ * Doing the generations in reverse order ensures that we don't end
+ * up in the situation where we want to evac an object to gen 3 and
+ * it has already been evaced to gen 2.
+ */
+ {
+ int st;
+ for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
+ generations[g].saved_mut_list = generations[g].mut_list;
+ generations[g].mut_list = allocBlock();
+ // mut_list always has at least one block.
+ }
+
+ for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
+ IF_PAR_DEBUG(verbose, printMutableList(&generations[g]));
+ scavenge_mutable_list(&generations[g]);
+ evac_gen = g;
+ for (st = generations[g].n_steps-1; st >= 0; st--) {
+ scavenge(&generations[g].steps[st]);
+ }
+ }
+ }
+
+ /* follow roots from the CAF list (used by GHCi)
+ */
+ evac_gen = 0;
+ markCAFs(mark_root);
+
+ /* follow all the roots that the application knows about.
+ */
+ evac_gen = 0;
+ get_roots(mark_root);
+
+#if defined(PAR)
+ /* And don't forget to mark the TSO if we got here direct from
+ * Haskell! */
+ /* Not needed in a seq version?
+ if (CurrentTSO) {
+ CurrentTSO = (StgTSO *)MarkRoot((StgClosure *)CurrentTSO);
+ }
+ */
+
+ // Mark the entries in the GALA table of the parallel system
+ markLocalGAs(major_gc);
+ // Mark all entries on the list of pending fetches
+ markPendingFetches(major_gc);
+#endif
+
+ /* Mark the weak pointer list, and prepare to detect dead weak
+ * pointers.
+ */
+ mark_weak_ptr_list(&weak_ptr_list);
+ old_weak_ptr_list = weak_ptr_list;
+ weak_ptr_list = NULL;
+ weak_stage = WeakPtrs;
+
+ /* The all_threads list is like the weak_ptr_list.
+ * See traverse_weak_ptr_list() for the details.
+ */
+ old_all_threads = all_threads;
+ all_threads = END_TSO_QUEUE;
+ resurrected_threads = END_TSO_QUEUE;
+
+ /* Mark the stable pointer table.
+ */
+ markStablePtrTable(mark_root);
+
+ /* -------------------------------------------------------------------------
+ * Repeatedly scavenge all the areas we know about until there's no
+ * more scavenging to be done.
+ */
+ {
+ rtsBool flag;
+ loop:
+ flag = rtsFalse;
+
+ // scavenge static objects
+ if (major_gc && static_objects != END_OF_STATIC_LIST) {
+ IF_DEBUG(sanity, checkStaticObjects(static_objects));
+ scavenge_static();
+ }
+
+ /* When scavenging the older generations: Objects may have been
+ * evacuated from generations <= N into older generations, and we
+ * need to scavenge these objects. We're going to try to ensure that
+ * any evacuations that occur move the objects into at least the
+ * same generation as the object being scavenged, otherwise we
+ * have to create new entries on the mutable list for the older
+ * generation.
+ */
+
+ // scavenge each step in generations 0..maxgen
+ {
+ long gen;
+ int st;
+
+ loop2:
+ // scavenge objects in compacted generation
+ if (mark_stack_overflowed || oldgen_scan_bd != NULL ||
+ (mark_stack_bdescr != NULL && !mark_stack_empty())) {
+ scavenge_mark_stack();
+ flag = rtsTrue;
+ }
+
+ for (gen = RtsFlags.GcFlags.generations; --gen >= 0; ) {
+ for (st = generations[gen].n_steps; --st >= 0; ) {
+ if (gen == 0 && st == 0 && RtsFlags.GcFlags.generations > 1) {
+ continue;
+ }
+ stp = &generations[gen].steps[st];
+ evac_gen = gen;
+ if (stp->hp_bd != stp->scan_bd || stp->scan < stp->hp) {
+ scavenge(stp);
+ flag = rtsTrue;
+ goto loop2;
+ }
+ if (stp->new_large_objects != NULL) {
+ scavenge_large(stp);
+ flag = rtsTrue;
+ goto loop2;
+ }
+ }
+ }
+ }
+
+ if (flag) { goto loop; }
+
+ // must be last... invariant is that everything is fully
+ // scavenged at this point.
+ if (traverse_weak_ptr_list()) { // returns rtsTrue if evaced something
+ goto loop;
+ }
+ }
+
+ /* Update the pointers from the task list - these are
+ * treated as weak pointers because we want to allow a main thread
+ * to get a BlockedOnDeadMVar exception in the same way as any other
+ * thread. Note that the threads should all have been retained by
+ * GC by virtue of being on the all_threads list, we're just
+ * updating pointers here.
+ */
+ {
+ Task *task;
+ StgTSO *tso;
+ for (task = all_tasks; task != NULL; task = task->all_link) {
+ if (!task->stopped && task->tso) {
+ ASSERT(task->tso->bound == task);
+ tso = (StgTSO *) isAlive((StgClosure *)task->tso);
+ if (tso == NULL) {
+ barf("task %p: main thread %d has been GC'd",
+#ifdef THREADED_RTS
+ (void *)task->id,
+#else
+ (void *)task,
+#endif
+ task->tso->id);
+ }
+ task->tso = tso;
+ }
+ }
+ }
+
+#if defined(PAR)
+ // Reconstruct the Global Address tables used in GUM
+ rebuildGAtables(major_gc);
+ IF_DEBUG(sanity, checkLAGAtable(rtsTrue/*check closures, too*/));
+#endif
+
+ // Now see which stable names are still alive.
+ gcStablePtrTable();
+
+ // Tidy the end of the to-space chains
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ stp = &generations[g].steps[s];
+ if (!(g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1)) {
+ ASSERT(Bdescr(stp->hp) == stp->hp_bd);
+ stp->hp_bd->free = stp->hp;
+ Bdescr(stp->scavd_hp)->free = stp->scavd_hp;
+ }
+ }
+ }
+
+#ifdef PROFILING
+ // We call processHeapClosureForDead() on every closure destroyed during
+ // the current garbage collection, so we invoke LdvCensusForDead().
+ if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
+ || RtsFlags.ProfFlags.bioSelector != NULL)
+ LdvCensusForDead(N);
+#endif
+
+ // NO MORE EVACUATION AFTER THIS POINT!
+ // Finally: compaction of the oldest generation.
+ if (major_gc && oldest_gen->steps[0].is_compacted) {
+ // save number of blocks for stats
+ oldgen_saved_blocks = oldest_gen->steps[0].n_old_blocks;
+ compact(get_roots);
+ }
+
+ IF_DEBUG(sanity, checkGlobalTSOList(rtsFalse));
+
+ /* run through all the generations/steps and tidy up
+ */
+ copied = new_blocks * BLOCK_SIZE_W;
+ scavd_copied = new_scavd_blocks * BLOCK_SIZE_W;
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+
+ if (g <= N) {
+ generations[g].collections++; // for stats
+ }
+
+ // Count the mutable list as bytes "copied" for the purposes of
+ // stats. Every mutable list is copied during every GC.
+ if (g > 0) {
+ nat mut_list_size = 0;
+ for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
+ mut_list_size += bd->free - bd->start;
+ }
+ copied += mut_list_size;
+
+ IF_DEBUG(gc, debugBelch("mut_list_size: %ld (%d vars, %d arrays, %d others)\n", mut_list_size * sizeof(W_), mutlist_MUTVARS, mutlist_MUTARRS, mutlist_OTHERS));
+ }
+
+ for (s = 0; s < generations[g].n_steps; s++) {
+ bdescr *next;
+ stp = &generations[g].steps[s];
+
+ if (!(g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1)) {
+ // stats information: how much we copied
+ if (g <= N) {
+ copied -= stp->hp_bd->start + BLOCK_SIZE_W -
+ stp->hp_bd->free;
+ scavd_copied -= (P_)(BLOCK_ROUND_UP(stp->scavd_hp)) - stp->scavd_hp;
+ }
+ }
+
+ // for generations we collected...
+ if (g <= N) {
+
+ /* free old memory and shift to-space into from-space for all
+ * the collected steps (except the allocation area). These
+	   * freed blocks will probably be quickly recycled.
+ */
+ if (!(g == 0 && s == 0)) {
+ if (stp->is_compacted) {
+ // for a compacted step, just shift the new to-space
+ // onto the front of the now-compacted existing blocks.
+ for (bd = stp->blocks; bd != NULL; bd = bd->link) {
+ bd->flags &= ~BF_EVACUATED; // now from-space
+ }
+ // tack the new blocks on the end of the existing blocks
+ if (stp->old_blocks != NULL) {
+ for (bd = stp->old_blocks; bd != NULL; bd = next) {
+ // NB. this step might not be compacted next
+ // time, so reset the BF_COMPACTED flags.
+ // They are set before GC if we're going to
+ // compact. (search for BF_COMPACTED above).
+ bd->flags &= ~BF_COMPACTED;
+ next = bd->link;
+ if (next == NULL) {
+ bd->link = stp->blocks;
+ }
+ }
+ stp->blocks = stp->old_blocks;
+ }
+ // add the new blocks to the block tally
+ stp->n_blocks += stp->n_old_blocks;
+ ASSERT(countBlocks(stp->blocks) == stp->n_blocks);
+ } else {
+ freeChain(stp->old_blocks);
+ for (bd = stp->blocks; bd != NULL; bd = bd->link) {
+ bd->flags &= ~BF_EVACUATED; // now from-space
+ }
+ }
+ stp->old_blocks = NULL;
+ stp->n_old_blocks = 0;
+ }
+
+ /* LARGE OBJECTS. The current live large objects are chained on
+ * scavenged_large, having been moved during garbage
+ * collection from large_objects. Any objects left on
+ * large_objects list are therefore dead, so we free them here.
+ */
+ for (bd = stp->large_objects; bd != NULL; bd = next) {
+ next = bd->link;
+ freeGroup(bd);
+ bd = next;
+ }
+
+ // update the count of blocks used by large objects
+ for (bd = stp->scavenged_large_objects; bd != NULL; bd = bd->link) {
+ bd->flags &= ~BF_EVACUATED;
+ }
+ stp->large_objects = stp->scavenged_large_objects;
+ stp->n_large_blocks = stp->n_scavenged_large_blocks;
+
+ } else {
+ // for older generations...
+
+ /* For older generations, we need to append the
+ * scavenged_large_object list (i.e. large objects that have been
+ * promoted during this GC) to the large_object list for that step.
+ */
+ for (bd = stp->scavenged_large_objects; bd; bd = next) {
+ next = bd->link;
+ bd->flags &= ~BF_EVACUATED;
+ dbl_link_onto(bd, &stp->large_objects);
+ }
+
+ // add the new blocks we promoted during this GC
+ stp->n_large_blocks += stp->n_scavenged_large_blocks;
+ }
+ }
+ }
+
+ /* Reset the sizes of the older generations when we do a major
+ * collection.
+ *
+ * CURRENT STRATEGY: make all generations except zero the same size.
+ * We have to stay within the maximum heap size, and leave a certain
+ * percentage of the maximum heap size available to allocate into.
+ */
+ if (major_gc && RtsFlags.GcFlags.generations > 1) {
+ nat live, size, min_alloc;
+ nat max = RtsFlags.GcFlags.maxHeapSize;
+ nat gens = RtsFlags.GcFlags.generations;
+
+ // live in the oldest generations
+ live = oldest_gen->steps[0].n_blocks +
+ oldest_gen->steps[0].n_large_blocks;
+
+ // default max size for all generations except zero
+ size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
+ RtsFlags.GcFlags.minOldGenSize);
+
+ // minimum size for generation zero
+ min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
+ RtsFlags.GcFlags.minAllocAreaSize);
+
+ // Auto-enable compaction when the residency reaches a
+ // certain percentage of the maximum heap size (default: 30%).
+ if (RtsFlags.GcFlags.generations > 1 &&
+ (RtsFlags.GcFlags.compact ||
+ (max > 0 &&
+ oldest_gen->steps[0].n_blocks >
+ (RtsFlags.GcFlags.compactThreshold * max) / 100))) {
+ oldest_gen->steps[0].is_compacted = 1;
+// debugBelch("compaction: on\n", live);
+ } else {
+ oldest_gen->steps[0].is_compacted = 0;
+// debugBelch("compaction: off\n", live);
+ }
+
+ // if we're going to go over the maximum heap size, reduce the
+ // size of the generations accordingly. The calculation is
+ // different if compaction is turned on, because we don't need
+ // to double the space required to collect the old generation.
+ if (max != 0) {
+
+ // this test is necessary to ensure that the calculations
+ // below don't have any negative results - we're working
+ // with unsigned values here.
+ if (max < min_alloc) {
+ heapOverflow();
+ }
+
+ if (oldest_gen->steps[0].is_compacted) {
+ if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
+ size = (max - min_alloc) / ((gens - 1) * 2 - 1);
+ }
+ } else {
+ if ( (size * (gens - 1) * 2) + min_alloc > max ) {
+ size = (max - min_alloc) / ((gens - 1) * 2);
+ }
+ }
+
+ if (size < live) {
+ heapOverflow();
+ }
+ }
+
+#if 0
+ debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
+ min_alloc, size, max);
+#endif
+
+ for (g = 0; g < gens; g++) {
+ generations[g].max_blocks = size;
+ }
+ }
+
+ // Guess the amount of live data for stats.
+ live = calcLive();
+
+ /* Free the small objects allocated via allocate(), since this will
+ * all have been copied into G0S1 now.
+ */
+ if (small_alloc_list != NULL) {
+ freeChain(small_alloc_list);
+ }
+ small_alloc_list = NULL;
+ alloc_blocks = 0;
+ alloc_Hp = NULL;
+ alloc_HpLim = NULL;
+ alloc_blocks_lim = RtsFlags.GcFlags.minAllocAreaSize;
+
+ // Start a new pinned_object_block
+ pinned_object_block = NULL;
+
+ /* Free the mark stack.
+ */
+ if (mark_stack_bdescr != NULL) {
+ freeGroup(mark_stack_bdescr);
+ }
+
+ /* Free any bitmaps.
+ */
+ for (g = 0; g <= N; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ stp = &generations[g].steps[s];
+ if (stp->bitmap != NULL) {
+ freeGroup(stp->bitmap);
+ stp->bitmap = NULL;
+ }
+ }
+ }
+
+ /* Two-space collector:
+ * Free the old to-space, and estimate the amount of live data.
+ */
+ if (RtsFlags.GcFlags.generations == 1) {
+ nat blocks;
+
+ if (g0s0->old_blocks != NULL) {
+ freeChain(g0s0->old_blocks);
+ }
+ for (bd = g0s0->blocks; bd != NULL; bd = bd->link) {
+ bd->flags = 0; // now from-space
+ }
+ g0s0->old_blocks = g0s0->blocks;
+ g0s0->n_old_blocks = g0s0->n_blocks;
+ g0s0->blocks = saved_nursery;
+ g0s0->n_blocks = saved_n_blocks;
+
+ /* For a two-space collector, we need to resize the nursery. */
+
+ /* set up a new nursery. Allocate a nursery size based on a
+ * function of the amount of live data (by default a factor of 2)
+ * Use the blocks from the old nursery if possible, freeing up any
+ * left over blocks.
+ *
+ * If we get near the maximum heap size, then adjust our nursery
+ * size accordingly. If the nursery is the same size as the live
+ * data (L), then we need 3L bytes. We can reduce the size of the
+ * nursery to bring the required memory down near 2L bytes.
+ *
+ * A normal 2-space collector would need 4L bytes to give the same
+ * performance we get from 3L bytes, reducing to the same
+ * performance at 2L bytes.
+ */
+ blocks = g0s0->n_old_blocks;
+
+ if ( RtsFlags.GcFlags.maxHeapSize != 0 &&
+ blocks * RtsFlags.GcFlags.oldGenFactor * 2 >
+ RtsFlags.GcFlags.maxHeapSize ) {
+ long adjusted_blocks; // signed on purpose
+ int pc_free;
+
+ adjusted_blocks = (RtsFlags.GcFlags.maxHeapSize - 2 * blocks);
+ IF_DEBUG(gc, debugBelch("@@ Near maximum heap size of 0x%x blocks, blocks = %d, adjusted to %ld", RtsFlags.GcFlags.maxHeapSize, blocks, adjusted_blocks));
+ pc_free = adjusted_blocks * 100 / RtsFlags.GcFlags.maxHeapSize;
+ if (pc_free < RtsFlags.GcFlags.pcFreeHeap) /* might even be < 0 */ {
+ heapOverflow();
+ }
+ blocks = adjusted_blocks;
+
+ } else {
+ blocks *= RtsFlags.GcFlags.oldGenFactor;
+ if (blocks < RtsFlags.GcFlags.minAllocAreaSize) {
+ blocks = RtsFlags.GcFlags.minAllocAreaSize;
+ }
+ }
+ resizeNurseries(blocks);
+
+ } else {
+ /* Generational collector:
+ * If the user has given us a suggested heap size, adjust our
+ * allocation area to make best use of the memory available.
+ */
+
+ if (RtsFlags.GcFlags.heapSizeSuggestion) {
+ long blocks;
+ nat needed = calcNeeded(); // approx blocks needed at next GC
+
+ /* Guess how much will be live in generation 0 step 0 next time.
+ * A good approximation is obtained by finding the
+ * percentage of g0s0 that was live at the last minor GC.
+ */
+ if (N == 0) {
+ g0s0_pcnt_kept = (new_blocks * 100) / countNurseryBlocks();
+ }
+
+ /* Estimate a size for the allocation area based on the
+ * information available. We might end up going slightly under
+ * or over the suggested heap size, but we should be pretty
+ * close on average.
+ *
+ * Formula: suggested - needed
+ * ----------------------------
+ * 1 + g0s0_pcnt_kept/100
+ *
+ * where 'needed' is the amount of memory needed at the next
+ * collection for collecting all steps except g0s0.
+ */
+ blocks =
+ (((long)RtsFlags.GcFlags.heapSizeSuggestion - (long)needed) * 100) /
+ (100 + (long)g0s0_pcnt_kept);
+
+ if (blocks < (long)RtsFlags.GcFlags.minAllocAreaSize) {
+ blocks = RtsFlags.GcFlags.minAllocAreaSize;
+ }
+
+ resizeNurseries((nat)blocks);
+
+ } else {
+ // we might have added extra large blocks to the nursery, so
+ // resize back to minAllocAreaSize again.
+ resizeNurseriesFixed(RtsFlags.GcFlags.minAllocAreaSize);
+ }
+ }
+
+ // mark the garbage collected CAFs as dead
+#if 0 && defined(DEBUG) // doesn't work at the moment
+ if (major_gc) { gcCAFs(); }
+#endif
+
+#ifdef PROFILING
+ // resetStaticObjectForRetainerProfiling() must be called before
+ // zeroing below.
+ resetStaticObjectForRetainerProfiling();
+#endif
+
+ // zero the scavenged static object list
+ if (major_gc) {
+ zero_static_object_list(scavenged_static_objects);
+ }
+
+ // Reset the nursery
+ resetNurseries();
+
+ // start any pending finalizers
+ RELEASE_SM_LOCK;
+ scheduleFinalizers(last_free_capability, old_weak_ptr_list);
+ ACQUIRE_SM_LOCK;
+
+ // send exceptions to any threads which were about to die
+ RELEASE_SM_LOCK;
+ resurrectThreads(resurrected_threads);
+ ACQUIRE_SM_LOCK;
+
+ // Update the stable pointer hash table.
+ updateStablePtrTable(major_gc);
+
+ // check sanity after GC
+ IF_DEBUG(sanity, checkSanity());
+
+ // extra GC trace info
+ IF_DEBUG(gc, statDescribeGens());
+
+#ifdef DEBUG
+ // symbol-table based profiling
+ /* heapCensus(to_blocks); */ /* ToDo */
+#endif
+
+ // restore enclosing cost centre
+#ifdef PROFILING
+ CCCS = prev_CCS;
+#endif
+
+#ifdef DEBUG
+ // check for memory leaks if DEBUG is on
+ memInventory();
+#endif
+
+#ifdef RTS_GTK_FRONTPANEL
+ if (RtsFlags.GcFlags.frontpanel) {
+ updateFrontPanelAfterGC( N, live );
+ }
+#endif
+
+ // ok, GC over: tell the stats department what happened.
+ stat_endGC(allocated, live, copied, scavd_copied, N);
+
+#if defined(RTS_USER_SIGNALS)
+ // unblock signals again
+ unblockUserSignals();
+#endif
+
+ RELEASE_SM_LOCK;
+
+ //PAR_TICKY_TP();
+}
+
+
+/* -----------------------------------------------------------------------------
+ Weak Pointers
+
+ traverse_weak_ptr_list is called possibly many times during garbage
+ collection. It returns a flag indicating whether it did any work
+ (i.e. called evacuate on any live pointers).
+
+ Invariant: traverse_weak_ptr_list is called when the heap is in an
+ idempotent state. That means that there are no pending
+ evacuate/scavenge operations. This invariant helps the weak
+ pointer code decide which weak pointers are dead - if there are no
+ new live weak pointers, then all the currently unreachable ones are
+ dead.
+
+ For generational GC: we just don't try to finalize weak pointers in
+ older generations than the one we're collecting. This could
+ probably be optimised by keeping per-generation lists of weak
+ pointers, but for a few weak pointers this scheme will work.
+
+ There are three distinct stages to processing weak pointers:
+
+ - weak_stage == WeakPtrs
+
+    We process all the weak pointers whose keys are alive (evacuate
+ their values and finalizers), and repeat until we can find no new
+ live keys. If no live keys are found in this pass, then we
+ evacuate the finalizers of all the dead weak pointers in order to
+ run them.
+
+ - weak_stage == WeakThreads
+
+ Now, we discover which *threads* are still alive. Pointers to
+    threads from the all_threads and main thread lists are the
+    weakest of all: a pointer from the finalizer of a dead weak
+ pointer can keep a thread alive. Any threads found to be unreachable
+ are evacuated and placed on the resurrected_threads list so we
+ can send them a signal later.
+
+ - weak_stage == WeakDone
+
+ No more evacuation is done.
+
+ -------------------------------------------------------------------------- */
+
+/* One step of weak-pointer processing (see the long comment above).
+ * Driven by the global weak_stage; returns rtsTrue if anything was
+ * evacuated, in which case the caller must scavenge to a fixed point
+ * before calling back in here.  Returns rtsFalse once WeakDone.
+ */
+static rtsBool
+traverse_weak_ptr_list(void)
+{
+  StgWeak *w, **last_w, *next_w;
+  StgClosure *new;
+  rtsBool flag = rtsFalse;
+
+  switch (weak_stage) {
+
+  case WeakDone:
+      // all weak-pointer work for this GC is finished
+      return rtsFalse;
+
+  case WeakPtrs:
+      /* doesn't matter where we evacuate values/finalizers to, since
+       * these pointers are treated as roots (iff the keys are alive).
+       */
+      evac_gen = 0;
+
+      last_w = &old_weak_ptr_list;
+      for (w = old_weak_ptr_list; w != NULL; w = next_w) {
+
+          /* There might be a DEAD_WEAK on the list if finalizeWeak# was
+           * called on a live weak pointer object.  Just remove it.
+           */
+          if (w->header.info == &stg_DEAD_WEAK_info) {
+              next_w = ((StgDeadWeak *)w)->link;
+              *last_w = next_w;
+              continue;
+          }
+
+          switch (get_itbl(w)->type) {
+
+          case EVACUATED:
+              // the weak object itself was forwarded: follow the
+              // forwarding pointer and re-examine the to-space copy
+              next_w = (StgWeak *)((StgEvacuated *)w)->evacuee;
+              *last_w = next_w;
+              continue;
+
+          case WEAK:
+              /* Now, check whether the key is reachable.
+               */
+              new = isAlive(w->key);
+              if (new != NULL) {
+                  w->key = new;
+                  // evacuate the value and finalizer
+                  w->value = evacuate(w->value);
+                  w->finalizer = evacuate(w->finalizer);
+                  // remove this weak ptr from the old_weak_ptr list
+                  *last_w = w->link;
+                  // and put it on the new weak ptr list
+                  next_w  = w->link;
+                  w->link = weak_ptr_list;
+                  weak_ptr_list = w;
+                  flag = rtsTrue;
+                  IF_DEBUG(weak, debugBelch("Weak pointer still alive at %p -> %p",
+                                            w, w->key));
+                  continue;
+              }
+              else {
+                  // key not (yet) alive: keep w on old_weak_ptr_list
+                  last_w = &(w->link);
+                  next_w = w->link;
+                  continue;
+              }
+
+          default:
+              barf("traverse_weak_ptr_list: not WEAK");
+          }
+      }
+
+      /* If we didn't make any changes, then we can go round and kill all
+       * the dead weak pointers.  The old_weak_ptr list is used as a list
+       * of pending finalizers later on.
+       */
+      if (flag == rtsFalse) {
+          for (w = old_weak_ptr_list; w; w = w->link) {
+              w->finalizer = evacuate(w->finalizer);
+          }
+
+          // Next, move to the WeakThreads stage after fully
+          // scavenging the finalizers we've just evacuated.
+          weak_stage = WeakThreads;
+      }
+
+      return rtsTrue;
+
+  case WeakThreads:
+      /* Now deal with the all_threads list, which behaves somewhat like
+       * the weak ptr list.  If we discover any threads that are about to
+       * become garbage, we wake them up and administer an exception.
+       */
+      {
+          StgTSO *t, *tmp, *next, **prev;
+
+          prev = &old_all_threads;
+          for (t = old_all_threads; t != END_TSO_QUEUE; t = next) {
+
+              tmp = (StgTSO *)isAlive((StgClosure *)t);
+
+              if (tmp != NULL) {
+                  // thread already evacuated: work on the copy
+                  t = tmp;
+              }
+
+              ASSERT(get_itbl(t)->type == TSO);
+              switch (t->what_next) {
+              case ThreadRelocated:
+                  next = t->link;
+                  *prev = next;
+                  continue;
+              case ThreadKilled:
+              case ThreadComplete:
+                  // finished or died.  The thread might still be alive, but we
+                  // don't keep it on the all_threads list.  Don't forget to
+                  // stub out its global_link field.
+                  next = t->global_link;
+                  t->global_link = END_TSO_QUEUE;
+                  *prev = next;
+                  continue;
+              default:
+                  ;
+              }
+
+              // Threads blocked on black holes: if the black hole
+              // is alive, then the thread is alive too.
+              if (tmp == NULL && t->why_blocked == BlockedOnBlackHole) {
+                  if (isAlive(t->block_info.closure)) {
+                      t = (StgTSO *)evacuate((StgClosure *)t);
+                      tmp = t;
+                      flag = rtsTrue;
+                  }
+              }
+
+              if (tmp == NULL) {
+                  // not alive (yet): leave this thread on the
+                  // old_all_threads list.
+                  prev = &(t->global_link);
+                  next = t->global_link;
+              }
+              else {
+                  // alive: move this thread onto the all_threads list.
+                  next = t->global_link;
+                  t->global_link = all_threads;
+                  all_threads  = t;
+                  *prev = next;
+              }
+          }
+      }
+
+      /* If we evacuated any threads, we need to go back to the scavenger.
+       */
+      if (flag) return rtsTrue;
+
+      /* And resurrect any threads which were about to become garbage.
+       */
+      {
+          StgTSO *t, *tmp, *next;
+          for (t = old_all_threads; t != END_TSO_QUEUE; t = next) {
+              next = t->global_link;
+              tmp = (StgTSO *)evacuate((StgClosure *)t);
+              tmp->global_link = resurrected_threads;
+              resurrected_threads = tmp;
+          }
+      }
+
+      /* Finally, we can update the blackhole_queue.  This queue
+       * simply strings together TSOs blocked on black holes, it is
+       * not intended to keep anything alive.  Hence, we do not follow
+       * pointers on the blackhole_queue until now, when we have
+       * determined which TSOs are otherwise reachable.  We know at
+       * this point that all TSOs have been evacuated, however.
+       */
+      {
+          StgTSO **pt;
+          for (pt = &blackhole_queue; *pt != END_TSO_QUEUE; pt = &((*pt)->link)) {
+              *pt = (StgTSO *)isAlive((StgClosure *)*pt);
+              ASSERT(*pt != NULL);
+          }
+      }
+
+      weak_stage = WeakDone;  // *now* we're done,
+      return rtsTrue;         // but one more round of scavenging, please
+
+  default:
+      barf("traverse_weak_ptr_list");
+      return rtsTrue;
+  }
+
+}
+
+/* -----------------------------------------------------------------------------
+ After GC, the live weak pointer list may have forwarding pointers
+ on it, because a weak pointer object was evacuated after being
+ moved to the live weak pointer list. We remove those forwarding
+ pointers here.
+
+ Also, we don't consider weak pointer objects to be reachable, but
+ we must nevertheless consider them to be "live" and retain them.
+ Therefore any weak pointer objects which haven't as yet been
+ evacuated need to be evacuated now.
+ -------------------------------------------------------------------------- */
+
+
+/* Evacuate every weak pointer object on *list, rewriting each link
+ * field so that the list threads through the to-space copies.
+ */
+static void
+mark_weak_ptr_list ( StgWeak **list )
+{
+    StgWeak **prev = list;
+    StgWeak *cur = *list;
+
+    while (cur != NULL) {
+        // cur may be WEAK, EVACUATED, or DEAD_WEAK (actually CON_STATIC) here
+        ASSERT(cur->header.info == &stg_DEAD_WEAK_info
+               || get_itbl(cur)->type == WEAK
+               || get_itbl(cur)->type == EVACUATED);
+        cur = (StgWeak *)evacuate((StgClosure *)cur);
+        *prev = cur;              // splice the new address into the list
+        prev = &(cur->link);
+        cur = cur->link;          // follow the copy's link, not the original's
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ isAlive determines whether the given closure is still alive (after
+ a garbage collection) or not. It returns the new address of the
+ closure if it is alive, or NULL otherwise.
+
+ NOTE: Use it before compaction only!
+ -------------------------------------------------------------------------- */
+
+
+/* Decide whether closure p survived this collection.  Returns the
+ * (possibly relocated) address if alive, or NULL if dead.
+ * NOTE: only valid before compaction runs.
+ */
+StgClosure *
+isAlive(StgClosure *p)
+{
+    const StgInfoTable *itbl;
+    bdescr *blk;
+
+    /* Chase indirections and forwarding pointers until we can give
+     * a definite answer.
+     */
+    for (;;) {
+
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+        itbl = get_itbl(p);
+
+        // static closures are never collected here
+        //
+        // ToDo: for static closures, check the static link field.
+        // Problem here is that we sometimes don't set the link field, eg.
+        // for static closures with an empty SRT or CONSTR_STATIC_NOCAFs.
+        //
+        if (!HEAP_ALLOCED(p)) {
+            return p;
+        }
+
+        blk = Bdescr((P_)p);
+
+        // generations older than the ones being collected are untouched
+        if (blk->gen_no > N) {
+            return p;
+        }
+
+        // a pointer into to-space: definitely alive
+        if (blk->flags & BF_EVACUATED) {
+            return p;
+        }
+
+        // large objects rely solely on the BF_EVACUATED flag (unset => dead)
+        if (blk->flags & BF_LARGE) {
+            return NULL;
+        }
+
+        // in a compacted step, liveness is recorded in the mark bitmap
+        if ((blk->flags & BF_COMPACTED) && is_marked((P_)p,blk)) {
+            return p;
+        }
+
+        switch (itbl->type) {
+
+        case IND:
+        case IND_STATIC:
+        case IND_PERM:
+        case IND_OLDGEN:            // rely on compatible layout with StgInd
+        case IND_OLDGEN_PERM:
+            // follow indirections
+            p = ((StgInd *)p)->indirectee;
+            continue;
+
+        case EVACUATED:
+            // forwarded: the to-space copy is the live closure
+            return ((StgEvacuated *)p)->evacuee;
+
+        case TSO:
+            if (((StgTSO *)p)->what_next == ThreadRelocated) {
+                p = (StgClosure *)((StgTSO *)p)->link;
+                continue;
+            }
+            return NULL;
+
+        default:
+            // anything else left in from-space and unmarked is dead
+            return NULL;
+        }
+    }
+}
+
+/* Root-marking callback: evacuate the root closure and write its
+ * new location back through the root pointer.
+ */
+static void
+mark_root(StgClosure **root)
+{
+    StgClosure *c = *root;
+    *root = evacuate(c);
+}
+
+/* Overwrite the from-space closure p with a forwarding pointer: the
+ * header becomes stg_EVACUATED_info and the first payload word records
+ * the to-space address dest.
+ */
+STATIC_INLINE void
+upd_evacuee(StgClosure *p, StgClosure *dest)
+{
+    // not true: (ToDo: perhaps it should be)
+    // ASSERT(Bdescr((P_)dest)->flags & BF_EVACUATED);
+    SET_INFO(p, &stg_EVACUATED_info);
+    ((StgEvacuated *)p)->evacuee = dest;
+}
+
+
+/* Copy a closure of `size` words from src into the to-space of step
+ * stp (or of generation evac_gen if promotion is required), leave a
+ * forwarding pointer behind in src, and return the new address.
+ */
+STATIC_INLINE StgClosure *
+copy(StgClosure *src, nat size, step *stp)
+{
+  StgPtr to, from;
+  nat i;
+#ifdef PROFILING
+  // @LDV profiling
+  nat size_org = size;
+#endif
+
+  TICK_GC_WORDS_COPIED(size);
+  /* Find out where we're going, using the handy "to" pointer in
+   * the step of the source object.  If it turns out we need to
+   * evacuate to an older generation, adjust it here (see comment
+   * by evacuate()).
+   */
+  if (stp->gen_no < evac_gen) {
+      if (eager_promotion) {
+	  stp = &generations[evac_gen].steps[0];
+      } else {
+	  // couldn't promote: record the failure for the caller
+	  failed_to_evac = rtsTrue;
+      }
+  }
+
+  /* chain a new block onto the to-space for the destination step if
+   * necessary.
+   */
+  if (stp->hp + size >= stp->hpLim) {
+      gc_alloc_block(stp);
+  }
+
+  to = stp->hp;
+  from = (StgPtr)src;
+  stp->hp = to + size;
+  for (i = 0; i < size; i++) { // unroll for small i
+      to[i] = from[i];
+  }
+  upd_evacuee((StgClosure *)from,(StgClosure *)to);
+
+#ifdef PROFILING
+  // We store the size of the just evacuated object in the LDV word so that
+  // the profiler can guess the position of the next object later.
+  SET_EVACUAEE_FOR_LDV(from, size_org);
+#endif
+  return (StgClosure *)to;
+}
+
+// Same as copy() above, except the object will be allocated in memory
+// that will not be scavenged. Used for object that have no pointer
+// fields.
+// Same as copy() above, except the object will be allocated in memory
+// that will not be scavenged (the step's scavd area).  Used for objects
+// that have no pointer fields.
+STATIC_INLINE StgClosure *
+copy_noscav(StgClosure *src, nat size, step *stp)
+{
+  StgPtr to, from;
+  nat i;
+#ifdef PROFILING
+  // @LDV profiling
+  nat size_org = size;
+#endif
+
+  TICK_GC_WORDS_COPIED(size);
+  /* Find out where we're going, using the handy "to" pointer in
+   * the step of the source object.  If it turns out we need to
+   * evacuate to an older generation, adjust it here (see comment
+   * by evacuate()).
+   */
+  if (stp->gen_no < evac_gen) {
+      if (eager_promotion) {
+	  stp = &generations[evac_gen].steps[0];
+      } else {
+	  // couldn't promote: record the failure for the caller
+	  failed_to_evac = rtsTrue;
+      }
+  }
+
+  /* chain a new block onto the to-space for the destination step if
+   * necessary.
+   */
+  if (stp->scavd_hp + size >= stp->scavd_hpLim) {
+      gc_alloc_scavd_block(stp);
+  }
+
+  to = stp->scavd_hp;
+  from = (StgPtr)src;
+  stp->scavd_hp = to + size;
+  for (i = 0; i < size; i++) { // unroll for small i
+      to[i] = from[i];
+  }
+  upd_evacuee((StgClosure *)from,(StgClosure *)to);
+
+#ifdef PROFILING
+  // We store the size of the just evacuated object in the LDV word so that
+  // the profiler can guess the position of the next object later.
+  SET_EVACUAEE_FOR_LDV(from, size_org);
+#endif
+  return (StgClosure *)to;
+}
+
+/* Special version of copy() for when we only want to copy the info
+ * pointer of an object, but reserve some padding after it. This is
+ * used to optimise evacuation of BLACKHOLEs.
+ */
+
+
+/* Copy only the first size_to_copy words of src into to-space, but
+ * reserve size_to_reserve words there (the slop after the copied
+ * prefix is left uninitialised).  Used to optimise evacuation of
+ * BLACKHOLEs; see the comment above.  Leaves a forwarding pointer in
+ * src and returns the new address.
+ */
+static StgClosure *
+copyPart(StgClosure *src, nat size_to_reserve, nat size_to_copy, step *stp)
+{
+    P_ dest, to, from;
+#ifdef PROFILING
+    // @LDV profiling
+    nat size_to_copy_org = size_to_copy;
+#endif
+
+    TICK_GC_WORDS_COPIED(size_to_copy);
+    if (stp->gen_no < evac_gen) {
+	if (eager_promotion) {
+	    stp = &generations[evac_gen].steps[0];
+	} else {
+	    // couldn't promote: record the failure for the caller
+	    failed_to_evac = rtsTrue;
+	}
+    }
+
+    if (stp->hp + size_to_reserve >= stp->hpLim) {
+	gc_alloc_block(stp);
+    }
+
+    for(to = stp->hp, from = (P_)src; size_to_copy>0; --size_to_copy) {
+	*to++ = *from++;
+    }
+
+    dest = stp->hp;
+    stp->hp += size_to_reserve;
+    upd_evacuee(src,(StgClosure *)dest);
+#ifdef PROFILING
+    // We store the size of the just evacuated object in the LDV word so that
+    // the profiler can guess the position of the next object later.
+    // size_to_copy_org is wrong because the closure already occupies size_to_reserve
+    // words.
+    SET_EVACUAEE_FOR_LDV(src, size_to_reserve);
+    // fill the slop
+    if (size_to_reserve - size_to_copy_org > 0)
+	LDV_FILL_SLOP(stp->hp - 1, (int)(size_to_reserve - size_to_copy_org));
+#endif
+    return (StgClosure *)dest;
+}
+
+
+/* -----------------------------------------------------------------------------
+ Evacuate a large object
+
+ This just consists of removing the object from the (doubly-linked)
+ step->large_objects list, and linking it on to the (singly-linked)
+ step->new_large_objects list, from where it will be scavenged later.
+
+ Convention: bd->flags has BF_EVACUATED set for a large object
+ that has been evacuated, or unset otherwise.
+ -------------------------------------------------------------------------- */
+
+
+/* Evacuate the large object starting at p: unlink its block group
+ * from the (doubly-linked) step->large_objects list and push it onto
+ * the (singly-linked) new_large_objects list of the destination step,
+ * setting BF_EVACUATED.  See the comment above.
+ */
+STATIC_INLINE void
+evacuate_large(StgPtr p)
+{
+  bdescr *bd = Bdescr(p);
+  step *stp;
+
+  // object must be at the beginning of the block (or be a ByteArray)
+  ASSERT(get_itbl((StgClosure *)p)->type == ARR_WORDS ||
+	 (((W_)p & BLOCK_MASK) == 0));
+
+  // already evacuated?
+  if (bd->flags & BF_EVACUATED) {
+    /* Don't forget to set the failed_to_evac flag if we didn't get
+     * the desired destination (see comments in evacuate()).
+     */
+    if (bd->gen_no < evac_gen) {
+      failed_to_evac = rtsTrue;
+      TICK_GC_FAILED_PROMOTION();
+    }
+    return;
+  }
+
+  stp = bd->step;
+  // remove from large_object list
+  if (bd->u.back) {
+    bd->u.back->link = bd->link;
+  } else { // first object in the list
+    stp->large_objects = bd->link;
+  }
+  if (bd->link) {
+    bd->link->u.back = bd->u.back;
+  }
+
+  /* link it on to the evacuated large object list of the destination step
+   */
+  stp = bd->step->to;
+  if (stp->gen_no < evac_gen) {
+      if (eager_promotion) {
+	  stp = &generations[evac_gen].steps[0];
+      } else {
+	  // couldn't promote: record the failure for the caller
+	  failed_to_evac = rtsTrue;
+      }
+  }
+
+  bd->step = stp;
+  bd->gen_no = stp->gen_no;
+  bd->link = stp->new_large_objects;
+  stp->new_large_objects = bd;
+  bd->flags |= BF_EVACUATED;
+}
+
+/* -----------------------------------------------------------------------------
+ Evacuate
+
+ This is called (eventually) for every live object in the system.
+
+ The caller to evacuate specifies a desired generation in the
+ evac_gen global variable. The following conditions apply to
+ evacuating an object which resides in generation M when we're
+ collecting up to generation N
+
+ if M >= evac_gen
+ if M > N do nothing
+ else evac to step->to
+
+ if M < evac_gen evac to evac_gen, step 0
+
+ if the object is already evacuated, then we check which generation
+ it now resides in.
+
+ if M >= evac_gen do nothing
+ if M < evac_gen set failed_to_evac flag to indicate that we
+ didn't manage to evacuate this object into evac_gen.
+
+
+ OPTIMISATION NOTES:
+
+ evacuate() is the single most important function performance-wise
+ in the GC. Various things have been tried to speed it up, but as
+ far as I can tell the code generated by gcc 3.2 with -O2 is about
+ as good as it's going to get. We pass the argument to evacuate()
+ in a register using the 'regparm' attribute (see the prototype for
+ evacuate() near the top of this file).
+
+ Changing evacuate() to take an (StgClosure **) rather than
+ returning the new pointer seems attractive, because we can avoid
+ writing back the pointer when it hasn't changed (eg. for a static
+ object, or an object in a generation > N). However, I tried it and
+ it doesn't help. One reason is that the (StgClosure **) pointer
+ gets spilled to the stack inside evacuate(), resulting in far more
+ extra reads/writes than we save.
+ -------------------------------------------------------------------------- */
+
+/* Evacuate the closure q, returning its new address.  q is passed in a
+ * register (REGPARM1) -- see the OPTIMISATION NOTES above.  Static
+ * closures are linked onto static_objects (major GC only); large and
+ * compacted objects are handled by re-linking/marking rather than
+ * copying; everything else is copied into bd->step->to.  Sets
+ * failed_to_evac if q could not be promoted as far as evac_gen.
+ */
+REGPARM1 static StgClosure *
+evacuate(StgClosure *q)
+{
+#if defined(PAR)
+  StgClosure *to;
+#endif
+  bdescr *bd = NULL;
+  step *stp;
+  const StgInfoTable *info;
+
+loop:
+  ASSERT(LOOKS_LIKE_CLOSURE_PTR(q));
+
+  if (!HEAP_ALLOCED(q)) {
+
+      // Static objects need attention only during a major GC, and only
+      // the first time we see them: link them onto static_objects so
+      // they get scavenged later.
+      if (!major_gc) return q;
+
+      info = get_itbl(q);
+      switch (info->type) {
+
+      case THUNK_STATIC:
+	  if (info->srt_bitmap != 0 &&
+	      *THUNK_STATIC_LINK((StgClosure *)q) == NULL) {
+	      *THUNK_STATIC_LINK((StgClosure *)q) = static_objects;
+	      static_objects = (StgClosure *)q;
+	  }
+	  return q;
+
+      case FUN_STATIC:
+	  if (info->srt_bitmap != 0 &&
+	      *FUN_STATIC_LINK((StgClosure *)q) == NULL) {
+	      *FUN_STATIC_LINK((StgClosure *)q) = static_objects;
+	      static_objects = (StgClosure *)q;
+	  }
+	  return q;
+
+      case IND_STATIC:
+	  /* If q->saved_info != NULL, then it's a revertible CAF - it'll be
+	   * on the CAF list, so don't do anything with it here (we'll
+	   * scavenge it later).
+	   */
+	  if (((StgIndStatic *)q)->saved_info == NULL
+	      && *IND_STATIC_LINK((StgClosure *)q) == NULL) {
+	      *IND_STATIC_LINK((StgClosure *)q) = static_objects;
+	      static_objects = (StgClosure *)q;
+	  }
+	  return q;
+
+      case CONSTR_STATIC:
+	  if (*STATIC_LINK(info,(StgClosure *)q) == NULL) {
+	      *STATIC_LINK(info,(StgClosure *)q) = static_objects;
+	      static_objects = (StgClosure *)q;
+	  }
+	  return q;
+
+      case CONSTR_INTLIKE:
+      case CONSTR_CHARLIKE:
+      case CONSTR_NOCAF_STATIC:
+	  /* no need to put these on the static linked list, they don't need
+	   * to be scavenged.
+	   */
+	  return q;
+
+      default:
+	  barf("evacuate(static): strange closure type %d", (int)(info->type));
+      }
+  }
+
+  bd = Bdescr((P_)q);
+
+  if (bd->gen_no > N) {
+      /* Can't evacuate this object, because it's in a generation
+       * older than the ones we're collecting.  Let's hope that it's
+       * in evac_gen or older, or we will have to arrange to track
+       * this pointer using the mutable list.
+       */
+      if (bd->gen_no < evac_gen) {
+	  // nope
+	  failed_to_evac = rtsTrue;
+	  TICK_GC_FAILED_PROMOTION();
+      }
+      return q;
+  }
+
+  if ((bd->flags & (BF_LARGE | BF_COMPACTED | BF_EVACUATED)) != 0) {
+
+      /* pointer into to-space: just return it.  This normally
+       * shouldn't happen, but allowing it makes certain things
+       * slightly easier (eg. the mutable list can contain the same
+       * object twice, for example).
+       */
+      if (bd->flags & BF_EVACUATED) {
+	  if (bd->gen_no < evac_gen) {
+	      failed_to_evac = rtsTrue;
+	      TICK_GC_FAILED_PROMOTION();
+	  }
+	  return q;
+      }
+
+      /* evacuate large objects by re-linking them onto a different list.
+       */
+      if (bd->flags & BF_LARGE) {
+	  info = get_itbl(q);
+	  if (info->type == TSO &&
+	      ((StgTSO *)q)->what_next == ThreadRelocated) {
+	      q = (StgClosure *)((StgTSO *)q)->link;
+	      goto loop;
+	  }
+	  evacuate_large((P_)q);
+	  return q;
+      }
+
+      /* If the object is in a step that we're compacting, then we
+       * need to use an alternative evacuate procedure.
+       */
+      if (bd->flags & BF_COMPACTED) {
+	  if (!is_marked((P_)q,bd)) {
+	      mark((P_)q,bd);
+	      if (mark_stack_full()) {
+		  mark_stack_overflowed = rtsTrue;
+		  reset_mark_stack();
+	      }
+	      push_mark_stack((P_)q);
+	  }
+	  return q;
+      }
+  }
+
+  stp = bd->step->to;
+
+  info = get_itbl(q);
+
+  switch (info->type) {
+
+  case MUT_VAR_CLEAN:
+  case MUT_VAR_DIRTY:
+  case MVAR:
+      return copy(q,sizeW_fromITBL(info),stp);
+
+  case CONSTR_0_1:
+  {
+      StgWord w = (StgWord)q->payload[0];
+      if (q->header.info == Czh_con_info &&
+	  // unsigned, so always true:  (StgChar)w >= MIN_CHARLIKE &&
+	  (StgChar)w <= MAX_CHARLIKE) {
+	  return (StgClosure *)CHARLIKE_CLOSURE((StgChar)w);
+      }
+      if (q->header.info == Izh_con_info &&
+	  (StgInt)w >= MIN_INTLIKE && (StgInt)w <= MAX_INTLIKE) {
+	  return (StgClosure *)INTLIKE_CLOSURE((StgInt)w);
+      }
+      // else
+      return copy_noscav(q,sizeofW(StgHeader)+1,stp);
+  }
+
+  case FUN_0_1:
+  case FUN_1_0:
+  case CONSTR_1_0:
+      return copy(q,sizeofW(StgHeader)+1,stp);
+
+  case THUNK_1_0:
+  case THUNK_0_1:
+      return copy(q,sizeofW(StgThunk)+1,stp);
+
+  case THUNK_1_1:
+  case THUNK_2_0:
+  case THUNK_0_2:
+#ifdef NO_PROMOTE_THUNKS
+      if (bd->gen_no == 0 &&
+	  bd->step->no != 0 &&
+	  bd->step->no == generations[bd->gen_no].n_steps-1) {
+	  stp = bd->step;
+      }
+#endif
+      return copy(q,sizeofW(StgThunk)+2,stp);
+
+  case FUN_1_1:
+  case FUN_2_0:
+  case CONSTR_1_1:
+  case CONSTR_2_0:
+  case FUN_0_2:
+      return copy(q,sizeofW(StgHeader)+2,stp);
+
+  case CONSTR_0_2:
+      return copy_noscav(q,sizeofW(StgHeader)+2,stp);
+
+  case THUNK:
+      return copy(q,thunk_sizeW_fromITBL(info),stp);
+
+  case FUN:
+  case CONSTR:
+  case IND_PERM:
+  case IND_OLDGEN_PERM:
+  case WEAK:
+  case STABLE_NAME:
+      return copy(q,sizeW_fromITBL(info),stp);
+
+  case BCO:
+      return copy(q,bco_sizeW((StgBCO *)q),stp);
+
+  case CAF_BLACKHOLE:
+  case SE_CAF_BLACKHOLE:
+  case SE_BLACKHOLE:
+  case BLACKHOLE:
+      return copyPart(q,BLACKHOLE_sizeW(),sizeofW(StgHeader),stp);
+
+  case THUNK_SELECTOR:
+    {
+	StgClosure *p;
+	const StgInfoTable *info_ptr;
+
+	if (thunk_selector_depth > MAX_THUNK_SELECTOR_DEPTH) {
+	    return copy(q,THUNK_SELECTOR_sizeW(),stp);
+	}
+
+	// stashed away for LDV profiling, see below
+	info_ptr = q->header.info;
+
+	p = eval_thunk_selector(info->layout.selector_offset,
+				(StgSelector *)q);
+
+	if (p == NULL) {
+	    return copy(q,THUNK_SELECTOR_sizeW(),stp);
+	} else {
+	    StgClosure *val;
+	    // q is still BLACKHOLE'd.
+	    thunk_selector_depth++;
+	    val = evacuate(p);
+	    thunk_selector_depth--;
+
+#ifdef PROFILING
+	    // For the purposes of LDV profiling, we have destroyed
+	    // the original selector thunk.
+	    SET_INFO(q, info_ptr);
+	    LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(q);
+#endif
+
+	    // Update the THUNK_SELECTOR with an indirection to the
+	    // EVACUATED closure now at p.  Why do this rather than
+	    // upd_evacuee(q,p)?  Because we have an invariant that an
+	    // EVACUATED closure always points to an object in the
+	    // same or an older generation (required by the short-cut
+	    // test in the EVACUATED case, below).
+	    SET_INFO(q, &stg_IND_info);
+	    ((StgInd *)q)->indirectee = p;
+
+	    // For the purposes of LDV profiling, we have created an
+	    // indirection.
+	    LDV_RECORD_CREATE(q);
+
+	    return val;
+	}
+    }
+
+  case IND:
+  case IND_OLDGEN:
+    // follow chains of indirections, don't evacuate them
+    q = ((StgInd*)q)->indirectee;
+    goto loop;
+
+  case RET_BCO:
+  case RET_SMALL:
+  case RET_VEC_SMALL:
+  case RET_BIG:
+  case RET_VEC_BIG:
+  case RET_DYN:
+  case UPDATE_FRAME:
+  case STOP_FRAME:
+  case CATCH_FRAME:
+  case CATCH_STM_FRAME:
+  case CATCH_RETRY_FRAME:
+  case ATOMICALLY_FRAME:
+    // shouldn't see these
+    barf("evacuate: stack frame at %p\n", q);
+
+  case PAP:
+      return copy(q,pap_sizeW((StgPAP*)q),stp);
+
+  case AP:
+      return copy(q,ap_sizeW((StgAP*)q),stp);
+
+  case AP_STACK:
+      return copy(q,ap_stack_sizeW((StgAP_STACK*)q),stp);
+
+  case EVACUATED:
+    /* Already evacuated, just return the forwarding address.
+     * HOWEVER: if the requested destination generation (evac_gen) is
+     * older than the actual generation (because the object was
+     * already evacuated to a younger generation) then we have to
+     * set the failed_to_evac flag to indicate that we couldn't
+     * manage to promote the object to the desired generation.
+     */
+    /*
+     * Optimisation: the check is fairly expensive, but we can often
+     * shortcut it if either the required generation is 0, or the
+     * current object (the EVACUATED) is in a high enough generation.
+     * We know that an EVACUATED always points to an object in the
+     * same or an older generation.  stp is the lowest step that the
+     * current object would be evacuated to, so we only do the full
+     * check if stp is too low.
+     */
+    if (evac_gen > 0 && stp->gen_no < evac_gen) {  // optimisation
+      StgClosure *p = ((StgEvacuated*)q)->evacuee;
+      if (HEAP_ALLOCED(p) && Bdescr((P_)p)->gen_no < evac_gen) {
+	failed_to_evac = rtsTrue;
+	TICK_GC_FAILED_PROMOTION();
+      }
+    }
+    return ((StgEvacuated*)q)->evacuee;
+
+  case ARR_WORDS:
+      // just copy the block
+      return copy_noscav(q,arr_words_sizeW((StgArrWords *)q),stp);
+
+  case MUT_ARR_PTRS_CLEAN:
+  case MUT_ARR_PTRS_DIRTY:
+  case MUT_ARR_PTRS_FROZEN:
+  case MUT_ARR_PTRS_FROZEN0:
+      // just copy the block
+      return copy(q,mut_arr_ptrs_sizeW((StgMutArrPtrs *)q),stp);
+
+  case TSO:
+    {
+      StgTSO *tso = (StgTSO *)q;
+
+      /* Deal with redirected TSOs (a TSO that's had its stack enlarged).
+       */
+      if (tso->what_next == ThreadRelocated) {
+	q = (StgClosure *)tso->link;
+	goto loop;
+      }
+
+      /* To evacuate a small TSO, we need to relocate the update frame
+       * list it contains.
+       */
+      {
+	  StgTSO *new_tso;
+	  StgPtr p, q;
+
+	  new_tso = (StgTSO *)copyPart((StgClosure *)tso,
+				       tso_sizeW(tso),
+				       sizeofW(StgTSO), stp);
+	  move_TSO(tso, new_tso);
+	  for (p = tso->sp, q = new_tso->sp;
+	       p < tso->stack+tso->stack_size;) {
+	      *q++ = *p++;
+	  }
+
+	  return (StgClosure *)new_tso;
+      }
+    }
+
+#if defined(PAR)
+  case RBH:
+    {
+      //StgInfoTable *rip = get_closure_info(q, &size, &ptrs, &nonptrs, &vhs, str);
+      to = copy(q,BLACKHOLE_sizeW(),stp);
+      //ToDo: derive size etc from reverted IP
+      //to = copy(q,size,stp);
+      IF_DEBUG(gc,
+	       debugBelch("@@ evacuate: RBH %p (%s) to %p (%s)",
+		     q, info_type(q), to, info_type(to)));
+      return to;
+    }
+
+  case BLOCKED_FETCH:
+    // (typo fix: this previously read MIN_PAYLOD_SIZE, which does not
+    // compile when PAR is defined; the other asserts below spell it
+    // MIN_PAYLOAD_SIZE)
+    ASSERT(sizeofW(StgBlockedFetch) >= MIN_PAYLOAD_SIZE);
+    to = copy(q,sizeofW(StgBlockedFetch),stp);
+    IF_DEBUG(gc,
+	     debugBelch("@@ evacuate: %p (%s) to %p (%s)",
+		   q, info_type(q), to, info_type(to)));
+    return to;
+
+# ifdef DIST
+  case REMOTE_REF:
+# endif
+  case FETCH_ME:
+    // NOTE(review): the assert checks StgBlockedFetch but we copy
+    // sizeofW(StgFetchMe) -- confirm the assert shouldn't reference
+    // StgFetchMe instead.
+    ASSERT(sizeofW(StgBlockedFetch) >= MIN_PAYLOAD_SIZE);
+    to = copy(q,sizeofW(StgFetchMe),stp);
+    IF_DEBUG(gc,
+	     debugBelch("@@ evacuate: %p (%s) to %p (%s)",
+		   q, info_type(q), to, info_type(to)));
+    return to;
+
+  case FETCH_ME_BQ:
+    // NOTE(review): same mismatch as FETCH_ME above -- the assert
+    // mentions StgBlockedFetch but we copy StgFetchMeBlockingQueue.
+    ASSERT(sizeofW(StgBlockedFetch) >= MIN_PAYLOAD_SIZE);
+    to = copy(q,sizeofW(StgFetchMeBlockingQueue),stp);
+    IF_DEBUG(gc,
+	     debugBelch("@@ evacuate: %p (%s) to %p (%s)",
+		   q, info_type(q), to, info_type(to)));
+    return to;
+#endif
+
+  case TREC_HEADER:
+    return copy(q,sizeofW(StgTRecHeader),stp);
+
+  case TVAR_WAIT_QUEUE:
+    return copy(q,sizeofW(StgTVarWaitQueue),stp);
+
+  case TVAR:
+    return copy(q,sizeofW(StgTVar),stp);
+
+  case TREC_CHUNK:
+    return copy(q,sizeofW(StgTRecChunk),stp);
+
+  default:
+    barf("evacuate: strange closure type %d", (int)(info->type));
+  }
+
+  barf("evacuate");
+}
+
+/* -----------------------------------------------------------------------------
+ Evaluate a THUNK_SELECTOR if possible.
+
+ returns: NULL if we couldn't evaluate this THUNK_SELECTOR, or
+ a closure pointer if we evaluated it and this is the result. Note
+ that "evaluating" the THUNK_SELECTOR doesn't necessarily mean
+ reducing it to HNF, just that we have eliminated the selection.
+ The result might be another thunk, or even another THUNK_SELECTOR.
+
+ If the return value is non-NULL, the original selector thunk has
+ been BLACKHOLE'd, and should be updated with an indirection or a
+ forwarding pointer. If the return value is NULL, then the selector
+ thunk is unchanged.
+
+ ***
+ ToDo: the treatment of THUNK_SELECTORS could be improved in the
+ following way (from a suggestion by Ian Lynagh):
+
+ We can have a chain like this:
+
+ sel_0 --> (a,b)
+ |
+ |-----> sel_0 --> (a,b)
+ |
+ |-----> sel_0 --> ...
+
+ and the depth limit means we don't go all the way to the end of the
+ chain, which results in a space leak. This affects the recursive
+ call to evacuate() in the THUNK_SELECTOR case in evacuate(): *not*
+ the recursive call to eval_thunk_selector() in
+ eval_thunk_selector().
+
+ We could eliminate the depth bound in this case, in the following
+ way:
+
+ - traverse the chain once to discover the *value* of the
+ THUNK_SELECTOR. Mark all THUNK_SELECTORS that we
+ visit on the way as having been visited already (somehow).
+
+ - in a second pass, traverse the chain again updating all
+ THUNK_SELECTORS that we find on the way with indirections to
+ the value.
+
+ - if we encounter a "marked" THUNK_SELECTOR in a normal
+ evacuate(), we know it can't be updated so just evac it.
+
+ Program that illustrates the problem:
+
+ foo [] = ([], [])
+ foo (x:xs) = let (ys, zs) = foo xs
+ in if x >= 0 then (x:ys, zs) else (ys, x:zs)
+
+ main = bar [1..(100000000::Int)]
+ bar xs = (\(ys, zs) -> print ys >> print zs) (foo xs)
+
+ -------------------------------------------------------------------------- */
+
+/* Does p point into to-space?  True iff p is heap-allocated and its
+ * block is either already evacuated, or lives in a compacted step and
+ * has been marked.  Re-used by eval_thunk_selector() below.
+ *
+ * Improvement: Bdescr() is now computed only after the HEAP_ALLOCED()
+ * check -- for static closures the block descriptor is meaningless
+ * (the original computed it unconditionally, though it never
+ * dereferenced it in that case).
+ */
+static inline rtsBool
+is_to_space ( StgClosure *p )
+{
+    bdescr *bd;
+
+    if (!HEAP_ALLOCED(p)) {
+	return rtsFalse;
+    }
+    bd = Bdescr((StgPtr)p);
+    if ((bd->flags & BF_EVACUATED)
+	|| ((bd->flags & BF_COMPACTED) &&
+	    is_marked((P_)p,bd))) {
+	return rtsTrue;
+    } else {
+	return rtsFalse;
+    }
+}
+
+/* Try to eliminate the selection performed by the THUNK_SELECTOR p,
+ * i.e. fetch field 'field' of p's selectee.  Returns the selected
+ * closure, or NULL if the selection could not be performed; see the
+ * contract in the long comment above.  On a non-NULL return, p has
+ * been left BLACKHOLE'd and the caller must update it.
+ */
+static StgClosure *
+eval_thunk_selector( nat field, StgSelector * p )
+{
+    StgInfoTable *info;
+    const StgInfoTable *info_ptr;
+    StgClosure *selectee;
+
+    selectee = p->selectee;
+
+    // Save the real info pointer (NOTE: not the same as get_itbl()).
+    // Needed to restore p if we bale out, and for LDV profiling.
+    info_ptr = p->header.info;
+
+    // If the THUNK_SELECTOR is in a generation that we are not
+    // collecting, then bail out early. We won't be able to save any
+    // space in any case, and updating with an indirection is trickier
+    // in an old gen.
+    if (Bdescr((StgPtr)p)->gen_no > N) {
+	return NULL;
+    }
+
+    // BLACKHOLE the selector thunk, since it is now under evaluation.
+    // This is important to stop us going into an infinite loop if
+    // this selector thunk eventually refers to itself.
+    SET_INFO(p,&stg_BLACKHOLE_info);
+
+selector_loop:
+
+    // We don't want to end up in to-space, because this causes
+    // problems when the GC later tries to evacuate the result of
+    // eval_thunk_selector().  There are various ways this could
+    // happen:
+    //
+    // 1. following an IND_STATIC
+    //
+    // 2. when the old generation is compacted, the mark phase updates
+    //    from-space pointers to be to-space pointers, and we can't
+    //    reliably tell which we're following (eg. from an IND_STATIC).
+    //
+    // 3. compacting GC again: if we're looking at a constructor in
+    //    the compacted generation, it might point directly to objects
+    //    in to-space.  We must bale out here, otherwise doing the selection
+    //    will result in a to-space pointer being returned.
+    //
+    //  (1) is dealt with using a BF_EVACUATED test on the
+    //  selectee. (2) and (3): we can tell if we're looking at an
+    //  object in the compacted generation that might point to
+    //  to-space objects by testing that (a) it is BF_COMPACTED, (b)
+    //  the compacted generation is being collected, and (c) the
+    //  object is marked.  Only a marked object may have pointers that
+    //  point to to-space objects, because that happens when
+    //  scavenging.
+    //
+    //  The to-space test is now embodied in the in_to_space() inline
+    //  function, as it is re-used below.
+    //
+    if (is_to_space(selectee)) {
+	goto bale_out;
+    }
+
+    info = get_itbl(selectee);
+    switch (info->type) {
+      case CONSTR:
+      case CONSTR_1_0:
+      case CONSTR_0_1:
+      case CONSTR_2_0:
+      case CONSTR_1_1:
+      case CONSTR_0_2:
+      case CONSTR_STATIC:
+      case CONSTR_NOCAF_STATIC:
+	  // check that the size is in range
+	  ASSERT(field <  (StgWord32)(info->layout.payload.ptrs +
+				      info->layout.payload.nptrs));
+
+	  // Select the right field from the constructor, and check
+	  // that the result isn't in to-space.  It might be in
+	  // to-space if, for example, this constructor contains
+	  // pointers to younger-gen objects (and is on the mut-once
+	  // list).
+	  //
+	  {
+	      StgClosure *q;
+	      q = selectee->payload[field];
+	      if (is_to_space(q)) {
+		  goto bale_out;
+	      } else {
+		  return q;
+	      }
+	  }
+
+      case IND:
+      case IND_PERM:
+      case IND_OLDGEN:
+      case IND_OLDGEN_PERM:
+      case IND_STATIC:
+	  // follow the indirection chain to the real selectee
+	  selectee = ((StgInd *)selectee)->indirectee;
+	  goto selector_loop;
+
+      case EVACUATED:
+	  // We don't follow pointers into to-space; the constructor
+	  // has already been evacuated, so we won't save any space
+	  // leaks by evaluating this selector thunk anyhow.
+	  break;
+
+      case THUNK_SELECTOR:
+      {
+	  StgClosure *val;
+
+	  // check that we don't recurse too much, re-using the
+	  // depth bound also used in evacuate().
+	  if (thunk_selector_depth >= MAX_THUNK_SELECTOR_DEPTH) {
+	      break;
+	  }
+	  thunk_selector_depth++;
+
+	  val = eval_thunk_selector(info->layout.selector_offset,
+				    (StgSelector *)selectee);
+
+	  thunk_selector_depth--;
+
+	  if (val == NULL) {
+	      break;
+	  } else {
+	      // We evaluated this selector thunk, so update it with
+	      // an indirection.  NOTE: we don't use UPD_IND here,
+	      // because we are guaranteed that p is in a generation
+	      // that we are collecting, and we never want to put the
+	      // indirection on a mutable list.
+#ifdef PROFILING
+	      // For the purposes of LDV profiling, we have destroyed
+	      // the original selector thunk.
+	      // NOTE(review): this restores p's saved info while the
+	      // LDV call operates on selectee -- looks asymmetric;
+	      // confirm whether SET_INFO should target selectee here.
+	      SET_INFO(p, info_ptr);
+	      LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(selectee);
+#endif
+	      ((StgInd *)selectee)->indirectee = val;
+	      SET_INFO(selectee,&stg_IND_info);
+
+	      // For the purposes of LDV profiling, we have created an
+	      // indirection.
+	      LDV_RECORD_CREATE(selectee);
+
+	      // continue selecting from the value we just obtained
+	      selectee = val;
+	      goto selector_loop;
+	  }
+      }
+
+      case AP:
+      case AP_STACK:
+      case THUNK:
+      case THUNK_1_0:
+      case THUNK_0_1:
+      case THUNK_2_0:
+      case THUNK_1_1:
+      case THUNK_0_2:
+      case THUNK_STATIC:
+      case CAF_BLACKHOLE:
+      case SE_CAF_BLACKHOLE:
+      case SE_BLACKHOLE:
+      case BLACKHOLE:
+#if defined(PAR)
+      case RBH:
+      case BLOCKED_FETCH:
+# ifdef DIST
+      case REMOTE_REF:
+# endif
+      case FETCH_ME:
+      case FETCH_ME_BQ:
+#endif
+	  // not evaluated yet
+	  break;
+
+      default:
+	barf("eval_thunk_selector: strange selectee %d",
+	     (int)(info->type));
+    }
+
+bale_out:
+    // We didn't manage to evaluate this thunk; restore the old info pointer
+    SET_INFO(p, info_ptr);
+    return NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ move_TSO is called to update the TSO structure after it has been
+ moved from one place to another.
+ -------------------------------------------------------------------------- */
+
+void
+move_TSO (StgTSO *src, StgTSO *dest)
+{
+    ptrdiff_t offset;
+
+    // The TSO's sp points into its own stack, so after the structure
+    // has been copied we must shift sp by the distance the TSO moved,
+    // measured in words.
+    offset = (StgPtr)dest - (StgPtr)src;
+    dest->sp = (StgPtr)dest->sp + offset;
+}
+
+/* Similar to scavenge_large_bitmap(), but we don't write back the
+ * pointers we get back from evacuate().
+ */
+/* Similar to scavenge_large_bitmap(), but we don't write back the
+ * pointers we get back from evacuate().
+ *
+ * Walks all 'size' entries of the large SRT, evacuating every entry
+ * whose bit is set.  The bitmap is stored as an array of words; a
+ * fresh word is loaded every BITS_IN(W_) entries, otherwise the
+ * current word is shifted right one bit per entry.
+ */
+static void
+scavenge_large_srt_bitmap( StgLargeSRT *large_srt )
+{
+    nat i, b, size;
+    StgWord bitmap;
+    StgClosure **p;
+
+    b = 0;
+    bitmap = large_srt->l.bitmap[b];
+    size   = (nat)large_srt->l.size;
+    p      = (StgClosure **)large_srt->srt;
+    for (i = 0; i < size; ) {
+	// bit set => this SRT entry is a live reference to evacuate
+	if ((bitmap & 1) != 0) {
+	    evacuate(*p);
+	}
+	i++;
+	p++;
+	// move to the next bitmap word exactly when the current one
+	// is exhausted (i has just crossed a word boundary)
+	if (i % BITS_IN(W_) == 0) {
+	    b++;
+	    bitmap = large_srt->l.bitmap[b];
+	} else {
+	    bitmap = bitmap >> 1;
+	}
+    }
+}
+
+/* evacuate the SRT. If srt_bitmap is zero, then there isn't an
+ * srt field in the info table. That's ok, because we'll
+ * never dereference it.
+ */
+/* Evacuate the SRT (static reference table).  If srt_bitmap is zero,
+ * then there isn't an srt field in the info table.  That's ok, because
+ * we'll never dereference it.  An all-ones bitmap means the SRT is
+ * large and carries its own large-bitmap layout.
+ */
+STATIC_INLINE void
+scavenge_srt (StgClosure **srt, nat srt_bitmap)
+{
+  nat bitmap;
+  StgClosure **p;
+
+  bitmap = srt_bitmap;
+  p = srt;
+
+  if (bitmap == (StgHalfWord)(-1)) {
+      scavenge_large_srt_bitmap( (StgLargeSRT *)srt );
+      return;
+  }
+
+  while (bitmap != 0) {
+      if ((bitmap & 1) != 0) {
+#ifdef ENABLE_WIN32_DLL_SUPPORT
+	  // Special-case to handle references to closures hiding out in DLLs, since
+	  // double indirections required to get at those. The code generator knows
+	  // which is which when generating the SRT, so it stores the (indirect)
+	  // reference to the DLL closure in the table by first adding one to it.
+	  // We check for this here, and undo the addition before evacuating it.
+	  //
+	  // If the SRT entry hasn't got bit 0 set, the SRT entry points to a
+	  // closure that's fixed at link-time, and no extra magic is required.
+	  //
+	  // BUG FIX: test and dereference the *current* entry (*p), not
+	  // the first entry (*srt) -- srt is never advanced in this loop,
+	  // so the old code examined entry 0 on every iteration.
+	  if ( (unsigned long)(*p) & 0x1 ) {
+	      evacuate(*stgCast(StgClosure**,(stgCast(unsigned long, *p) & ~0x1)));
+	  } else {
+	      evacuate(*p);
+	  }
+#else
+	  evacuate(*p);
+#endif
+      }
+      p++;
+      bitmap = bitmap >> 1;
+  }
+}
+
+
+/* Evacuate the SRT attached to a thunk's info table.  SRTs only need
+ * to be traversed during a major collection.
+ */
+STATIC_INLINE void
+scavenge_thunk_srt(const StgInfoTable *info)
+{
+    if (major_gc) {
+	StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(info);
+	scavenge_srt((StgClosure **)GET_SRT(thunk_info),
+		     thunk_info->i.srt_bitmap);
+    }
+}
+
+/* Evacuate the SRT attached to a function's info table.  SRTs only
+ * need to be traversed during a major collection.
+ */
+STATIC_INLINE void
+scavenge_fun_srt(const StgInfoTable *info)
+{
+    if (major_gc) {
+	StgFunInfoTable *fun_info = itbl_to_fun_itbl(info);
+	scavenge_srt((StgClosure **)GET_FUN_SRT(fun_info),
+		     fun_info->i.srt_bitmap);
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ Scavenge a TSO.
+ -------------------------------------------------------------------------- */
+
+/* Scavenge a TSO: evacuate the closures it refers to (blocking info,
+ * blocked-exceptions queue, link, transaction record) and then walk
+ * its stack.
+ */
+static void
+scavengeTSO (StgTSO *tso)
+{
+    // block_info.closure is only a closure pointer for these
+    // why_blocked states, so only evacuate it then
+    if ( tso->why_blocked == BlockedOnMVar
+	 || tso->why_blocked == BlockedOnBlackHole
+	 || tso->why_blocked == BlockedOnException
+#if defined(PAR)
+	 || tso->why_blocked == BlockedOnGA
+	 || tso->why_blocked == BlockedOnGA_NoSend
+#endif
+	 ) {
+	tso->block_info.closure = evacuate(tso->block_info.closure);
+    }
+    if ( tso->blocked_exceptions != NULL ) {
+	tso->blocked_exceptions = 
+	    (StgTSO *)evacuate((StgClosure *)tso->blocked_exceptions);
+    }
+    
+    // We don't always chase the link field: TSOs on the blackhole
+    // queue are not automatically alive, so the link field is a
+    // "weak" pointer in that case.
+    if (tso->why_blocked != BlockedOnBlackHole) {
+	tso->link = (StgTSO *)evacuate((StgClosure *)tso->link);
+    }
+
+    // scavenge current transaction record
+    tso->trec = (StgTRecHeader *)evacuate((StgClosure *)tso->trec);
+    
+    // scavenge this thread's stack 
+    scavenge_stack(tso->sp, &(tso->stack[tso->stack_size]));
+}
+
+/* -----------------------------------------------------------------------------
+ Blocks of function args occur on the stack (at the top) and
+ in PAPs.
+ -------------------------------------------------------------------------- */
+
+/* Scavenge a block of function arguments described by fun_info's
+ * argument bitmap.  Returns the pointer just past the argument block.
+ * In the bitmaps here, a 0 bit marks a pointer word (evacuated in
+ * place), a 1 bit a non-pointer word (skipped).
+ */
+STATIC_INLINE StgPtr
+scavenge_arg_block (StgFunInfoTable *fun_info, StgClosure **args)
+{
+    StgPtr p;
+    StgWord bitmap;
+    nat size;
+
+    p = (StgPtr)args;
+    switch (fun_info->f.fun_type) {
+    case ARG_GEN:
+	// general case: small bitmap stored inline in the info table
+	bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+	size = BITMAP_SIZE(fun_info->f.b.bitmap);
+	goto small_bitmap;
+    case ARG_GEN_BIG:
+	// general case, too many args for a small bitmap
+	size = GET_FUN_LARGE_BITMAP(fun_info)->size;
+	scavenge_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info), size);
+	p += size;
+	break;
+    default:
+	// one of the standard argument patterns: look the bitmap up
+	// in the static table
+	bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+	size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
+    small_bitmap:
+	while (size > 0) {
+	    // 0 bit => pointer word: evacuate and write back
+	    if ((bitmap & 1) == 0) {
+		*p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	    }
+	    p++;
+	    bitmap = bitmap >> 1;
+	    size--;
+	}
+	break;
+    }
+    return p;
+}
+
+/* Scavenge the 'size' words of a PAP/AP payload, using the argument
+ * bitmap of the function closure 'fun' to distinguish pointers (0 bit,
+ * evacuated in place) from non-pointers (1 bit, skipped).  Returns the
+ * pointer just past the payload.
+ */
+STATIC_INLINE StgPtr
+scavenge_PAP_payload (StgClosure *fun, StgClosure **payload, StgWord size)
+{
+    StgPtr p;
+    StgWord bitmap;
+    StgFunInfoTable *fun_info;
+    
+    fun_info = get_fun_itbl(fun);
+    // a PAP's fun is never itself a PAP
+    ASSERT(fun_info->i.type != PAP);
+    p = (StgPtr)payload;
+
+    switch (fun_info->f.fun_type) {
+    case ARG_GEN:
+	// small bitmap stored inline in the info table
+	bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+	goto small_bitmap;
+    case ARG_GEN_BIG:
+	scavenge_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info), size);
+	p += size;
+	break;
+    case ARG_BCO:
+	// byte-code object: the bitmap lives in the BCO itself
+	scavenge_large_bitmap((StgPtr)payload, BCO_BITMAP(fun), size);
+	p += size;
+	break;
+    default:
+	// standard argument pattern: bitmap from the static table
+	bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+    small_bitmap:
+	while (size > 0) {
+	    // 0 bit => pointer word: evacuate and write back
+	    if ((bitmap & 1) == 0) {
+		*p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	    }
+	    p++;
+	    bitmap = bitmap >> 1;
+	    size--;
+	}
+	break;
+    }
+    return p;
+}
+
+/* Scavenge a PAP: evacuate its function closure, then walk its
+ * argument payload.  Returns the pointer just past the PAP's payload.
+ */
+STATIC_INLINE StgPtr
+scavenge_PAP (StgPAP *pap)
+{
+    StgClosure *fun;
+
+    fun = evacuate(pap->fun);
+    pap->fun = fun;
+    return scavenge_PAP_payload (fun, pap->payload, pap->n_args);
+}
+
+/* Scavenge an AP: evacuate its function closure, then walk its
+ * argument payload.  Returns the pointer just past the AP's payload.
+ */
+STATIC_INLINE StgPtr
+scavenge_AP (StgAP *ap)
+{
+    StgClosure *fun;
+
+    fun = evacuate(ap->fun);
+    ap->fun = fun;
+    return scavenge_PAP_payload (fun, ap->payload, ap->n_args);
+}
+
+/* -----------------------------------------------------------------------------
+ Scavenge a given step until there are no more objects in this step
+ to scavenge.
+
+ evac_gen is set by the caller to be either zero (for a step in a
+ generation < N) or G where G is the generation of the step being
+ scavenged.
+
+ We sometimes temporarily change evac_gen back to zero if we're
+ scavenging a mutable object where early promotion isn't such a good
+ idea.
+ -------------------------------------------------------------------------- */
+
+static void
+scavenge(step *stp)
+{
+ StgPtr p, q;
+ StgInfoTable *info;
+ bdescr *bd;
+ nat saved_evac_gen = evac_gen;
+
+ p = stp->scan;
+ bd = stp->scan_bd;
+
+ failed_to_evac = rtsFalse;
+
+ /* scavenge phase - standard breadth-first scavenging of the
+ * evacuated objects
+ */
+
+ while (bd != stp->hp_bd || p < stp->hp) {
+
+ // If we're at the end of this block, move on to the next block
+ if (bd != stp->hp_bd && p == bd->free) {
+ bd = bd->link;
+ p = bd->start;
+ continue;
+ }
+
+ ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+ info = get_itbl((StgClosure *)p);
+
+ ASSERT(thunk_selector_depth == 0);
+
+ q = p;
+ switch (info->type) {
+
+ case MVAR:
+ {
+ StgMVar *mvar = ((StgMVar *)p);
+ evac_gen = 0;
+ mvar->head = (StgTSO *)evacuate((StgClosure *)mvar->head);
+ mvar->tail = (StgTSO *)evacuate((StgClosure *)mvar->tail);
+ mvar->value = evacuate((StgClosure *)mvar->value);
+ evac_gen = saved_evac_gen;
+ failed_to_evac = rtsTrue; // mutable.
+ p += sizeofW(StgMVar);
+ break;
+ }
+
+ case FUN_2_0:
+ scavenge_fun_srt(info);
+ ((StgClosure *)p)->payload[1] = evacuate(((StgClosure *)p)->payload[1]);
+ ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+ p += sizeofW(StgHeader) + 2;
+ break;
+
+ case THUNK_2_0:
+ scavenge_thunk_srt(info);
+ ((StgThunk *)p)->payload[1] = evacuate(((StgThunk *)p)->payload[1]);
+ ((StgThunk *)p)->payload[0] = evacuate(((StgThunk *)p)->payload[0]);
+ p += sizeofW(StgThunk) + 2;
+ break;
+
+ case CONSTR_2_0:
+ ((StgClosure *)p)->payload[1] = evacuate(((StgClosure *)p)->payload[1]);
+ ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+ p += sizeofW(StgHeader) + 2;
+ break;
+
+ case THUNK_1_0:
+ scavenge_thunk_srt(info);
+ ((StgThunk *)p)->payload[0] = evacuate(((StgThunk *)p)->payload[0]);
+ p += sizeofW(StgThunk) + 1;
+ break;
+
+ case FUN_1_0:
+ scavenge_fun_srt(info);
+ case CONSTR_1_0:
+ ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+ p += sizeofW(StgHeader) + 1;
+ break;
+
+ case THUNK_0_1:
+ scavenge_thunk_srt(info);
+ p += sizeofW(StgThunk) + 1;
+ break;
+
+ case FUN_0_1:
+ scavenge_fun_srt(info);
+ case CONSTR_0_1:
+ p += sizeofW(StgHeader) + 1;
+ break;
+
+ case THUNK_0_2:
+ scavenge_thunk_srt(info);
+ p += sizeofW(StgThunk) + 2;
+ break;
+
+ case FUN_0_2:
+ scavenge_fun_srt(info);
+ case CONSTR_0_2:
+ p += sizeofW(StgHeader) + 2;
+ break;
+
+ case THUNK_1_1:
+ scavenge_thunk_srt(info);
+ ((StgThunk *)p)->payload[0] = evacuate(((StgThunk *)p)->payload[0]);
+ p += sizeofW(StgThunk) + 2;
+ break;
+
+ case FUN_1_1:
+ scavenge_fun_srt(info);
+ case CONSTR_1_1:
+ ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+ p += sizeofW(StgHeader) + 2;
+ break;
+
+ case FUN:
+ scavenge_fun_srt(info);
+ goto gen_obj;
+
+ case THUNK:
+ {
+ StgPtr end;
+
+ scavenge_thunk_srt(info);
+ end = (P_)((StgThunk *)p)->payload + info->layout.payload.ptrs;
+ for (p = (P_)((StgThunk *)p)->payload; p < end; p++) {
+ *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+ }
+ p += info->layout.payload.nptrs;
+ break;
+ }
+
+ gen_obj:
+ case CONSTR:
+ case WEAK:
+ case STABLE_NAME:
+ {
+ StgPtr end;
+
+ end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+ for (p = (P_)((StgClosure *)p)->payload; p < end; p++) {
+ *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+ }
+ p += info->layout.payload.nptrs;
+ break;
+ }
+
+ case BCO: {
+ StgBCO *bco = (StgBCO *)p;
+ bco->instrs = (StgArrWords *)evacuate((StgClosure *)bco->instrs);
+ bco->literals = (StgArrWords *)evacuate((StgClosure *)bco->literals);
+ bco->ptrs = (StgMutArrPtrs *)evacuate((StgClosure *)bco->ptrs);
+ bco->itbls = (StgArrWords *)evacuate((StgClosure *)bco->itbls);
+ p += bco_sizeW(bco);
+ break;
+ }
+
+ case IND_PERM:
+ if (stp->gen->no != 0) {
+#ifdef PROFILING
+ // @LDV profiling
+ // No need to call LDV_recordDead_FILL_SLOP_DYNAMIC() because an
+ // IND_OLDGEN_PERM closure is larger than an IND_PERM closure.
+ LDV_recordDead((StgClosure *)p, sizeofW(StgInd));
+#endif
+ //
+ // Todo: maybe use SET_HDR() and remove LDV_RECORD_CREATE()?
+ //
+ SET_INFO(((StgClosure *)p), &stg_IND_OLDGEN_PERM_info);
+
+ // We pretend that p has just been created.
+ LDV_RECORD_CREATE((StgClosure *)p);
+ }
+ // fall through
+ case IND_OLDGEN_PERM:
+ ((StgInd *)p)->indirectee = evacuate(((StgInd *)p)->indirectee);
+ p += sizeofW(StgInd);
+ break;
+
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY: {
+ rtsBool saved_eager_promotion = eager_promotion;
+
+ eager_promotion = rtsFalse;
+ ((StgMutVar *)p)->var = evacuate(((StgMutVar *)p)->var);
+ eager_promotion = saved_eager_promotion;
+
+ if (failed_to_evac) {
+ ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
+ } else {
+ ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info;
+ }
+ p += sizeofW(StgMutVar);
+ break;
+ }
+
+ case CAF_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case SE_BLACKHOLE:
+ case BLACKHOLE:
+ p += BLACKHOLE_sizeW();
+ break;
+
+ case THUNK_SELECTOR:
+ {
+ StgSelector *s = (StgSelector *)p;
+ s->selectee = evacuate(s->selectee);
+ p += THUNK_SELECTOR_sizeW();
+ break;
+ }
+
+ // A chunk of stack saved in a heap object
+ case AP_STACK:
+ {
+ StgAP_STACK *ap = (StgAP_STACK *)p;
+
+ ap->fun = evacuate(ap->fun);
+ scavenge_stack((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size);
+ p = (StgPtr)ap->payload + ap->size;
+ break;
+ }
+
+ case PAP:
+ p = scavenge_PAP((StgPAP *)p);
+ break;
+
+ case AP:
+ p = scavenge_AP((StgAP *)p);
+ break;
+
+ case ARR_WORDS:
+ // nothing to follow
+ p += arr_words_sizeW((StgArrWords *)p);
+ break;
+
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ // follow everything
+ {
+ StgPtr next;
+ rtsBool saved_eager;
+
+ // We don't eagerly promote objects pointed to by a mutable
+ // array, but if we find the array only points to objects in
+ // the same or an older generation, we mark it "clean" and
+ // avoid traversing it during minor GCs.
+ saved_eager = eager_promotion;
+ eager_promotion = rtsFalse;
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+ }
+ eager_promotion = saved_eager;
+
+ if (failed_to_evac) {
+ ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
+ } else {
+ ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info;
+ }
+
+ failed_to_evac = rtsTrue; // always put it on the mutable list.
+ break;
+ }
+
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+ // follow everything
+ {
+ StgPtr next;
+
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+ }
+
+ // If we're going to put this object on the mutable list, then
+ // set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that.
+ if (failed_to_evac) {
+ ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info;
+ } else {
+ ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info;
+ }
+ break;
+ }
+
+ case TSO:
+ {
+ StgTSO *tso = (StgTSO *)p;
+ rtsBool saved_eager = eager_promotion;
+
+ eager_promotion = rtsFalse;
+ scavengeTSO(tso);
+ eager_promotion = saved_eager;
+
+ if (failed_to_evac) {
+ tso->flags |= TSO_DIRTY;
+ } else {
+ tso->flags &= ~TSO_DIRTY;
+ }
+
+ failed_to_evac = rtsTrue; // always on the mutable list
+ p += tso_sizeW(tso);
+ break;
+ }
+
+#if defined(PAR)
+ case RBH:
+ {
+#if 0
+ nat size, ptrs, nonptrs, vhs;
+ char str[80];
+ StgInfoTable *rip = get_closure_info(p, &size, &ptrs, &nonptrs, &vhs, str);
+#endif
+ StgRBH *rbh = (StgRBH *)p;
+ (StgClosure *)rbh->blocking_queue =
+ evacuate((StgClosure *)rbh->blocking_queue);
+ failed_to_evac = rtsTrue; // mutable anyhow.
+ IF_DEBUG(gc,
+ debugBelch("@@ scavenge: RBH %p (%s) (new blocking_queue link=%p)",
+ p, info_type(p), (StgClosure *)rbh->blocking_queue));
+ // ToDo: use size of reverted closure here!
+ p += BLACKHOLE_sizeW();
+ break;
+ }
+
+ case BLOCKED_FETCH:
+ {
+ StgBlockedFetch *bf = (StgBlockedFetch *)p;
+ // follow the pointer to the node which is being demanded
+ (StgClosure *)bf->node =
+ evacuate((StgClosure *)bf->node);
+ // follow the link to the rest of the blocking queue
+ (StgClosure *)bf->link =
+ evacuate((StgClosure *)bf->link);
+ IF_DEBUG(gc,
+ debugBelch("@@ scavenge: %p (%s); node is now %p; exciting, isn't it",
+ bf, info_type((StgClosure *)bf),
+ bf->node, info_type(bf->node)));
+ p += sizeofW(StgBlockedFetch);
+ break;
+ }
+
+#ifdef DIST
+ case REMOTE_REF:
+#endif
+ case FETCH_ME:
+ p += sizeofW(StgFetchMe);
+ break; // nothing to do in this case
+
+ case FETCH_ME_BQ:
+ {
+ StgFetchMeBlockingQueue *fmbq = (StgFetchMeBlockingQueue *)p;
+ (StgClosure *)fmbq->blocking_queue =
+ evacuate((StgClosure *)fmbq->blocking_queue);
+ IF_DEBUG(gc,
+ debugBelch("@@ scavenge: %p (%s) exciting, isn't it",
+ p, info_type((StgClosure *)p)));
+ p += sizeofW(StgFetchMeBlockingQueue);
+ break;
+ }
+#endif
+
+ case TVAR_WAIT_QUEUE:
+ {
+ StgTVarWaitQueue *wq = ((StgTVarWaitQueue *) p);
+ evac_gen = 0;
+ wq->waiting_tso = (StgTSO *)evacuate((StgClosure*)wq->waiting_tso);
+ wq->next_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->next_queue_entry);
+ wq->prev_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->prev_queue_entry);
+ evac_gen = saved_evac_gen;
+ failed_to_evac = rtsTrue; // mutable
+ p += sizeofW(StgTVarWaitQueue);
+ break;
+ }
+
+ case TVAR:
+ {
+ StgTVar *tvar = ((StgTVar *) p);
+ evac_gen = 0;
+ tvar->current_value = evacuate((StgClosure*)tvar->current_value);
+ tvar->first_wait_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)tvar->first_wait_queue_entry);
+ evac_gen = saved_evac_gen;
+ failed_to_evac = rtsTrue; // mutable
+ p += sizeofW(StgTVar);
+ break;
+ }
+
+ case TREC_HEADER:
+ {
+ StgTRecHeader *trec = ((StgTRecHeader *) p);
+ evac_gen = 0;
+ trec->enclosing_trec = (StgTRecHeader *)evacuate((StgClosure*)trec->enclosing_trec);
+ trec->current_chunk = (StgTRecChunk *)evacuate((StgClosure*)trec->current_chunk);
+ evac_gen = saved_evac_gen;
+ failed_to_evac = rtsTrue; // mutable
+ p += sizeofW(StgTRecHeader);
+ break;
+ }
+
+ case TREC_CHUNK:
+ {
+ StgWord i;
+ StgTRecChunk *tc = ((StgTRecChunk *) p);
+ TRecEntry *e = &(tc -> entries[0]);
+ evac_gen = 0;
+ tc->prev_chunk = (StgTRecChunk *)evacuate((StgClosure*)tc->prev_chunk);
+ for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
+ e->tvar = (StgTVar *)evacuate((StgClosure*)e->tvar);
+ e->expected_value = evacuate((StgClosure*)e->expected_value);
+ e->new_value = evacuate((StgClosure*)e->new_value);
+ }
+ evac_gen = saved_evac_gen;
+ failed_to_evac = rtsTrue; // mutable
+ p += sizeofW(StgTRecChunk);
+ break;
+ }
+
+ default:
+ barf("scavenge: unimplemented/strange closure type %d @ %p",
+ info->type, p);
+ }
+
+ /*
+ * We need to record the current object on the mutable list if
+ * (a) It is actually mutable, or
+ * (b) It contains pointers to a younger generation.
+ * Case (b) arises if we didn't manage to promote everything that
+ * the current object points to into the current generation.
+ */
+ if (failed_to_evac) {
+ failed_to_evac = rtsFalse;
+ if (stp->gen_no > 0) {
+ recordMutableGen((StgClosure *)q, stp->gen);
+ }
+ }
+ }
+
+ stp->scan_bd = bd;
+ stp->scan = p;
+}
+
+/* -----------------------------------------------------------------------------
+ Scavenge everything on the mark stack.
+
+ This is slightly different from scavenge():
+ - we don't walk linearly through the objects, so the scavenger
+ doesn't need to advance the pointer on to the next object.
+ -------------------------------------------------------------------------- */
+
+static void
+scavenge_mark_stack(void)
+{
+    StgPtr p, q;
+    StgInfoTable *info;
+    nat saved_evac_gen;
+
+    // Everything on the mark stack lives in the oldest generation, so
+    // evacuate as far as possible by default; individual mutable cases
+    // temporarily drop evac_gen to 0 and restore it afterwards.
+    evac_gen = oldest_gen->no;
+    saved_evac_gen = evac_gen;
+
+linear_scan:
+    while (!mark_stack_empty()) {
+	p = pop_mark_stack();
+
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+	info = get_itbl((StgClosure *)p);
+
+	q = p;
+	switch (info->type) {
+
+	case MVAR:
+	{
+	    StgMVar *mvar = ((StgMVar *)p);
+	    evac_gen = 0;
+	    mvar->head = (StgTSO *)evacuate((StgClosure *)mvar->head);
+	    mvar->tail = (StgTSO *)evacuate((StgClosure *)mvar->tail);
+	    mvar->value = evacuate((StgClosure *)mvar->value);
+	    evac_gen = saved_evac_gen;
+	    failed_to_evac = rtsTrue; // mutable.
+	    break;
+	}
+
+	case FUN_2_0:
+	    scavenge_fun_srt(info);
+	    ((StgClosure *)p)->payload[1] = evacuate(((StgClosure *)p)->payload[1]);
+	    ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+	    break;
+
+	case THUNK_2_0:
+	    scavenge_thunk_srt(info);
+	    ((StgThunk *)p)->payload[1] = evacuate(((StgThunk *)p)->payload[1]);
+	    ((StgThunk *)p)->payload[0] = evacuate(((StgThunk *)p)->payload[0]);
+	    break;
+
+	case CONSTR_2_0:
+	    ((StgClosure *)p)->payload[1] = evacuate(((StgClosure *)p)->payload[1]);
+	    ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+	    break;
+
+	case FUN_1_0:
+	case FUN_1_1:
+	    scavenge_fun_srt(info);
+	    ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+	    break;
+
+	case THUNK_1_0:
+	case THUNK_1_1:
+	    scavenge_thunk_srt(info);
+	    ((StgThunk *)p)->payload[0] = evacuate(((StgThunk *)p)->payload[0]);
+	    break;
+
+	case CONSTR_1_0:
+	case CONSTR_1_1:
+	    ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+	    break;
+
+	case FUN_0_1:
+	case FUN_0_2:
+	    scavenge_fun_srt(info);
+	    break;
+
+	case THUNK_0_1:
+	case THUNK_0_2:
+	    scavenge_thunk_srt(info);
+	    break;
+
+	case CONSTR_0_1:
+	case CONSTR_0_2:
+	    break;
+
+	case FUN:
+	    scavenge_fun_srt(info);
+	    goto gen_obj;
+
+	case THUNK:
+	{
+	    StgPtr end;
+
+	    scavenge_thunk_srt(info);
+	    end = (P_)((StgThunk *)p)->payload + info->layout.payload.ptrs;
+	    for (p = (P_)((StgThunk *)p)->payload; p < end; p++) {
+		*p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	    }
+	    break;
+	}
+
+	gen_obj:
+	case CONSTR:
+	case WEAK:
+	case STABLE_NAME:
+	{
+	    StgPtr end;
+
+	    end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+	    for (p = (P_)((StgClosure *)p)->payload; p < end; p++) {
+		*p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	    }
+	    break;
+	}
+
+	case BCO: {
+	    StgBCO *bco = (StgBCO *)p;
+	    bco->instrs = (StgArrWords *)evacuate((StgClosure *)bco->instrs);
+	    bco->literals = (StgArrWords *)evacuate((StgClosure *)bco->literals);
+	    bco->ptrs = (StgMutArrPtrs *)evacuate((StgClosure *)bco->ptrs);
+	    bco->itbls = (StgArrWords *)evacuate((StgClosure *)bco->itbls);
+	    break;
+	}
+
+	case IND_PERM:
+	    // don't need to do anything here: the only possible case
+	    // is that we're in a 1-space compacting collector, with
+	    // no "old" generation.
+	    break;
+
+	case IND_OLDGEN:
+	case IND_OLDGEN_PERM:
+	    ((StgInd *)p)->indirectee =
+		evacuate(((StgInd *)p)->indirectee);
+	    break;
+
+	case MUT_VAR_CLEAN:
+	case MUT_VAR_DIRTY: {
+	    rtsBool saved_eager_promotion = eager_promotion;
+
+	    eager_promotion = rtsFalse;
+	    ((StgMutVar *)p)->var = evacuate(((StgMutVar *)p)->var);
+	    eager_promotion = saved_eager_promotion;
+
+	    if (failed_to_evac) {
+		((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
+	    } else {
+		((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info;
+	    }
+	    break;
+	}
+
+	case CAF_BLACKHOLE:
+	case SE_CAF_BLACKHOLE:
+	case SE_BLACKHOLE:
+	case BLACKHOLE:
+	case ARR_WORDS:
+	    break;
+
+	case THUNK_SELECTOR:
+	{
+	    StgSelector *s = (StgSelector *)p;
+	    s->selectee = evacuate(s->selectee);
+	    break;
+	}
+
+	// A chunk of stack saved in a heap object
+	case AP_STACK:
+	{
+	    StgAP_STACK *ap = (StgAP_STACK *)p;
+
+	    ap->fun = evacuate(ap->fun);
+	    scavenge_stack((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size);
+	    break;
+	}
+
+	case PAP:
+	    scavenge_PAP((StgPAP *)p);
+	    break;
+
+	case AP:
+	    scavenge_AP((StgAP *)p);
+	    break;
+
+	case MUT_ARR_PTRS_CLEAN:
+	case MUT_ARR_PTRS_DIRTY:
+	    // follow everything
+	{
+	    StgPtr next;
+	    rtsBool saved_eager;
+
+	    // We don't eagerly promote objects pointed to by a mutable
+	    // array, but if we find the array only points to objects in
+	    // the same or an older generation, we mark it "clean" and
+	    // avoid traversing it during minor GCs.
+	    saved_eager = eager_promotion;
+	    eager_promotion = rtsFalse;
+	    next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+	    for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+		*p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	    }
+	    eager_promotion = saved_eager;
+
+	    if (failed_to_evac) {
+		((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
+	    } else {
+		((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info;
+	    }
+
+	    failed_to_evac = rtsTrue; // mutable anyhow.
+	    break;
+	}
+
+	case MUT_ARR_PTRS_FROZEN:
+	case MUT_ARR_PTRS_FROZEN0:
+	    // follow everything
+	{
+	    StgPtr next, q = p;
+
+	    next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+	    for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+		*p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	    }
+
+	    // If we're going to put this object on the mutable list, then
+	    // set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that.
+	    if (failed_to_evac) {
+		((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info;
+	    } else {
+		((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info;
+	    }
+	    break;
+	}
+
+	case TSO:
+	{
+	    StgTSO *tso = (StgTSO *)p;
+	    rtsBool saved_eager = eager_promotion;
+
+	    eager_promotion = rtsFalse;
+	    scavengeTSO(tso);
+	    eager_promotion = saved_eager;
+
+	    if (failed_to_evac) {
+		tso->flags |= TSO_DIRTY;
+	    } else {
+		tso->flags &= ~TSO_DIRTY;
+	    }
+
+	    failed_to_evac = rtsTrue; // always on the mutable list
+	    break;
+	}
+
+#if defined(PAR)
+	case RBH:
+	{
+#if 0
+	    nat size, ptrs, nonptrs, vhs;
+	    char str[80];
+	    StgInfoTable *rip = get_closure_info(p, &size, &ptrs, &nonptrs, &vhs, str);
+#endif
+	    StgRBH *rbh = (StgRBH *)p;
+	    // Fix: this previously referenced an undeclared variable "bh";
+	    // the closure in hand is "rbh".  Written in the same style as
+	    // the RBH cases in scavenge() and scavenge_one().
+	    (StgClosure *)rbh->blocking_queue =
+		evacuate((StgClosure *)rbh->blocking_queue);
+	    failed_to_evac = rtsTrue; // mutable anyhow.
+	    IF_DEBUG(gc,
+		     debugBelch("@@ scavenge: RBH %p (%s) (new blocking_queue link=%p)",
+				p, info_type(p), (StgClosure *)rbh->blocking_queue));
+	    break;
+	}
+
+	case BLOCKED_FETCH:
+	{
+	    StgBlockedFetch *bf = (StgBlockedFetch *)p;
+	    // follow the pointer to the node which is being demanded
+	    (StgClosure *)bf->node =
+		evacuate((StgClosure *)bf->node);
+	    // follow the link to the rest of the blocking queue
+	    (StgClosure *)bf->link =
+		evacuate((StgClosure *)bf->link);
+	    IF_DEBUG(gc,
+		     debugBelch("@@ scavenge: %p (%s); node is now %p; exciting, isn't it",
+				bf, info_type((StgClosure *)bf),
+				bf->node, info_type(bf->node)));
+	    break;
+	}
+
+#ifdef DIST
+	case REMOTE_REF:
+#endif
+	case FETCH_ME:
+	    break; // nothing to do in this case
+
+	case FETCH_ME_BQ:
+	{
+	    StgFetchMeBlockingQueue *fmbq = (StgFetchMeBlockingQueue *)p;
+	    (StgClosure *)fmbq->blocking_queue =
+		evacuate((StgClosure *)fmbq->blocking_queue);
+	    IF_DEBUG(gc,
+		     debugBelch("@@ scavenge: %p (%s) exciting, isn't it",
+				p, info_type((StgClosure *)p)));
+	    break;
+	}
+#endif /* PAR */
+
+	case TVAR_WAIT_QUEUE:
+	{
+	    StgTVarWaitQueue *wq = ((StgTVarWaitQueue *) p);
+	    evac_gen = 0;
+	    wq->waiting_tso = (StgTSO *)evacuate((StgClosure*)wq->waiting_tso);
+	    wq->next_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->next_queue_entry);
+	    wq->prev_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->prev_queue_entry);
+	    evac_gen = saved_evac_gen;
+	    failed_to_evac = rtsTrue; // mutable
+	    break;
+	}
+
+	case TVAR:
+	{
+	    StgTVar *tvar = ((StgTVar *) p);
+	    evac_gen = 0;
+	    tvar->current_value = evacuate((StgClosure*)tvar->current_value);
+	    tvar->first_wait_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)tvar->first_wait_queue_entry);
+	    evac_gen = saved_evac_gen;
+	    failed_to_evac = rtsTrue; // mutable
+	    break;
+	}
+
+	case TREC_CHUNK:
+	{
+	    StgWord i;
+	    StgTRecChunk *tc = ((StgTRecChunk *) p);
+	    TRecEntry *e = &(tc -> entries[0]);
+	    evac_gen = 0;
+	    tc->prev_chunk = (StgTRecChunk *)evacuate((StgClosure*)tc->prev_chunk);
+	    for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
+		e->tvar = (StgTVar *)evacuate((StgClosure*)e->tvar);
+		e->expected_value = evacuate((StgClosure*)e->expected_value);
+		e->new_value = evacuate((StgClosure*)e->new_value);
+	    }
+	    evac_gen = saved_evac_gen;
+	    failed_to_evac = rtsTrue; // mutable
+	    break;
+	}
+
+	case TREC_HEADER:
+	{
+	    StgTRecHeader *trec = ((StgTRecHeader *) p);
+	    evac_gen = 0;
+	    trec->enclosing_trec = (StgTRecHeader *)evacuate((StgClosure*)trec->enclosing_trec);
+	    trec->current_chunk = (StgTRecChunk *)evacuate((StgClosure*)trec->current_chunk);
+	    evac_gen = saved_evac_gen;
+	    failed_to_evac = rtsTrue; // mutable
+	    break;
+	}
+
+	default:
+	    barf("scavenge_mark_stack: unimplemented/strange closure type %d @ %p",
+		 info->type, p);
+	}
+
+	// Anything that still points into a younger generation must go
+	// (back) on the mutable list of the generation it lives in.
+	if (failed_to_evac) {
+	    failed_to_evac = rtsFalse;
+	    if (evac_gen > 0) {
+		recordMutableGen((StgClosure *)q, &generations[evac_gen]);
+	    }
+	}
+
+	// mark the next bit to indicate "scavenged"
+	mark(q+1, Bdescr(q));
+
+    } // while (!mark_stack_empty())
+
+    // start a new linear scan if the mark stack overflowed at some point
+    if (mark_stack_overflowed && oldgen_scan_bd == NULL) {
+	IF_DEBUG(gc, debugBelch("scavenge_mark_stack: starting linear scan"));
+	mark_stack_overflowed = rtsFalse;
+	oldgen_scan_bd = oldest_gen->steps[0].old_blocks;
+	oldgen_scan = oldgen_scan_bd->start;
+    }
+
+    if (oldgen_scan_bd) {
+	// push a new thing on the mark stack
+    loop:
+	// find a closure that is marked but not scavenged, and start
+	// from there.
+	while (oldgen_scan < oldgen_scan_bd->free
+	       && !is_marked(oldgen_scan,oldgen_scan_bd)) {
+	    oldgen_scan++;
+	}
+
+	if (oldgen_scan < oldgen_scan_bd->free) {
+
+	    // already scavenged?
+	    if (is_marked(oldgen_scan+1,oldgen_scan_bd)) {
+		oldgen_scan += sizeofW(StgHeader) + MIN_PAYLOAD_SIZE;
+		goto loop;
+	    }
+	    push_mark_stack(oldgen_scan);
+	    // ToDo: bump the linear scan by the actual size of the object
+	    oldgen_scan += sizeofW(StgHeader) + MIN_PAYLOAD_SIZE;
+	    goto linear_scan;
+	}
+
+	oldgen_scan_bd = oldgen_scan_bd->link;
+	if (oldgen_scan_bd != NULL) {
+	    oldgen_scan = oldgen_scan_bd->start;
+	    goto loop;
+	}
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ Scavenge one object.
+
+ This is used for objects that are temporarily marked as mutable
+ because they contain old-to-new generation pointers. Only certain
+ objects can have this property.
+ -------------------------------------------------------------------------- */
+
+// Scavenge a single object at p.  Returns rtsTrue if the object could
+// not be fully promoted (i.e. it still points into a younger generation)
+// and therefore must stay on the mutable list; resets failed_to_evac
+// before returning.
+static rtsBool
+scavenge_one(StgPtr p)
+{
+    const StgInfoTable *info;
+    nat saved_evac_gen = evac_gen;
+    rtsBool no_luck;
+
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+    info = get_itbl((StgClosure *)p);
+
+    switch (info->type) {
+
+    case MVAR:
+    {
+	StgMVar *mvar = ((StgMVar *)p);
+	evac_gen = 0;
+	mvar->head = (StgTSO *)evacuate((StgClosure *)mvar->head);
+	mvar->tail = (StgTSO *)evacuate((StgClosure *)mvar->tail);
+	mvar->value = evacuate((StgClosure *)mvar->value);
+	evac_gen = saved_evac_gen;
+	failed_to_evac = rtsTrue; // mutable.
+	break;
+    }
+
+    case THUNK:
+    case THUNK_1_0:
+    case THUNK_0_1:
+    case THUNK_1_1:
+    case THUNK_0_2:
+    case THUNK_2_0:
+    {
+	StgPtr q, end;
+
+	// evacuate every pointer in the thunk's payload
+	end = (StgPtr)((StgThunk *)p)->payload + info->layout.payload.ptrs;
+	for (q = (StgPtr)((StgThunk *)p)->payload; q < end; q++) {
+	    *q = (StgWord)(StgPtr)evacuate((StgClosure *)*q);
+	}
+	break;
+    }
+
+    case FUN:
+    case FUN_1_0:			// hardly worth specialising these guys
+    case FUN_0_1:
+    case FUN_1_1:
+    case FUN_0_2:
+    case FUN_2_0:
+    case CONSTR:
+    case CONSTR_1_0:
+    case CONSTR_0_1:
+    case CONSTR_1_1:
+    case CONSTR_0_2:
+    case CONSTR_2_0:
+    case WEAK:
+    case IND_PERM:
+    {
+	StgPtr q, end;
+
+	end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+	for (q = (StgPtr)((StgClosure *)p)->payload; q < end; q++) {
+	    *q = (StgWord)(StgPtr)evacuate((StgClosure *)*q);
+	}
+	break;
+    }
+
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY: {
+	StgPtr q = p;
+	rtsBool saved_eager_promotion = eager_promotion;
+
+	eager_promotion = rtsFalse;
+	((StgMutVar *)p)->var = evacuate(((StgMutVar *)p)->var);
+	eager_promotion = saved_eager_promotion;
+
+	// record cleanliness in the info pointer for later minor GCs
+	if (failed_to_evac) {
+	    ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
+	} else {
+	    ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info;
+	}
+	break;
+    }
+
+    case CAF_BLACKHOLE:
+    case SE_CAF_BLACKHOLE:
+    case SE_BLACKHOLE:
+    case BLACKHOLE:
+	break;
+
+    case THUNK_SELECTOR:
+    {
+	StgSelector *s = (StgSelector *)p;
+	s->selectee = evacuate(s->selectee);
+	break;
+    }
+
+    case AP_STACK:
+    {
+	StgAP_STACK *ap = (StgAP_STACK *)p;
+
+	ap->fun = evacuate(ap->fun);
+	scavenge_stack((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size);
+	p = (StgPtr)ap->payload + ap->size;
+	break;
+    }
+
+    case PAP:
+	p = scavenge_PAP((StgPAP *)p);
+	break;
+
+    case AP:
+	p = scavenge_AP((StgAP *)p);
+	break;
+
+    case ARR_WORDS:
+	// nothing to follow
+	break;
+
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    {
+	StgPtr next, q;
+	rtsBool saved_eager;
+
+	// We don't eagerly promote objects pointed to by a mutable
+	// array, but if we find the array only points to objects in
+	// the same or an older generation, we mark it "clean" and
+	// avoid traversing it during minor GCs.
+	saved_eager = eager_promotion;
+	eager_promotion = rtsFalse;
+	q = p;
+	next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+	for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+	    *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	}
+	eager_promotion = saved_eager;
+
+	if (failed_to_evac) {
+	    ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
+	} else {
+	    ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info;
+	}
+
+	failed_to_evac = rtsTrue;
+	break;
+    }
+
+    case MUT_ARR_PTRS_FROZEN:
+    case MUT_ARR_PTRS_FROZEN0:
+    {
+	// follow everything
+	StgPtr next, q=p;
+
+	next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+	for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+	    *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	}
+
+	// If we're going to put this object on the mutable list, then
+	// set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that.
+	if (failed_to_evac) {
+	    ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info;
+	} else {
+	    ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info;
+	}
+	break;
+    }
+
+    case TSO:
+    {
+	StgTSO *tso = (StgTSO *)p;
+	rtsBool saved_eager = eager_promotion;
+
+	eager_promotion = rtsFalse;
+	scavengeTSO(tso);
+	eager_promotion = saved_eager;
+
+	if (failed_to_evac) {
+	    tso->flags |= TSO_DIRTY;
+	} else {
+	    tso->flags &= ~TSO_DIRTY;
+	}
+
+	failed_to_evac = rtsTrue; // always on the mutable list
+	break;
+    }
+
+#if defined(PAR)
+    case RBH:
+    {
+#if 0
+	nat size, ptrs, nonptrs, vhs;
+	char str[80];
+	StgInfoTable *rip = get_closure_info(p, &size, &ptrs, &nonptrs, &vhs, str);
+#endif
+	StgRBH *rbh = (StgRBH *)p;
+	(StgClosure *)rbh->blocking_queue =
+	    evacuate((StgClosure *)rbh->blocking_queue);
+	failed_to_evac = rtsTrue; // mutable anyhow.
+	IF_DEBUG(gc,
+		 debugBelch("@@ scavenge: RBH %p (%s) (new blocking_queue link=%p)",
+			    p, info_type(p), (StgClosure *)rbh->blocking_queue));
+	// ToDo: use size of reverted closure here!
+	break;
+    }
+
+    case BLOCKED_FETCH:
+    {
+	StgBlockedFetch *bf = (StgBlockedFetch *)p;
+	// follow the pointer to the node which is being demanded
+	(StgClosure *)bf->node =
+	    evacuate((StgClosure *)bf->node);
+	// follow the link to the rest of the blocking queue
+	(StgClosure *)bf->link =
+	    evacuate((StgClosure *)bf->link);
+	IF_DEBUG(gc,
+		 debugBelch("@@ scavenge: %p (%s); node is now %p; exciting, isn't it",
+			    bf, info_type((StgClosure *)bf),
+			    bf->node, info_type(bf->node)));
+	break;
+    }
+
+#ifdef DIST
+    case REMOTE_REF:
+#endif
+    case FETCH_ME:
+	break; // nothing to do in this case
+
+    case FETCH_ME_BQ:
+    {
+	StgFetchMeBlockingQueue *fmbq = (StgFetchMeBlockingQueue *)p;
+	(StgClosure *)fmbq->blocking_queue =
+	    evacuate((StgClosure *)fmbq->blocking_queue);
+	IF_DEBUG(gc,
+		 debugBelch("@@ scavenge: %p (%s) exciting, isn't it",
+			    p, info_type((StgClosure *)p)));
+	break;
+    }
+#endif
+
+    case TVAR_WAIT_QUEUE:
+    {
+	StgTVarWaitQueue *wq = ((StgTVarWaitQueue *) p);
+	evac_gen = 0;
+	wq->waiting_tso = (StgTSO *)evacuate((StgClosure*)wq->waiting_tso);
+	wq->next_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->next_queue_entry);
+	wq->prev_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->prev_queue_entry);
+	evac_gen = saved_evac_gen;
+	failed_to_evac = rtsTrue; // mutable
+	break;
+    }
+
+    case TVAR:
+    {
+	StgTVar *tvar = ((StgTVar *) p);
+	evac_gen = 0;
+	tvar->current_value = evacuate((StgClosure*)tvar->current_value);
+	tvar->first_wait_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)tvar->first_wait_queue_entry);
+	evac_gen = saved_evac_gen;
+	failed_to_evac = rtsTrue; // mutable
+	break;
+    }
+
+    case TREC_HEADER:
+    {
+	StgTRecHeader *trec = ((StgTRecHeader *) p);
+	evac_gen = 0;
+	trec->enclosing_trec = (StgTRecHeader *)evacuate((StgClosure*)trec->enclosing_trec);
+	trec->current_chunk = (StgTRecChunk *)evacuate((StgClosure*)trec->current_chunk);
+	evac_gen = saved_evac_gen;
+	failed_to_evac = rtsTrue; // mutable
+	break;
+    }
+
+    case TREC_CHUNK:
+    {
+	StgWord i;
+	StgTRecChunk *tc = ((StgTRecChunk *) p);
+	TRecEntry *e = &(tc -> entries[0]);
+	evac_gen = 0;
+	tc->prev_chunk = (StgTRecChunk *)evacuate((StgClosure*)tc->prev_chunk);
+	for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
+	    e->tvar = (StgTVar *)evacuate((StgClosure*)e->tvar);
+	    e->expected_value = evacuate((StgClosure*)e->expected_value);
+	    e->new_value = evacuate((StgClosure*)e->new_value);
+	}
+	evac_gen = saved_evac_gen;
+	failed_to_evac = rtsTrue; // mutable
+	break;
+    }
+
+    case IND_OLDGEN:
+    case IND_OLDGEN_PERM:
+    case IND_STATIC:
+    {
+	/* Careful here: a THUNK can be on the mutable list because
+	 * it contains pointers to young gen objects. If such a thunk
+	 * is updated, the IND_OLDGEN will be added to the mutable
+	 * list again, and we'll scavenge it twice. evacuate()
+	 * doesn't check whether the object has already been
+	 * evacuated, so we perform that check here.
+	 */
+	StgClosure *q = ((StgInd *)p)->indirectee;
+	if (HEAP_ALLOCED(q) && Bdescr((StgPtr)q)->flags & BF_EVACUATED) {
+	    break;
+	}
+	((StgInd *)p)->indirectee = evacuate(q);
+    }
+
+#if 0 && defined(DEBUG)
+      if (RtsFlags.DebugFlags.gc)
+      /* Debugging code to print out the size of the thing we just
+       * promoted
+       */
+      {
+	// NOTE(review): this disabled code refers to a variable "gen"
+	// that is not in scope here — confirm before ever enabling it.
+	StgPtr start = gen->steps[0].scan;
+	bdescr *start_bd = gen->steps[0].scan_bd;
+	nat size = 0;
+	scavenge(&gen->steps[0]);
+	if (start_bd != gen->steps[0].scan_bd) {
+	  size += (P_)BLOCK_ROUND_UP(start) - start;
+	  start_bd = start_bd->link;
+	  while (start_bd != gen->steps[0].scan_bd) {
+	    size += BLOCK_SIZE_W;
+	    start_bd = start_bd->link;
+	  }
+	  size += gen->steps[0].scan -
+	    (P_)BLOCK_ROUND_DOWN(gen->steps[0].scan);
+	} else {
+	  size = gen->steps[0].scan - start;
+	}
+	debugBelch("evac IND_OLDGEN: %ld bytes", size * sizeof(W_));
+      }
+#endif
+      break;
+
+    default:
+	barf("scavenge_one: strange object %d", (int)(info->type));
+    }
+
+    // hand failed_to_evac to the caller and clear the global flag
+    no_luck = failed_to_evac;
+    failed_to_evac = rtsFalse;
+    return (no_luck);
+}
+
+/* -----------------------------------------------------------------------------
+ Scavenging mutable lists.
+
+ We treat the mutable list of each generation > N (i.e. all the
+ generations older than the one being collected) as roots. We also
+ remove non-mutable objects from the mutable list at this point.
+ -------------------------------------------------------------------------- */
+
+// Scavenge the saved mutable list of a generation older than the one
+// being collected, re-recording on the (new) mutable list any object
+// that is still mutable or still points into a younger generation.
+static void
+scavenge_mutable_list(generation *gen)
+{
+    bdescr *bd;
+    StgPtr p, q;
+
+    bd = gen->saved_mut_list;
+
+    // promote no further than this generation
+    evac_gen = gen->no;
+    for (; bd != NULL; bd = bd->link) {
+	// each word in a mutable-list block is a pointer to a closure
+	for (q = bd->start; q < bd->free; q++) {
+	    p = (StgPtr)*q;
+	    ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+
+#ifdef DEBUG
+	    // statistics on what kinds of objects appear on mutable lists
+	    switch (get_itbl((StgClosure *)p)->type) {
+	    case MUT_VAR_CLEAN:
+		barf("MUT_VAR_CLEAN on mutable list");
+	    case MUT_VAR_DIRTY:
+		mutlist_MUTVARS++; break;
+	    case MUT_ARR_PTRS_CLEAN:
+	    case MUT_ARR_PTRS_DIRTY:
+	    case MUT_ARR_PTRS_FROZEN:
+	    case MUT_ARR_PTRS_FROZEN0:
+		mutlist_MUTARRS++; break;
+	    default:
+		mutlist_OTHERS++; break;
+	    }
+#endif
+
+	    // Check whether this object is "clean", that is it
+	    // definitely doesn't point into a young generation.
+	    // Clean objects don't need to be scavenged.  Some clean
+	    // objects (MUT_VAR_CLEAN) are not kept on the mutable
+	    // list at all; others, such as MUT_ARR_PTRS_CLEAN and
+	    // TSO, are always on the mutable list.
+	    //
+	    switch (get_itbl((StgClosure *)p)->type) {
+	    case MUT_ARR_PTRS_CLEAN:
+		recordMutableGen((StgClosure *)p,gen);
+		continue;
+	    case TSO: {
+		StgTSO *tso = (StgTSO *)p;
+		if ((tso->flags & TSO_DIRTY) == 0) {
+		    // A clean TSO: we don't have to traverse its
+		    // stack.  However, we *do* follow the link field:
+		    // we don't want to have to mark a TSO dirty just
+		    // because we put it on a different queue.
+		    if (tso->why_blocked != BlockedOnBlackHole) {
+			tso->link = (StgTSO *)evacuate((StgClosure *)tso->link);
+		    }
+		    recordMutableGen((StgClosure *)p,gen);
+		    continue;
+		}
+	    }
+	    // fall through: dirty TSOs are scavenged below
+	    default:
+		;
+	    }
+
+	    if (scavenge_one(p)) {
+		// didn't manage to promote everything, so put the
+		// object back on the list.
+		recordMutableGen((StgClosure *)p,gen);
+	    }
+	}
+    }
+
+    // free the old mut_list
+    freeChain(gen->saved_mut_list);
+    gen->saved_mut_list = NULL;
+}
+
+
+// Scavenge the static_objects list.  Each object is moved onto the
+// scavenged_static_objects list *before* it is scavenged, because
+// evacuating its children may push further static objects onto
+// static_objects (which is re-read at the bottom of the loop).
+static void
+scavenge_static(void)
+{
+  StgClosure* p = static_objects;
+  const StgInfoTable *info;
+
+  /* Always evacuate straight to the oldest generation for static
+   * objects */
+  evac_gen = oldest_gen->no;
+
+  /* keep going until we've scavenged all the objects on the linked
+     list... */
+  while (p != END_OF_STATIC_LIST) {
+
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+    info = get_itbl(p);
+    /*
+    if (info->type==RBH)
+      info = REVERT_INFOPTR(info); // if it's an RBH, look at the orig closure
+    */
+    // make sure the info pointer is into text space
+
+    /* Take this object *off* the static_objects list,
+     * and put it on the scavenged_static_objects list.
+     */
+    static_objects = *STATIC_LINK(info,p);
+    *STATIC_LINK(info,p) = scavenged_static_objects;
+    scavenged_static_objects = p;
+
+    switch (info -> type) {
+
+    case IND_STATIC:
+      {
+	StgInd *ind = (StgInd *)p;
+	ind->indirectee = evacuate(ind->indirectee);
+
+	/* might fail to evacuate it, in which case we have to pop it
+	 * back on the mutable list of the oldest generation.  We
+	 * leave it *on* the scavenged_static_objects list, though,
+	 * in case we visit this object again.
+	 */
+	if (failed_to_evac) {
+	  failed_to_evac = rtsFalse;
+	  recordMutableGen((StgClosure *)p,oldest_gen);
+	}
+	break;
+      }
+
+    case THUNK_STATIC:
+      scavenge_thunk_srt(info);
+      break;
+
+    case FUN_STATIC:
+      scavenge_fun_srt(info);
+      break;
+
+    case CONSTR_STATIC:
+      {
+	StgPtr q, next;
+
+	next = (P_)p->payload + info->layout.payload.ptrs;
+	// evacuate the pointers
+	for (q = (P_)p->payload; q < next; q++) {
+	  *q = (StgWord)(StgPtr)evacuate((StgClosure *)*q);
+	}
+	break;
+      }
+
+    default:
+      barf("scavenge_static: strange closure %d", (int)(info->type));
+    }
+
+    ASSERT(failed_to_evac == rtsFalse);
+
+    /* get the next static object from the list.  Remember, there might
+     * be more stuff on this list now that we've done some evacuating!
+     * (static_objects is a global)
+     */
+    p = static_objects;
+  }
+}
+
+/* -----------------------------------------------------------------------------
+ scavenge a chunk of memory described by a bitmap
+ -------------------------------------------------------------------------- */
+
+// Scavenge `size` stack words starting at p, using a large (multi-word)
+// bitmap: a 0 bit means "pointer, evacuate it"; a 1 bit means non-pointer.
+static void
+scavenge_large_bitmap( StgPtr p, StgLargeBitmap *large_bitmap, nat size )
+{
+    nat i, b;
+    StgWord bitmap;
+
+    b = 0;
+    bitmap = large_bitmap->bitmap[b];
+    for (i = 0; i < size; ) {
+	if ((bitmap & 1) == 0) {
+	    *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	}
+	i++;
+	p++;
+	if (i % BITS_IN(W_) == 0) {
+	    b++;
+	    // Only refill when more bits are needed: when size is an
+	    // exact multiple of BITS_IN(W_), the unconditional refill
+	    // read one word past the end of the bitmap array.
+	    if (i < size) {
+		bitmap = large_bitmap->bitmap[b];
+	    }
+	} else {
+	    bitmap = bitmap >> 1;
+	}
+    }
+}
+
+// Scavenge `size` stack words at p using a single-word bitmap: a 0 bit
+// marks a pointer (evacuate it), a 1 bit marks a non-pointer (skip it).
+// Returns the first word past the bitmapped area.
+STATIC_INLINE StgPtr
+scavenge_small_bitmap (StgPtr p, nat size, StgWord bitmap)
+{
+    nat i;
+
+    for (i = 0; i < size; i++, p++, bitmap >>= 1) {
+	if ((bitmap & 1) == 0) {
+	    *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	}
+    }
+    return p;
+}
+
+/* -----------------------------------------------------------------------------
+ scavenge_stack walks over a section of stack and evacuates all the
+ objects pointed to by it. We can use the same code for walking
+ AP_STACK_UPDs, since these are just sections of copied stack.
+ -------------------------------------------------------------------------- */
+
+
+// Walk the stack section [p, stack_end) frame by frame, evacuating
+// every pointer each activation record holds.  Also used for AP_STACK
+// payloads, which are just copied sections of stack.
+static void
+scavenge_stack(StgPtr p, StgPtr stack_end)
+{
+  const StgRetInfoTable* info;
+  StgWord bitmap;
+  nat size;
+
+  //IF_DEBUG(sanity, debugBelch("  scavenging stack between %p and %p", p, stack_end));
+
+  /*
+   * Each time around this loop, we are looking at a chunk of stack
+   * that starts with an activation record.
+   */
+
+  while (p < stack_end) {
+    info  = get_ret_itbl((StgClosure *)p);
+
+    switch (info->i.type) {
+
+    case UPDATE_FRAME:
+	// In SMP, we can get update frames that point to indirections
+	// when two threads evaluate the same thunk.  We do attempt to
+	// discover this situation in threadPaused(), but it's
+	// possible that the following sequence occurs:
+	//
+	//        A             B
+	//                  enter T
+	//     enter T
+	//     blackhole T
+	//                  update T
+	//     GC
+	//
+	// Now T is an indirection, and the update frame is already
+	// marked on A's stack, so we won't traverse it again in
+	// threadPaused().  We could traverse the whole stack again
+	// before GC, but that seems like overkill.
+	//
+	// Scavenging this update frame as normal would be disastrous;
+	// the updatee would end up pointing to the value.  So we turn
+	// the indirection into an IND_PERM, so that evacuate will
+	// copy the indirection into the old generation instead of
+	// discarding it.
+	if (get_itbl(((StgUpdateFrame *)p)->updatee)->type == IND) {
+	    ((StgUpdateFrame *)p)->updatee->header.info =
+		(StgInfoTable *)&stg_IND_PERM_info;
+	}
+	((StgUpdateFrame *)p)->updatee
+	    = evacuate(((StgUpdateFrame *)p)->updatee);
+	p += sizeofW(StgUpdateFrame);
+	continue;
+
+      // small bitmap (< 32 entries, or 64 on a 64-bit machine)
+    case CATCH_STM_FRAME:
+    case CATCH_RETRY_FRAME:
+    case ATOMICALLY_FRAME:
+    case STOP_FRAME:
+    case CATCH_FRAME:
+    case RET_SMALL:
+    case RET_VEC_SMALL:
+	bitmap = BITMAP_BITS(info->i.layout.bitmap);
+	size   = BITMAP_SIZE(info->i.layout.bitmap);
+	// NOTE: the payload starts immediately after the info-ptr, we
+	// don't have an StgHeader in the same sense as a heap closure.
+	p++;
+	p = scavenge_small_bitmap(p, size, bitmap);
+
+    follow_srt:
+	// the SRT only needs scavenging in a major collection
+	if (major_gc)
+	    scavenge_srt((StgClosure **)GET_SRT(info), info->i.srt_bitmap);
+	continue;
+
+    case RET_BCO: {
+	StgBCO *bco;
+	nat size;
+
+	// the word after the header is the BCO itself; its bitmap
+	// describes the rest of the frame
+	p++;
+	*p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	bco = (StgBCO *)*p;
+	p++;
+	size = BCO_BITMAP_SIZE(bco);
+	scavenge_large_bitmap(p, BCO_BITMAP(bco), size);
+	p += size;
+	continue;
+    }
+
+      // large bitmap (> 32 entries, or > 64 on a 64-bit machine)
+    case RET_BIG:
+    case RET_VEC_BIG:
+    {
+	nat size;
+
+	size = GET_LARGE_BITMAP(&info->i)->size;
+	p++;
+	scavenge_large_bitmap(p, GET_LARGE_BITMAP(&info->i), size);
+	p += size;
+	// and don't forget to follow the SRT
+	goto follow_srt;
+    }
+
+      // Dynamic bitmap: the mask is stored on the stack, and
+      // there are a number of non-pointers followed by a number
+      // of pointers above the bitmapped area.  (see StgMacros.h,
+      // HEAP_CHK_GEN).
+    case RET_DYN:
+    {
+	StgWord dyn;
+	dyn = ((StgRetDyn *)p)->liveness;
+
+	// traverse the bitmap first
+	bitmap = RET_DYN_LIVENESS(dyn);
+	p      = (P_)&((StgRetDyn *)p)->payload[0];
+	size   = RET_DYN_BITMAP_SIZE;
+	p = scavenge_small_bitmap(p, size, bitmap);
+
+	// skip over the non-ptr words
+	p += RET_DYN_NONPTRS(dyn) + RET_DYN_NONPTR_REGS_SIZE;
+
+	// follow the ptr words
+	for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
+	    *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+	    p++;
+	}
+	continue;
+    }
+
+    case RET_FUN:
+    {
+	StgRetFun *ret_fun = (StgRetFun *)p;
+	StgFunInfoTable *fun_info;
+
+	ret_fun->fun = evacuate(ret_fun->fun);
+	// the function's own info table describes the argument layout
+	fun_info = get_fun_itbl(ret_fun->fun);
+	p = scavenge_arg_block(fun_info, ret_fun->payload);
+	goto follow_srt;
+    }
+
+    default:
+	barf("scavenge_stack: weird activation record found on stack: %d", (int)(info->i.type));
+    }
+  }
+}
+
+/*-----------------------------------------------------------------------------
+ scavenge the large object list.
+
+ evac_gen set by caller; similar games played with evac_gen as with
+ scavenge() - see comment at the top of scavenge(). Most large
+ objects are (repeatedly) mutable, so most of the time evac_gen will
+ be zero.
+ --------------------------------------------------------------------------- */
+
+// Scavenge the step's new_large_objects list.  The list is treated as
+// a stack (evacuate pushes on the front), so each object is unlinked
+// before being scavenged.
+static void
+scavenge_large(step *stp)
+{
+  bdescr *bd;
+  StgPtr p;
+
+  bd = stp->new_large_objects;
+
+  // re-read new_large_objects each iteration: scavenge_one may push more
+  for (; bd != NULL; bd = stp->new_large_objects) {
+
+    /* take this object *off* the large objects list and put it on
+     * the scavenged large objects list.  This is so that we can
+     * treat new_large_objects as a stack and push new objects on
+     * the front when evacuating.
+     */
+    stp->new_large_objects = bd->link;
+    dbl_link_onto(bd, &stp->scavenged_large_objects);
+
+    // update the block count in this step.
+    stp->n_scavenged_large_blocks += bd->blocks;
+
+    p = bd->start;
+    if (scavenge_one(p)) {
+	// still points into a younger generation: keep it on the
+	// mutable list of its own generation
+	if (stp->gen_no > 0) {
+	    recordMutableGen((StgClosure *)p, stp->gen);
+	}
+    }
+  }
+}
+
+/* -----------------------------------------------------------------------------
+ Initialising the static object & mutable lists
+ -------------------------------------------------------------------------- */
+
+static void
+zero_static_object_list(StgClosure* first_static)
+{
+ StgClosure* p;
+ StgClosure* link;
+ const StgInfoTable *info;
+
+ for (p = first_static; p != END_OF_STATIC_LIST; p = link) {
+ info = get_itbl(p);
+ link = *STATIC_LINK(info, p);
+ *STATIC_LINK(info,p) = NULL;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Reverting CAFs
+ -------------------------------------------------------------------------- */
+
+void
+revertCAFs( void )
+{
+ StgIndStatic *c;
+
+ for (c = (StgIndStatic *)revertible_caf_list; c != NULL;
+ c = (StgIndStatic *)c->static_link)
+ {
+ SET_INFO(c, c->saved_info);
+ c->saved_info = NULL;
+ // could, but not necessary: c->static_link = NULL;
+ }
+ revertible_caf_list = NULL;
+}
+
+void
+markCAFs( evac_fn evac )
+{
+ StgIndStatic *c;
+
+ for (c = (StgIndStatic *)caf_list; c != NULL;
+ c = (StgIndStatic *)c->static_link)
+ {
+ evac(&c->indirectee);
+ }
+ for (c = (StgIndStatic *)revertible_caf_list; c != NULL;
+ c = (StgIndStatic *)c->static_link)
+ {
+ evac(&c->indirectee);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Sanity code for CAF garbage collection.
+
+ With DEBUG turned on, we manage a CAF list in addition to the SRT
+ mechanism. After GC, we run down the CAF list and blackhole any
+ CAFs which have been garbage collected. This means we get an error
+ whenever the program tries to enter a garbage collected CAF.
+
+ Any garbage collected CAFs are taken off the CAF list at the same
+ time.
+ -------------------------------------------------------------------------- */
+
#if 0 && defined(DEBUG)

// Debug-only CAF sweep (currently compiled out by the `#if 0`).
// Walks the CAF list after GC: any CAF whose static link was not set
// by the collector (i.e. it was not reached) is blackholed and removed
// from the list, so entering a dead CAF produces an immediate error
// instead of silent corruption.
static void
gcCAFs(void)
{
  StgClosure*  p;
  StgClosure** pp;
  const StgInfoTable *info;
  nat i;            // count of CAFs still live (only reported if the
                    // debugBelch below is re-enabled)

  i = 0;
  p = caf_list;
  pp = &caf_list;   // pp always points at the link field referring to p

  while (p != NULL) {

      info = get_itbl(p);

      ASSERT(info->type == IND_STATIC);

      if (STATIC_LINK(info,p) == NULL) {
          // not reached by this GC: garbage
          IF_DEBUG(gccafs, debugBelch("CAF gc'd at 0x%04lx", (long)p));
          // black hole it
          SET_INFO(p,&stg_BLACKHOLE_info);
          // unlink it from the CAF list
          p = STATIC_LINK2(info,p);
          *pp = p;
      }
      else {
          // still live: advance both cursors
          pp = &STATIC_LINK2(info,p);
          p = *pp;
          i++;
      }

  }

  //  debugBelch("%d CAFs live", i);
}
#endif
+
+
+/* -----------------------------------------------------------------------------
+ * Stack squeezing
+ *
+ * Code largely pinched from old RTS, then hacked to bits. We also do
+ * lazy black holing here.
+ *
+ * -------------------------------------------------------------------------- */
+
// A gap record written into squeezed-out stack space: the size of the
// gap (in words) and a pointer to the next gap further down the stack.
struct stack_gap { StgWord gap_size; struct stack_gap *next_gap; };

// Squeeze out chains of adjacent update frames between tso->sp and
// `bottom`, replacing each run with a single update frame plus a
// stack_gap record, then slide the live stack data down to close the
// gaps. NB: the two stages below are order-sensitive; the gap chain
// built in stage 1 is consumed top-down in stage 2.
static void
stackSqueeze(StgTSO *tso, StgPtr bottom)
{
    StgPtr frame;
    rtsBool prev_was_update_frame;
    StgClosure *updatee = NULL;
    StgRetInfoTable *info;
    StgWord current_gap_size;
    struct stack_gap *gap;

    // Stage 1:
    //    Traverse the stack upwards, replacing adjacent update frames
    //    with a single update frame and a "stack gap".  A stack gap
    //    contains two values: the size of the gap, and the distance
    //    to the next gap (or the stack top).

    frame = tso->sp;

    ASSERT(frame < bottom);

    prev_was_update_frame = rtsFalse;
    current_gap_size = 0;
    // Sentinel "gap" placed just above the stack top, so that in stage 2
    // (gap + sizeof(StgUpdateFrame)) == tso->sp for the first chunk.
    gap = (struct stack_gap *) (tso->sp - sizeofW(StgUpdateFrame));

    while (frame < bottom) {

        info = get_ret_itbl((StgClosure *)frame);
        switch (info->i.type) {

        case UPDATE_FRAME:
        {
            StgUpdateFrame *upd = (StgUpdateFrame *)frame;

            if (prev_was_update_frame) {

                TICK_UPD_SQUEEZED();
                /* wasn't there something about update squeezing and ticky to be
                 * sorted out?  oh yes: we aren't counting each enter properly
                 * in this case.  See the log somewhere.  KSW 1999-04-21
                 *
                 * Check two things: that the two update frames don't point to
                 * the same object, and that the updatee_bypass isn't already an
                 * indirection.  Both of these cases only happen when we're in a
                 * block hole-style loop (and there are multiple update frames
                 * on the stack pointing to the same closure), but they can both
                 * screw us up if we don't check.
                 */
                if (upd->updatee != updatee && !closure_IND(upd->updatee)) {
                    UPD_IND_NOLOCK(upd->updatee, updatee);
                }

                // now mark this update frame as a stack gap.  The gap
                // marker resides in the bottom-most update frame of
                // the series of adjacent frames, and covers all the
                // frames in this series.
                current_gap_size += sizeofW(StgUpdateFrame);
                ((struct stack_gap *)frame)->gap_size = current_gap_size;
                ((struct stack_gap *)frame)->next_gap = gap;

                frame += sizeofW(StgUpdateFrame);
                continue;
            }

            // single update frame, or the topmost update frame in a series
            else {
                prev_was_update_frame = rtsTrue;
                updatee = upd->updatee;
                frame += sizeofW(StgUpdateFrame);
                continue;
            }
        }

        default:
            prev_was_update_frame = rtsFalse;

            // we're not in a gap... check whether this is the end of a gap
            // (an update frame can't be the end of a gap).
            if (current_gap_size != 0) {
                gap = (struct stack_gap *) (frame - sizeofW(StgUpdateFrame));
            }
            current_gap_size = 0;

            frame += stack_frame_sizeW((StgClosure *)frame);
            continue;
        }
    }

    // handle a gap that runs right up to `bottom`
    if (current_gap_size != 0) {
        gap = (struct stack_gap *) (frame - sizeofW(StgUpdateFrame));
    }

    // Now we have a stack with gaps in it, and we have to walk down
    // shoving the stack up to fill in the gaps.  A diagram might
    // help:
    //
    //    +| ********* |
    //     | ********* | <- sp
    //     |           |
    //     |           | <- gap_start
    //     | ......... |                |
    //     | stack_gap | <- gap         | chunk_size
    //     | ......... |                |
    //     | ......... | <- gap_end     v
    //     | ********* |
    //     | ********* |
    //     | ********* |
    //    -| ********* |
    //
    // 'sp'  points the the current top-of-stack
    // 'gap' points to the stack_gap structure inside the gap
    // *****   indicates real stack data
    // .....   indicates gap
    // <empty> indicates unused
    //
    {
        void *sp;
        void *gap_start, *next_gap_start, *gap_end;
        nat chunk_size;

        next_gap_start = (void *)((unsigned char*)gap + sizeof(StgUpdateFrame));
        sp = next_gap_start;

        // walk the gap chain from the top of the stack downwards,
        // memmove-ing each live chunk up against the one above it
        while ((StgPtr)gap > tso->sp) {

            // we're working in *bytes* now...
            gap_start = next_gap_start;
            gap_end = (void*) ((unsigned char*)gap_start - gap->gap_size * sizeof(W_));

            gap = gap->next_gap;
            next_gap_start = (void *)((unsigned char*)gap + sizeof(StgUpdateFrame));

            chunk_size = (unsigned char*)gap_end - (unsigned char*)next_gap_start;
            sp -= chunk_size;
            memmove(sp, next_gap_start, chunk_size);
        }

        tso->sp = (StgPtr)sp;
    }
}
+
+/* -----------------------------------------------------------------------------
+ * Pausing a thread
+ *
+ * We have to prepare for GC - this means doing lazy black holing
+ * here. We also take the opportunity to do stack squeezing if it's
+ * turned on.
+ * -------------------------------------------------------------------------- */
+void
+threadPaused(Capability *cap, StgTSO *tso)
+{
+ StgClosure *frame;
+ StgRetInfoTable *info;
+ StgClosure *bh;
+ StgPtr stack_end;
+ nat words_to_squeeze = 0;
+ nat weight = 0;
+ nat weight_pending = 0;
+ rtsBool prev_was_update_frame;
+
+ stack_end = &tso->stack[tso->stack_size];
+
+ frame = (StgClosure *)tso->sp;
+
+ while (1) {
+ // If we've already marked this frame, then stop here.
+ if (frame->header.info == (StgInfoTable *)&stg_marked_upd_frame_info) {
+ goto end;
+ }
+
+ info = get_ret_itbl(frame);
+
+ switch (info->i.type) {
+
+ case UPDATE_FRAME:
+
+ SET_INFO(frame, (StgInfoTable *)&stg_marked_upd_frame_info);
+
+ bh = ((StgUpdateFrame *)frame)->updatee;
+
+ if (closure_IND(bh) || bh->header.info == &stg_BLACKHOLE_info) {
+ IF_DEBUG(squeeze, debugBelch("suspending duplicate work: %ld words of stack\n", (StgPtr)frame - tso->sp));
+
+ // If this closure is already an indirection, then
+ // suspend the computation up to this point:
+ suspendComputation(cap,tso,(StgPtr)frame);
+
+ // Now drop the update frame, and arrange to return
+ // the value to the frame underneath:
+ tso->sp = (StgPtr)frame + sizeofW(StgUpdateFrame) - 2;
+ tso->sp[1] = (StgWord)bh;
+ tso->sp[0] = (W_)&stg_enter_info;
+
+ // And continue with threadPaused; there might be
+ // yet more computation to suspend.
+ threadPaused(cap,tso);
+ return;
+ }
+
+ if (bh->header.info != &stg_CAF_BLACKHOLE_info) {
+#if (!defined(LAZY_BLACKHOLING)) && defined(DEBUG)
+ debugBelch("Unexpected lazy BHing required at 0x%04lx\n",(long)bh);
+#endif
+ // zero out the slop so that the sanity checker can tell
+ // where the next closure is.
+ DEBUG_FILL_SLOP(bh);
+#ifdef PROFILING
+ // @LDV profiling
+ // We pretend that bh is now dead.
+ LDV_recordDead_FILL_SLOP_DYNAMIC((StgClosure *)bh);
+#endif
+ SET_INFO(bh,&stg_BLACKHOLE_info);
+
+ // We pretend that bh has just been created.
+ LDV_RECORD_CREATE(bh);
+ }
+
+ frame = (StgClosure *) ((StgUpdateFrame *)frame + 1);
+ if (prev_was_update_frame) {
+ words_to_squeeze += sizeofW(StgUpdateFrame);
+ weight += weight_pending;
+ weight_pending = 0;
+ }
+ prev_was_update_frame = rtsTrue;
+ break;
+
+ case STOP_FRAME:
+ goto end;
+
+ // normal stack frames; do nothing except advance the pointer
+ default:
+ {
+ nat frame_size = stack_frame_sizeW(frame);
+ weight_pending += frame_size;
+ frame = (StgClosure *)((StgPtr)frame + frame_size);
+ prev_was_update_frame = rtsFalse;
+ }
+ }
+ }
+
+end:
+ IF_DEBUG(squeeze,
+ debugBelch("words_to_squeeze: %d, weight: %d, squeeze: %s\n",
+ words_to_squeeze, weight,
+ weight < words_to_squeeze ? "YES" : "NO"));
+
+ // Should we squeeze or not? Arbitrary heuristic: we squeeze if
+ // the number of words we have to shift down is less than the
+ // number of stack words we squeeze away by doing so.
+ if (RtsFlags.GcFlags.squeezeUpdFrames == rtsTrue &&
+ weight < words_to_squeeze) {
+ stackSqueeze(tso, (StgPtr)frame);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Debugging
+ * -------------------------------------------------------------------------- */
+
#if DEBUG
// Dump every entry on a generation's mutable list to the debug output.
void
printMutableList(generation *gen)
{
    bdescr *blk;
    StgPtr entry;

    debugBelch("@@ Mutable list %p: ", gen->mut_list);

    // each block holds a dense array of closure pointers in
    // [blk->start, blk->free)
    for (blk = gen->mut_list; blk != NULL; blk = blk->link) {
        for (entry = blk->start; entry < blk->free; entry++) {
            debugBelch("%p (%s), ", (void *)*entry, info_type((StgClosure *)*entry));
        }
    }
    debugBelch("\n");
}
#endif /* DEBUG */
diff --git a/rts/GCCompact.c b/rts/GCCompact.c
new file mode 100644
index 0000000000..4dfe84bbe0
--- /dev/null
+++ b/rts/GCCompact.c
@@ -0,0 +1,949 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2001
+ *
+ * Compacting garbage collector
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "OSThreads.h"
+#include "Storage.h"
+#include "BlockAlloc.h"
+#include "MBlock.h"
+#include "GCCompact.h"
+#include "Schedule.h"
+#include "Apply.h"
+
+// Turn off inlining when debugging - it obfuscates things
+#ifdef DEBUG
+# undef STATIC_INLINE
+# define STATIC_INLINE static
+#endif
+
+/* -----------------------------------------------------------------------------
+ Threading / unthreading pointers.
+
+ The basic idea here is to chain together all the fields pointing at
+ a particular object, with the root of the chain in the object's
+ info table field. The original contents of the info pointer goes
+ at the end of the chain.
+
+ Adding a new field to the chain is a matter of swapping the
+ contents of the field with the contents of the object's info table
+ field.
+
+ To unthread the chain, we walk down it updating all the fields on
+ the chain with the new location of the object. We stop when we
+ reach the info pointer at the end.
+
+ We use a trick to identify the info pointer: when swapping pointers
+ for threading, we set the low bit of the original pointer, with the
+ result that all the pointers in the chain have their low bits set
+ except for the info pointer.
+ -------------------------------------------------------------------------- */
+
// Thread one pointer field: if *p points into the compacted area of
// the heap, splice the field `p` onto the chain rooted at the target
// object's info-table word, tagging the chain entry by setting its
// low bit (see the "Threading / unthreading pointers" comment above).
// Fields pointing outside the compacted area are left untouched.
STATIC_INLINE void
thread( StgPtr p )
{
    StgPtr q = (StgPtr)*p;
    bdescr *bd;

    // It doesn't look like a closure at the moment, because the info
    // ptr is possibly threaded:
    // ASSERT(LOOKS_LIKE_CLOSURE_PTR(q));

    if (HEAP_ALLOCED(q)) {
        bd = Bdescr(q);
        // a handy way to discover whether the ptr is into the
        // compacted area of the old gen, is that the EVACUATED flag
        // is zero (it's non-zero for all the other areas of live
        // memory).
        if ((bd->flags & BF_EVACUATED) == 0) {
            // swap: field takes the old head of the chain, and the
            // object's first word takes the (tagged) field address
            *p = (StgWord)*q;
            *q = (StgWord)p + 1;        // set the low bit
        }
    }
}
+
+STATIC_INLINE void
+unthread( StgPtr p, StgPtr free )
+{
+ StgWord q = *p, r;
+
+ while ((q & 1) != 0) {
+ q -= 1; // unset the low bit again
+ r = *((StgPtr)q);
+ *((StgPtr)q) = (StgWord)free;
+ q = r;
+ }
+ *p = q;
+}
+
// Retrieve the real info table of a possibly-threaded closure: follow
// the chain of low-bit-tagged field pointers (without modifying it)
// until the untagged info pointer at the end is reached.
STATIC_INLINE StgInfoTable *
get_threaded_info( StgPtr p )
{
    StgPtr q = (P_)GET_INFO((StgClosure *)p);

    // tagged values are chain links; keep following
    while (((StgWord)q & 1) != 0) {
        q = (P_)*((StgPtr)((StgWord)q-1));
    }

    ASSERT(LOOKS_LIKE_INFO_PTR(q));
    return INFO_PTR_TO_STRUCT((StgInfoTable *)q);
}
+
+// A word-aligned memmove will be faster for small objects than libc's or gcc's.
+// Remember, the two regions *might* overlap, but: to <= from.
+STATIC_INLINE void
+move(StgPtr to, StgPtr from, nat size)
+{
+ for(; size > 0; --size) {
+ *to++ = *from++;
+ }
+}
+
// Thread the pointer fields of every object on the static-object list
// `p`, following the per-closure-type static link field to reach the
// next object. Only IND_STATIC closures carry a heap pointer
// (indirectee) that needs threading; the other static types are
// traversed purely for their list links.
static void
thread_static( StgClosure* p )
{
  const StgInfoTable *info;

  // keep going until we've threaded all the objects on the linked
  // list...
  while (p != END_OF_STATIC_LIST) {

    info = get_itbl(p);
    switch (info->type) {

    case IND_STATIC:
	thread((StgPtr)&((StgInd *)p)->indirectee);
	p = *IND_STATIC_LINK(p);
	continue;

    case THUNK_STATIC:
	p = *THUNK_STATIC_LINK(p);
	continue;
    case FUN_STATIC:
	p = *FUN_STATIC_LINK(p);
	continue;
    case CONSTR_STATIC:
	p = *STATIC_LINK(info,p);
	continue;

    default:
	barf("thread_static: strange closure %d", (int)(info->type));
    }

  }
}
+
// Thread `size` words starting at p, driven by a large (multi-word)
// bitmap: a clear bit selects a word for threading (the convention
// throughout this file is that 0 = pointer, 1 = non-pointer).
STATIC_INLINE void
thread_large_bitmap( StgPtr p, StgLargeBitmap *large_bitmap, nat size )
{
    nat i, b;
    StgWord bitmap;

    b = 0;
    bitmap = large_bitmap->bitmap[b];
    for (i = 0; i < size; ) {
	if ((bitmap & 1) == 0) {
	    thread(p);
	}
	i++;
	p++;
	// move to the next bitmap word at each word boundary,
	// otherwise just shift to the next bit
	if (i % BITS_IN(W_) == 0) {
	    b++;
	    bitmap = large_bitmap->bitmap[b];
	} else {
	    bitmap = bitmap >> 1;
	}
    }
}
+
// Thread the argument block of a function application, using the
// function's info table to find the argument layout: either a small
// inline bitmap (ARG_GEN or one of the standard patterns) or a large
// bitmap (ARG_GEN_BIG). Returns a pointer just past the arguments.
// NOTE: control flow deliberately jumps from the ARG_GEN and default
// cases into the shared small_bitmap loop.
STATIC_INLINE StgPtr
thread_arg_block (StgFunInfoTable *fun_info, StgClosure **args)
{
    StgPtr p;
    StgWord bitmap;
    nat size;

    p = (StgPtr)args;
    switch (fun_info->f.fun_type) {
    case ARG_GEN:
	bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
	size = BITMAP_SIZE(fun_info->f.b.bitmap);
	goto small_bitmap;
    case ARG_GEN_BIG:
	size = GET_FUN_LARGE_BITMAP(fun_info)->size;
	thread_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info), size);
	p += size;
	break;
    default:
	// standard argument pattern: look the bitmap up in the table
	bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
	size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
    small_bitmap:
	// a clear bit marks a pointer word
	while (size > 0) {
	    if ((bitmap & 1) == 0) {
		thread(p);
	    }
	    p++;
	    bitmap = bitmap >> 1;
	    size--;
	}
	break;
    }
    return p;
}
+
// Thread all pointer fields in the stack area [p, stack_end): walk the
// activation records one at a time, dispatching on the frame type to
// find each frame's pointer layout (small bitmap, large bitmap,
// dynamic bitmap, BCO, or RET_FUN argument block).
static void
thread_stack(StgPtr p, StgPtr stack_end)
{
    const StgRetInfoTable* info;
    StgWord bitmap;
    nat size;

    // highly similar to scavenge_stack, but we do pointer threading here.

    while (p < stack_end) {

	// *p must be the info pointer of an activation
	// record.  All activation records have 'bitmap' style layout
	// info.
	//
	info  = get_ret_itbl((StgClosure *)p);

	switch (info->i.type) {

	    // Dynamic bitmap: the mask is stored on the stack
	case RET_DYN:
	{
	    StgWord dyn;
	    dyn = ((StgRetDyn *)p)->liveness;

	    // traverse the bitmap first
	    bitmap = RET_DYN_LIVENESS(dyn);
	    p      = (P_)&((StgRetDyn *)p)->payload[0];
	    size   = RET_DYN_BITMAP_SIZE;
	    while (size > 0) {
		if ((bitmap & 1) == 0) {
		    thread(p);
		}
		p++;
		bitmap = bitmap >> 1;
		size--;
	    }

	    // skip over the non-ptr words
	    p += RET_DYN_NONPTRS(dyn) + RET_DYN_NONPTR_REGS_SIZE;

	    // follow the ptr words
	    for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
		thread(p);
		p++;
	    }
	    continue;
	}

	    // small bitmap (<= 32 entries, or 64 on a 64-bit machine)
	case CATCH_RETRY_FRAME:
	case CATCH_STM_FRAME:
	case ATOMICALLY_FRAME:
	case UPDATE_FRAME:
	case STOP_FRAME:
	case CATCH_FRAME:
	case RET_SMALL:
	case RET_VEC_SMALL:
	    bitmap = BITMAP_BITS(info->i.layout.bitmap);
	    size   = BITMAP_SIZE(info->i.layout.bitmap);
	    p++;
	    // NOTE: the payload starts immediately after the info-ptr, we
	    // don't have an StgHeader in the same sense as a heap closure.
	    while (size > 0) {
		if ((bitmap & 1) == 0) {
		    thread(p);
		}
		p++;
		bitmap = bitmap >> 1;
		size--;
	    }
	    continue;

	case RET_BCO: {
	    StgBCO *bco;
	    nat size;

	    p++;
	    bco = (StgBCO *)*p;
	    // thread the BCO pointer itself, then its payload using the
	    // BCO's own bitmap
	    thread(p);
	    p++;
	    size = BCO_BITMAP_SIZE(bco);
	    thread_large_bitmap(p, BCO_BITMAP(bco), size);
	    p += size;
	    continue;
	}

	    // large bitmap (> 32 entries, or 64 on a 64-bit machine)
	case RET_BIG:
	case RET_VEC_BIG:
	    p++;
	    size = GET_LARGE_BITMAP(&info->i)->size;
	    thread_large_bitmap(p, GET_LARGE_BITMAP(&info->i), size);
	    p += size;
	    continue;

	case RET_FUN:
	{
	    StgRetFun *ret_fun = (StgRetFun *)p;
	    StgFunInfoTable *fun_info;

	    // must fetch the function's info table *before* threading
	    // the fun field, since threading overwrites the pointer
	    fun_info = itbl_to_fun_itbl(
		get_threaded_info((StgPtr)ret_fun->fun));
	         // *before* threading it!
	    thread((StgPtr)&ret_fun->fun);
	    p = thread_arg_block(fun_info, ret_fun->payload);
	    continue;
	}

	default:
	    barf("thread_stack: weird activation record found on stack: %d",
		 (int)(info->i.type));
	}
    }
}
+
// Thread the payload of a PAP/AP: `size` words starting at `payload`,
// with the pointer layout determined by the function's info table
// (small bitmap, large bitmap, or — for a BCO — the BCO's bitmap).
// Returns a pointer just past the payload. `fun` may itself already be
// threaded, hence the use of get_threaded_info.
STATIC_INLINE StgPtr
thread_PAP_payload (StgClosure *fun, StgClosure **payload, StgWord size)
{
    StgPtr p;
    StgWord bitmap;
    StgFunInfoTable *fun_info;

    fun_info = itbl_to_fun_itbl(get_threaded_info((StgPtr)fun));
    ASSERT(fun_info->i.type != PAP);

    p = (StgPtr)payload;

    switch (fun_info->f.fun_type) {
    case ARG_GEN:
	bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
	goto small_bitmap;
    case ARG_GEN_BIG:
	thread_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info), size);
	p += size;
	break;
    case ARG_BCO:
	thread_large_bitmap((StgPtr)payload, BCO_BITMAP(fun), size);
	p += size;
	break;
    default:
	bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
    small_bitmap:
	// a clear bit marks a pointer word
	while (size > 0) {
	    if ((bitmap & 1) == 0) {
		thread(p);
	    }
	    p++;
	    bitmap = bitmap >> 1;
	    size--;
	}
	break;
    }

    return p;
}
+
+STATIC_INLINE StgPtr
+thread_PAP (StgPAP *pap)
+{
+ StgPtr p;
+ p = thread_PAP_payload(pap->fun, pap->payload, pap->n_args);
+ thread((StgPtr)&pap->fun);
+ return p;
+}
+
+STATIC_INLINE StgPtr
+thread_AP (StgAP *ap)
+{
+ StgPtr p;
+ p = thread_PAP_payload(ap->fun, ap->payload, ap->n_args);
+ thread((StgPtr)&ap->fun);
+ return p;
+}
+
+STATIC_INLINE StgPtr
+thread_AP_STACK (StgAP_STACK *ap)
+{
+ thread((StgPtr)&ap->fun);
+ thread_stack((P_)ap->payload, (P_)ap->payload + ap->size);
+ return (P_)ap + sizeofW(StgAP_STACK) + ap->size;
+}
+
// Thread all pointer fields of a TSO: its scheduler links, its
// block_info.closure (only meaningful for the blocked-on-closure
// states listed below), blocked exceptions, STM transaction record,
// and finally every frame on its stack. Returns a pointer just past
// the TSO.
static StgPtr
thread_TSO (StgTSO *tso)
{
    thread((StgPtr)&tso->link);
    thread((StgPtr)&tso->global_link);

    // block_info.closure is only a closure pointer for these states
    if (   tso->why_blocked == BlockedOnMVar
	|| tso->why_blocked == BlockedOnBlackHole
	|| tso->why_blocked == BlockedOnException
#if defined(PAR)
	|| tso->why_blocked == BlockedOnGA
	|| tso->why_blocked == BlockedOnGA_NoSend
#endif
	) {
	thread((StgPtr)&tso->block_info.closure);
    }
    if ( tso->blocked_exceptions != NULL ) {
	thread((StgPtr)&tso->blocked_exceptions);
    }

    thread((StgPtr)&tso->trec);

    thread_stack(tso->sp, &(tso->stack[tso->stack_size]));
    return (StgPtr)tso + tso_sizeW(tso);
}
+
+
// Thread the pointer fields of every object on a chain of large-object
// blocks (one object per block chain entry, starting at bd->start).
// Only the object types that can legitimately be large are handled;
// anything else is a bug.
static void
update_fwd_large( bdescr *bd )
{
  StgPtr p;
  const StgInfoTable* info;

  for (; bd != NULL; bd = bd->link) {

    p = bd->start;
    info  = get_itbl((StgClosure *)p);

    switch (info->type) {

    case ARR_WORDS:
      // nothing to follow
      continue;

    case MUT_ARR_PTRS_CLEAN:
    case MUT_ARR_PTRS_DIRTY:
    case MUT_ARR_PTRS_FROZEN:
    case MUT_ARR_PTRS_FROZEN0:
      // follow everything
      {
	StgPtr next;

	next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
	for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
	    thread(p);
	}
	continue;
      }

    case TSO:
	thread_TSO((StgTSO *)p);
	continue;

    case AP_STACK:
	thread_AP_STACK((StgAP_STACK *)p);
	continue;

    case PAP:
	thread_PAP((StgPAP *)p);
	continue;

    case TREC_CHUNK:
    {
        // thread the previous-chunk link and every entry in use
        StgWord i;
        StgTRecChunk *tc = (StgTRecChunk *)p;
	TRecEntry *e = &(tc -> entries[0]);
	thread((StgPtr)&tc->prev_chunk);
	for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
	  thread((StgPtr)&e->tvar);
	  thread((StgPtr)&e->expected_value);
	  thread((StgPtr)&e->new_value);
	}
	continue;
    }

    default:
      barf("update_fwd_large: unknown/strange object  %d", (int)(info->type));
    }
  }
}
+
// Thread all pointer fields of one heap object at p, given its
// (already unthreaded) info table, and return a pointer just past the
// object. This is the per-object dispatch used by the forward pass of
// the compacting collector; cases are grouped by closure layout.
STATIC_INLINE StgPtr
thread_obj (StgInfoTable *info, StgPtr p)
{
    switch (info->type) {
    case THUNK_0_1:
	return p + sizeofW(StgThunk) + 1;

    case FUN_0_1:
    case CONSTR_0_1:
	return p + sizeofW(StgHeader) + 1;

    case FUN_1_0:
    case CONSTR_1_0:
	thread((StgPtr)&((StgClosure *)p)->payload[0]);
	return p + sizeofW(StgHeader) + 1;

    case THUNK_1_0:
	thread((StgPtr)&((StgThunk *)p)->payload[0]);
	return p + sizeofW(StgThunk) + 1;

    case THUNK_0_2:
	return p + sizeofW(StgThunk) + 2;

    case FUN_0_2:
    case CONSTR_0_2:
	return p + sizeofW(StgHeader) + 2;

    case THUNK_1_1:
	thread((StgPtr)&((StgThunk *)p)->payload[0]);
	return p + sizeofW(StgThunk) + 2;

    case FUN_1_1:
    case CONSTR_1_1:
	thread((StgPtr)&((StgClosure *)p)->payload[0]);
	return p + sizeofW(StgHeader) + 2;

    case THUNK_2_0:
	thread((StgPtr)&((StgThunk *)p)->payload[0]);
	thread((StgPtr)&((StgThunk *)p)->payload[1]);
	return p + sizeofW(StgThunk) + 2;

    case FUN_2_0:
    case CONSTR_2_0:
	thread((StgPtr)&((StgClosure *)p)->payload[0]);
	thread((StgPtr)&((StgClosure *)p)->payload[1]);
	return p + sizeofW(StgHeader) + 2;

    case BCO: {
	// a BCO has four pointer fields, threaded individually
	StgBCO *bco = (StgBCO *)p;
	thread((StgPtr)&bco->instrs);
	thread((StgPtr)&bco->literals);
	thread((StgPtr)&bco->ptrs);
	thread((StgPtr)&bco->itbls);
	return p + bco_sizeW(bco);
    }

    case THUNK:
    {
	// generic thunk layout: ptrs pointers followed by nptrs
	// non-pointers
	StgPtr end;

	end = (P_)((StgThunk *)p)->payload +
	    info->layout.payload.ptrs;
	for (p = (P_)((StgThunk *)p)->payload; p < end; p++) {
	    thread(p);
	}
	return p + info->layout.payload.nptrs;
    }

    case FUN:
    case CONSTR:
    case STABLE_NAME:
    case IND_PERM:
    case MUT_VAR_CLEAN:
    case MUT_VAR_DIRTY:
    case CAF_BLACKHOLE:
    case SE_CAF_BLACKHOLE:
    case SE_BLACKHOLE:
    case BLACKHOLE:
    {
	// generic closure layout: ptrs pointers followed by nptrs
	// non-pointers
	StgPtr end;

	end = (P_)((StgClosure *)p)->payload +
	    info->layout.payload.ptrs;
	for (p = (P_)((StgClosure *)p)->payload; p < end; p++) {
	    thread(p);
	}
	return p + info->layout.payload.nptrs;
    }

    case WEAK:
    {
	StgWeak *w = (StgWeak *)p;
	thread((StgPtr)&w->key);
	thread((StgPtr)&w->value);
	thread((StgPtr)&w->finalizer);
	if (w->link != NULL) {
	    thread((StgPtr)&w->link);
	}
	return p + sizeofW(StgWeak);
    }

    case MVAR:
    {
	StgMVar *mvar = (StgMVar *)p;
	thread((StgPtr)&mvar->head);
	thread((StgPtr)&mvar->tail);
	thread((StgPtr)&mvar->value);
	return p + sizeofW(StgMVar);
    }

    case IND_OLDGEN:
    case IND_OLDGEN_PERM:
	thread((StgPtr)&((StgInd *)p)->indirectee);
	return p + sizeofW(StgInd);

    case THUNK_SELECTOR:
    {
	StgSelector *s = (StgSelector *)p;
	thread((StgPtr)&s->selectee);
	return p + THUNK_SELECTOR_sizeW();
    }

    case AP_STACK:
	return thread_AP_STACK((StgAP_STACK *)p);

    case PAP:
	return thread_PAP((StgPAP *)p);

    case AP:
	return thread_AP((StgAP *)p);

    case ARR_WORDS:
	// no pointers, just skip
	return p + arr_words_sizeW((StgArrWords *)p);

    case MUT_ARR_PTRS_CLEAN:
    case MUT_ARR_PTRS_DIRTY:
    case MUT_ARR_PTRS_FROZEN:
    case MUT_ARR_PTRS_FROZEN0:
	// follow everything
    {
	StgPtr next;

	next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
	for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
	    thread(p);
	}
	return p;
    }

    case TSO:
	return thread_TSO((StgTSO *)p);

    case TVAR_WAIT_QUEUE:
    {
	StgTVarWaitQueue *wq = (StgTVarWaitQueue *)p;
	thread((StgPtr)&wq->waiting_tso);
	thread((StgPtr)&wq->next_queue_entry);
	thread((StgPtr)&wq->prev_queue_entry);
	return p + sizeofW(StgTVarWaitQueue);
    }

    case TVAR:
    {
	StgTVar *tvar = (StgTVar *)p;
	thread((StgPtr)&tvar->current_value);
	thread((StgPtr)&tvar->first_wait_queue_entry);
	return p + sizeofW(StgTVar);
    }

    case TREC_HEADER:
    {
        StgTRecHeader *trec = (StgTRecHeader *)p;
	thread((StgPtr)&trec->enclosing_trec);
	thread((StgPtr)&trec->current_chunk);
	return p + sizeofW(StgTRecHeader);
    }

    case TREC_CHUNK:
    {
        // thread the previous-chunk link and every entry in use
        StgWord i;
        StgTRecChunk *tc = (StgTRecChunk *)p;
	TRecEntry *e = &(tc -> entries[0]);
	thread((StgPtr)&tc->prev_chunk);
	for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
	  thread((StgPtr)&e->tvar);
	  thread((StgPtr)&e->expected_value);
	  thread((StgPtr)&e->new_value);
	}
	return p + sizeofW(StgTRecChunk);
    }

    default:
	barf("update_fwd: unknown/strange object  %d", (int)(info->type));
	return NULL;    // not reached: barf does not return
    }
}
+
+static void
+update_fwd( bdescr *blocks )
+{
+ StgPtr p;
+ bdescr *bd;
+ StgInfoTable *info;
+
+ bd = blocks;
+
+#if defined(PAR)
+ barf("update_fwd: ToDo");
+#endif
+
+ // cycle through all the blocks in the step
+ for (; bd != NULL; bd = bd->link) {
+ p = bd->start;
+
+ // linearly scan the objects in this block
+ while (p < bd->free) {
+ ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+ info = get_itbl((StgClosure *)p);
+ p = thread_obj(info, p);
+ }
+ }
+}
+
// Forward pass over the compacted area: for every marked (live) object
// compute its destination address (`free`, advancing through a
// parallel chain of destination blocks), unthread its chain so all
// fields that pointed at it now hold the new address, and thread its
// own pointer fields for the next pass.
static void
update_fwd_compact( bdescr *blocks )
{
    StgPtr p, q, free;
#if 0
    StgWord m;
#endif
    bdescr *bd, *free_bd;
    StgInfoTable *info;
    nat size;

    bd = blocks;
    free_bd = blocks;
    free = free_bd->start;

#if defined(PAR)
    barf("update_fwd: ToDo");
#endif

    // cycle through all the blocks in the step
    for (; bd != NULL; bd = bd->link) {
	p = bd->start;

	while (p < bd->free ) {

	    // skip unmarked (dead) words to the next live object
	    while ( p < bd->free && !is_marked(p,bd) ) {
		p++;
	    }
	    if (p >= bd->free) {
		break;
	    }

#if 0
    next:
	m = * ((StgPtr)bd->u.bitmap + ((p - bd->start) / (BITS_IN(StgWord))));
	m >>= ((p - bd->start) & (BITS_IN(StgWord) - 1));

	while ( p < bd->free ) {

	    if ((m & 1) == 0) {
		m >>= 1;
		p++;
		if (((StgWord)p & (sizeof(W_) * BITS_IN(StgWord))) == 0) {
		    goto next;
		} else {
		    continue;
		}
	    }
#endif

	    // Problem: we need to know the destination for this cell
	    // in order to unthread its info pointer.  But we can't
	    // know the destination without the size, because we may
	    // spill into the next block.  So we have to run down the
	    // threaded list and get the info ptr first.
	    info = get_threaded_info(p);

	    q = p;          // remember the object's start

	    p = thread_obj(info, p);

	    size = p - q;
	    if (free + size > free_bd->start + BLOCK_SIZE_W) {
		// unset the next bit in the bitmap to indicate that
		// this object needs to be pushed into the next
		// block.  This saves us having to run down the
		// threaded info pointer list twice during the next pass.
		unmark(q+1,bd);
		free_bd = free_bd->link;
		free = free_bd->start;
	    } else {
		ASSERT(is_marked(q+1,bd));
	    }

	    // all fields pointing at q now get the destination address
	    unthread(q,free);
	    free += size;
#if 0
	    goto next;
#endif
	}
    }
}
+
// Backward pass over the compacted area: slide every marked (live)
// object down to its destination (recomputed with the same `free`
// cursor discipline as update_fwd_compact), unthreading any remaining
// field chains first. The q+1 bit cleared by the forward pass signals
// "this object starts a new destination block". Returns the number of
// blocks now occupied; trailing blocks are freed.
static nat
update_bkwd_compact( step *stp )
{
    StgPtr p, free;
#if 0
    StgWord m;
#endif
    bdescr *bd, *free_bd;
    StgInfoTable *info;
    nat size, free_blocks;

    bd = free_bd = stp->old_blocks;
    free = free_bd->start;
    free_blocks = 1;

#if defined(PAR)
    barf("update_bkwd: ToDo");
#endif

    // cycle through all the blocks in the step
    for (; bd != NULL; bd = bd->link) {
	p = bd->start;

	while (p < bd->free ) {

	    // skip unmarked (dead) words to the next live object
	    while ( p < bd->free && !is_marked(p,bd) ) {
		p++;
	    }
	    if (p >= bd->free) {
		break;
	    }

#if 0
    next:
	m = * ((StgPtr)bd->u.bitmap + ((p - bd->start) / (BITS_IN(StgWord))));
	m >>= ((p - bd->start) & (BITS_IN(StgWord) - 1));

	while ( p < bd->free ) {

	    if ((m & 1) == 0) {
		m >>= 1;
		p++;
		if (((StgWord)p & (sizeof(W_) * BITS_IN(StgWord))) == 0) {
		    goto next;
		} else {
		    continue;
		}
	    }
#endif

	    // forward pass cleared this bit iff the object spills into
	    // the next destination block
	    if (!is_marked(p+1,bd)) {
		// don't forget to update the free ptr in the block desc.
		free_bd->free = free;
		free_bd = free_bd->link;
		free = free_bd->start;
		free_blocks++;
	    }

	    unthread(p,free);
	    ASSERT(LOOKS_LIKE_INFO_PTR(((StgClosure *)p)->header.info));
	    info = get_itbl((StgClosure *)p);
	    size = closure_sizeW_((StgClosure *)p,info);

	    if (free != p) {
		move(free,p,size);
	    }

	    // relocate TSOs
	    if (info->type == TSO) {
		move_TSO((StgTSO *)p, (StgTSO *)free);
	    }

	    free += size;
	    p += size;
#if 0
	    goto next;
#endif
	}
    }

    // free the remaining blocks and count what's left.
    free_bd->free = free;
    if (free_bd->link != NULL) {
	freeChain(free_bd->link);
	free_bd->link = NULL;
    }

    return free_blocks;
}
+
// Entry point of the compacting collector. Three phases:
//   1. thread every root (GC roots, weak pointer lists, mutable lists,
//      thread lists, task TSOs, static objects, stable pointers, CAFs);
//   2. forward pass: thread intra-heap pointers and compute/install
//      forwarding addresses (update_fwd / update_fwd_compact);
//   3. backward pass: slide objects into place in the oldest
//      generation and release the freed blocks.
void
compact( void (*get_roots)(evac_fn) )
{
    nat g, s, blocks;
    step *stp;

    // 1. thread the roots
    get_roots((evac_fn)thread);

    // the weak pointer lists...
    if (weak_ptr_list != NULL) {
	thread((StgPtr)(void *)&weak_ptr_list);
    }
    if (old_weak_ptr_list != NULL) {
	thread((StgPtr)(void *)&old_weak_ptr_list); // tmp
    }

    // mutable lists
    for (g = 1; g < RtsFlags.GcFlags.generations; g++) {
	bdescr *bd;
	StgPtr p;
	for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
	    for (p = bd->start; p < bd->free; p++) {
		thread(p);
	    }
	}
    }

    // the global thread list
    thread((StgPtr)(void *)&all_threads);

    // any threads resurrected during this GC
    thread((StgPtr)(void *)&resurrected_threads);

    // the task list
    {
	Task *task;
	for (task = all_tasks; task != NULL; task = task->all_link) {
	    if (task->tso) {
		thread((StgPtr)&task->tso);
	    }
	}
    }

    // the static objects
    thread_static(scavenged_static_objects);

    // the stable pointer table
    threadStablePtrTable((evac_fn)thread);

    // the CAF list (used by GHCi)
    markCAFs((evac_fn)thread);

    // 2. update forward ptrs (generation 0, step 0 is skipped)
    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
	for (s = 0; s < generations[g].n_steps; s++) {
	    if (g==0 && s ==0) continue;
	    stp = &generations[g].steps[s];
	    IF_DEBUG(gc, debugBelch("update_fwd:  %d.%d\n", stp->gen->no, stp->no););

	    update_fwd(stp->blocks);
	    update_fwd_large(stp->scavenged_large_objects);
	    if (g == RtsFlags.GcFlags.generations-1 && stp->old_blocks != NULL) {
		IF_DEBUG(gc, debugBelch("update_fwd:  %d.%d (compact)\n", stp->gen->no, stp->no););
		update_fwd_compact(stp->old_blocks);
	    }
	}
    }

    // 3. update backward ptrs
    stp = &oldest_gen->steps[0];
    if (stp->old_blocks != NULL) {
	blocks = update_bkwd_compact(stp);
	IF_DEBUG(gc, debugBelch("update_bkwd: %d.%d (compact, old: %d blocks, now %d blocks)\n",
			     stp->gen->no, stp->no,
			     stp->n_old_blocks, blocks););
	stp->n_old_blocks = blocks;
    }
}
diff --git a/rts/GCCompact.h b/rts/GCCompact.h
new file mode 100644
index 0000000000..0fb39b3b12
--- /dev/null
+++ b/rts/GCCompact.h
@@ -0,0 +1,44 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2005
+ *
+ * Compacting garbage collector
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef GCCOMPACT_H
+#define GCCOMPACT_H
+
+STATIC_INLINE void
+mark(StgPtr p, bdescr *bd)  /* set the mark bit for heap word p in bd's bitmap (one bit per word) */
+{
+    nat offset_within_block = p - bd->start; // in words
+    StgPtr bitmap_word = (StgPtr)bd->u.bitmap +
+        (offset_within_block / (sizeof(W_)*BITS_PER_BYTE));  /* bitmap word holding p's bit */
+    StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1));  /* bit within that word */
+    *bitmap_word |= bit_mask;
+}
+
+STATIC_INLINE void
+unmark(StgPtr p, bdescr *bd)  /* clear the mark bit for heap word p in bd's bitmap */
+{
+    nat offset_within_block = p - bd->start; // in words
+    StgPtr bitmap_word = (StgPtr)bd->u.bitmap +
+        (offset_within_block / (sizeof(W_)*BITS_PER_BYTE));  /* same addressing as mark() */
+    StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1));
+    *bitmap_word &= ~bit_mask;
+}
+
+STATIC_INLINE StgWord
+is_marked(StgPtr p, bdescr *bd)  /* nonzero iff p's mark bit is set (raw masked bit, not normalised to 0/1) */
+{
+    nat offset_within_block = p - bd->start; // in words
+    StgPtr bitmap_word = (StgPtr)bd->u.bitmap +
+        (offset_within_block / (sizeof(W_)*BITS_PER_BYTE));  /* same addressing as mark() */
+    StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1));
+    return (*bitmap_word & bit_mask);
+}
+
+void compact( void (*get_roots)(evac_fn) );
+
+#endif /* GCCOMPACT_H */
diff --git a/rts/GetTime.h b/rts/GetTime.h
new file mode 100644
index 0000000000..5f02df0625
--- /dev/null
+++ b/rts/GetTime.h
@@ -0,0 +1,26 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2005
+ *
+ * Machine-independent interface to time measurement
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef GETTIME_H
+#define GETTIME_H
+
+// We'll use a fixed resolution of usec for now. The machine
+// dependent implementation may have a different resolution, but we'll
+// normalise to this for the machine independent interface.
+#define TICKS_PER_SECOND 1000000
+typedef StgInt64 Ticks;
+
+Ticks getProcessCPUTime (void);
+Ticks getThreadCPUTime (void);
+Ticks getProcessElapsedTime (void);
+void getProcessTimes (Ticks *user, Ticks *elapsed);
+
+// Not strictly timing, but related
+nat getPageFaults (void);
+
+#endif /* GETTIME_H */
diff --git a/rts/HSprel.def b/rts/HSprel.def
new file mode 100644
index 0000000000..0ffe00b48c
--- /dev/null
+++ b/rts/HSprel.def
@@ -0,0 +1,28 @@
+; list of entry points that the RTS imports from
+; the Prelude.
+EXPORTS
+PrelBase_False_closure
+PrelBase_True_closure
+PrelBase_Czh_con_info DATA
+PrelBase_Czh_static_info DATA
+PrelBase_Izh_con_info DATA
+PrelBase_Izh_static_info DATA
+PrelAddr_I64zh_con_info DATA
+PrelAddr_W64zh_con_info DATA
+PrelAddr_Azh_con_info DATA
+PrelAddr_Azh_static_info DATA
+PrelFloat_Fzh_con_info DATA
+PrelFloat_Fzh_static_info DATA
+PrelFloat_Dzh_con_info DATA
+PrelFloat_Dzh_static_info DATA
+PrelAddr_Wzh_con_info DATA
+PrelAddr_Wzh_static_info DATA
+PrelStable_StablePtr_con_info DATA
+PrelStable_StablePtr_static_info DATA
+PrelPack_unpackCString_closure
+PrelIOBase_stackOverflow_closure
+PrelIOBase_BlockedOnDeadMVar_closure
+PrelIOBase_BlockedIndefinitely_closure
+PrelIOBase_NonTermination_closure
+PrelWeak_runFinalizzerBatch_closure
+__stginit_Prelude
diff --git a/rts/Hash.c b/rts/Hash.c
new file mode 100644
index 0000000000..ada11a6a85
--- /dev/null
+++ b/rts/Hash.c
@@ -0,0 +1,376 @@
+/*-----------------------------------------------------------------------------
+ *
+ * (c) The AQUA Project, Glasgow University, 1995-1998
+ * (c) The GHC Team, 1999
+ *
+ * Dynamically expanding linear hash tables, as described in
+ * Per-\AAke Larson, ``Dynamic Hash Tables,'' CACM 31(4), April 1988,
+ * pp. 446 -- 457.
+ * -------------------------------------------------------------------------- */
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "Hash.h"
+#include "RtsUtils.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#define HSEGSIZE 1024 /* Size of a single hash table segment */
+ /* Also the minimum size of a hash table */
+#define HDIRSIZE 1024 /* Size of the segment directory */
+ /* Maximum hash table size is HSEGSIZE * HDIRSIZE */
+#define HLOAD 5 /* Maximum average load of a single hash bucket */
+
+#define HCHUNK (1024 * sizeof(W_) / sizeof(HashList))
+ /* Number of HashList cells to allocate in one go */
+
+
+/* Linked list of (key, data) pairs for separate chaining */
+struct hashlist {
+ StgWord key;
+ void *data;
+ struct hashlist *next; /* Next cell in bucket chain (same hash value) */
+};
+
+typedef struct hashlist HashList;
+
+typedef int HashFunction(HashTable *table, StgWord key);
+typedef int CompareFunction(StgWord key1, StgWord key2);
+
+struct hashtable {
+ int split; /* Next bucket to split when expanding */
+ int max; /* Max bucket of smaller table */
+ int mask1; /* Mask for doing the mod of h_1 (smaller table) */
+ int mask2; /* Mask for doing the mod of h_2 (larger table) */
+ int kcount; /* Number of keys */
+ int bcount; /* Number of buckets */
+ HashList **dir[HDIRSIZE]; /* Directory of segments */
+ HashFunction *hash; /* hash function */
+ CompareFunction *compare; /* key comparison function */
+};
+
+/* -----------------------------------------------------------------------------
+ * Hash first using the smaller table. If the bucket is less than the
+ * next bucket to be split, re-hash using the larger table.
+ * -------------------------------------------------------------------------- */
+
+static int
+hashWord(HashTable *table, StgWord key)  /* map a word key to a bucket index (Larson linear hashing) */
+{
+    int bucket;
+
+    /* Strip the boring zero bits */
+    key /= sizeof(StgWord);  /* keys are typically word-aligned, so low bits carry no entropy */
+
+    /* Mod the size of the hash table (a power of 2) */
+    bucket = key & table->mask1;
+
+    if (bucket < table->split) {  /* this bucket has already been split: re-hash with the larger table */
+        /* Mod the size of the expanded hash table (also a power of 2) */
+        bucket = key & table->mask2;
+    }
+    return bucket;
+}
+
+static int
+hashStr(HashTable *table, char *key)  /* NOTE(review): installed via a cast to HashFunction (StgWord key) in allocStrHashTable */
+{
+    int h, bucket;
+    char *s;
+
+    s = key;
+    for (h=0; *s; s++) {  /* simple multiplicative hash over the NUL-terminated string */
+        h *= 128;
+        h += *s;
+        h = h % 1048583;  /* some random large prime */
+    }
+
+    /* Mod the size of the hash table (a power of 2) */
+    bucket = h & table->mask1;
+
+    if (bucket < table->split) {  /* bucket already split: re-hash with the larger table's mask */
+        /* Mod the size of the expanded hash table (also a power of 2) */
+        bucket = h & table->mask2;
+    }
+
+    return bucket;
+}
+
+static int
+compareWord(StgWord key1, StgWord key2)  /* 1 iff the two word keys are equal */
+{
+    return (key1 == key2);
+}
+
+static int
+compareStr(StgWord key1, StgWord key2)  /* keys are really char*; 1 iff the strings are equal */
+{
+    return (strcmp((char *)key1, (char *)key2) == 0);
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Allocate a new segment of the dynamically growing hash table.
+ * -------------------------------------------------------------------------- */
+
+static void
+allocSegment(HashTable *table, int segment)  /* allocate one directory segment of HSEGSIZE bucket chains */
+{
+    table->dir[segment] = stgMallocBytes(HSEGSIZE * sizeof(HashList *),
+                                         "allocSegment");  /* memory is NOT zeroed: callers initialise the buckets they use */
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Expand the larger hash table by one bucket, and split one bucket
+ * from the smaller table into two parts. Only the bucket referenced
+ * by @table->split@ is affected by the expansion.
+ * -------------------------------------------------------------------------- */
+
+static void
+expand(HashTable *table)  /* add one bucket: split bucket 'split' of the smaller table in two */
+{
+    int oldsegment;
+    int oldindex;
+    int newbucket;
+    int newsegment;
+    int newindex;
+    HashList *hl;
+    HashList *next;
+    HashList *old, *new;
+
+    if (table->split + table->max >= HDIRSIZE * HSEGSIZE)
+        /* Wow! That's big. Too big, so don't expand. */
+        return;
+
+    /* Calculate indices of bucket to split */
+    oldsegment = table->split / HSEGSIZE;
+    oldindex = table->split % HSEGSIZE;
+
+    newbucket = table->max + table->split;  /* the new bucket pairs with the one being split */
+
+    /* And the indices of the new bucket */
+    newsegment = newbucket / HSEGSIZE;
+    newindex = newbucket % HSEGSIZE;
+
+    if (newindex == 0)  /* first bucket of a fresh segment: allocate it */
+        allocSegment(table, newsegment);
+
+    if (++table->split == table->max) {  /* smaller table fully split: double up and start over */
+        table->split = 0;
+        table->max *= 2;
+        table->mask1 = table->mask2;
+        table->mask2 = table->mask2 << 1 | 1;
+    }
+    table->bcount++;
+
+    /* Split the bucket, paying no attention to the original order */
+
+    old = new = NULL;
+    for (hl = table->dir[oldsegment][oldindex]; hl != NULL; hl = next) {
+        next = hl->next;
+        if (table->hash(table, hl->key) == newbucket) {  /* re-hash decides which half each cell lands in */
+            hl->next = new;
+            new = hl;
+        } else {
+            hl->next = old;
+            old = hl;
+        }
+    }
+    table->dir[oldsegment][oldindex] = old;
+    table->dir[newsegment][newindex] = new;
+
+    return;
+}
+
+void *
+lookupHashTable(HashTable *table, StgWord key)  /* return the data stored under key, or NULL if absent */
+{
+    int bucket;
+    int segment;
+    int index;
+    HashList *hl;
+
+    bucket = table->hash(table, key);
+    segment = bucket / HSEGSIZE;  /* which directory segment */
+    index = bucket % HSEGSIZE;    /* slot within the segment */
+
+    for (hl = table->dir[segment][index]; hl != NULL; hl = hl->next)
+        if (table->compare(hl->key, key))
+            return hl->data;
+
+    /* It's not there */
+    return NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * We allocate the hashlist cells in large chunks to cut down on malloc
+ * overhead. Although we keep a free list of hashlist cells, we make
+ * no effort to actually return the space to the malloc arena.
+ * -------------------------------------------------------------------------- */
+
+static HashList *freeList = NULL;
+
+static HashList *
+allocHashList(void)  /* pop a cell from the free list, refilling it HCHUNK cells at a time */
+{
+    HashList *hl, *p;
+
+    if ((hl = freeList) != NULL) {
+        freeList = hl->next;
+    } else {
+        hl = stgMallocBytes(HCHUNK * sizeof(HashList), "allocHashList");  /* chunk is never returned to malloc */
+
+        freeList = hl + 1;  /* first cell is the result; thread the rest onto the free list */
+        for (p = freeList; p < hl + HCHUNK - 1; p++)
+            p->next = p + 1;
+        p->next = NULL;
+    }
+    return hl;
+}
+
+static void
+freeHashList(HashList *hl)  /* push the cell back on the free list; only hl->next is overwritten */
+{
+    hl->next = freeList;
+    freeList = hl;
+}
+
+void
+insertHashTable(HashTable *table, StgWord key, void *data)  /* prepend (key,data); duplicate keys are permitted */
+{
+    int bucket;
+    int segment;
+    int index;
+    HashList *hl;
+
+    // Disable this assert; sometimes it's useful to be able to
+    // overwrite entries in the hash table.
+    // ASSERT(lookupHashTable(table, key) == NULL);
+
+    /* When the average load gets too high, we expand the table */
+    if (++table->kcount >= HLOAD * table->bcount)
+        expand(table);  /* done before hashing: expand can change split/mask1/mask2 */
+
+    bucket = table->hash(table, key);
+    segment = bucket / HSEGSIZE;
+    index = bucket % HSEGSIZE;
+
+    hl = allocHashList();
+
+    hl->key = key;
+    hl->data = data;
+    hl->next = table->dir[segment][index];  /* push on the front of the bucket chain */
+    table->dir[segment][index] = hl;
+
+}
+
+void *
+removeHashTable(HashTable *table, StgWord key, void *data)  /* remove first match; data==NULL matches any entry for key */
+{
+    int bucket;
+    int segment;
+    int index;
+    HashList *hl;
+    HashList *prev = NULL;
+
+    bucket = table->hash(table, key);
+    segment = bucket / HSEGSIZE;
+    index = bucket % HSEGSIZE;
+
+    for (hl = table->dir[segment][index]; hl != NULL; hl = hl->next) {
+        if (table->compare(hl->key,key) && (data == NULL || hl->data == data)) {
+            if (prev == NULL)
+                table->dir[segment][index] = hl->next;  /* unlink head of chain */
+            else
+                prev->next = hl->next;  /* unlink interior cell */
+            freeHashList(hl);
+            table->kcount--;
+            return hl->data;  /* NOTE(review): reads hl after it is on the free list; safe only because freeHashList writes just hl->next */
+        }
+        prev = hl;
+    }
+
+    /* It's not there */
+    ASSERT(data == NULL);  /* a specific (key,data) pair was expected to be present */
+    return NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * When we free a hash table, we are also good enough to free the
+ * data part of each (key, data) pair, as long as our caller can tell
+ * us how to do it.
+ * -------------------------------------------------------------------------- */
+
+void
+freeHashTable(HashTable *table, void (*freeDataFun)(void *) )  /* free all cells, segments and the table; freeDataFun (may be NULL) is applied to each data */
+{
+    long segment;
+    long index;
+    HashList *hl;
+    HashList *next;
+
+    /* The last bucket with something in it is table->max + table->split - 1 */
+    segment = (table->max + table->split - 1) / HSEGSIZE;
+    index = (table->max + table->split - 1) % HSEGSIZE;
+
+    while (segment >= 0) {  /* walk segments from the last used one down to 0 */
+        while (index >= 0) {
+            for (hl = table->dir[segment][index]; hl != NULL; hl = next) {
+                next = hl->next;
+                if (freeDataFun != NULL)
+                    (*freeDataFun)(hl->data);
+                freeHashList(hl);  /* cells go back to the shared free list, not to malloc */
+            }
+            index--;
+        }
+        stgFree(table->dir[segment]);
+        segment--;
+        index = HSEGSIZE - 1;  /* restart at the top of the next (lower) segment */
+    }
+    stgFree(table);
+}
+
+/* -----------------------------------------------------------------------------
+ * When we initialize a hash table, we set up the first segment as well,
+ * initializing all of the first segment's hash buckets to NULL.
+ * -------------------------------------------------------------------------- */
+
+static HashTable *
+allocHashTable_(HashFunction *hash, CompareFunction *compare)  /* build an empty table with one initialised segment */
+{
+    HashTable *table;
+    HashList **hb;
+
+    table = stgMallocBytes(sizeof(HashTable),"allocHashTable");
+
+    allocSegment(table, 0);
+
+    for (hb = table->dir[0]; hb < table->dir[0] + HSEGSIZE; hb++)
+        *hb = NULL;  /* segments come back from malloc uninitialised */
+
+    table->split = 0;
+    table->max = HSEGSIZE;
+    table->mask1 = HSEGSIZE - 1;  /* mask for the smaller (current) table size */
+    table->mask2 = 2 * HSEGSIZE - 1;  /* mask for the doubled table size */
+    table->kcount = 0;
+    table->bcount = HSEGSIZE;
+    table->hash = hash;
+    table->compare = compare;
+
+    return table;
+}
+
+HashTable *
+allocHashTable(void)  /* table keyed by StgWord */
+{
+    return allocHashTable_(hashWord, compareWord);
+}
+
+HashTable *
+allocStrHashTable(void)  /* table keyed by char*; the casts rely on StgWord being pointer-sized */
+{
+    return allocHashTable_((HashFunction *)hashStr,
+                           (CompareFunction *)compareStr);
+}
diff --git a/rts/Hash.h b/rts/Hash.h
new file mode 100644
index 0000000000..ad55953da4
--- /dev/null
+++ b/rts/Hash.h
@@ -0,0 +1,40 @@
+/*-----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1999
+ *
+ * Prototypes for Hash.c
+ *
+ * -------------------------------------------------------------------------- */
+
+#ifndef HASH_H
+#define HASH_H
+
+typedef struct hashtable HashTable; /* abstract */
+
+/* Hash table access where the keys are StgWords */
+HashTable * allocHashTable ( void );
+void * lookupHashTable ( HashTable *table, StgWord key );
+void insertHashTable ( HashTable *table, StgWord key, void *data );
+void * removeHashTable ( HashTable *table, StgWord key, void *data );
+
+/* Hash table access where the keys are C strings (the strings are
+ * assumed to be allocated by the caller, and mustn't be deallocated
+ * until the corresponding hash table entry has been removed).
+ */
+HashTable * allocStrHashTable ( void );
+
+#define lookupStrHashTable(table, key) \
+ (lookupHashTable(table, (StgWord)key))
+
+#define insertStrHashTable(table, key, data) \
+ (insertHashTable(table, (StgWord)key, data))
+
+#define removeStrHashTable(table, key, data) \
+ (removeHashTable(table, (StgWord)key, data))
+
+/* Freeing hash tables
+ */
+void freeHashTable ( HashTable *table, void (*freeDataFun)(void *) );
+
+#endif /* HASH_H */
+
diff --git a/rts/HeapStackCheck.cmm b/rts/HeapStackCheck.cmm
new file mode 100644
index 0000000000..4e5dd24596
--- /dev/null
+++ b/rts/HeapStackCheck.cmm
@@ -0,0 +1,964 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Canned Heap-Check and Stack-Check sequences.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/* Stack/Heap Check Failure
+ * ------------------------
+ *
+ * On discovering that a stack or heap check has failed, we do the following:
+ *
+ * - If the context_switch flag is set, indicating that there are more
+ * threads waiting to run, we yield to the scheduler
+ * (return ThreadYielding).
+ *
+ * - If Hp > HpLim, we've had a heap check failure. This means we've
+ * come to the end of the current heap block, so we try to chain
+ * another block on with ExtendNursery().
+ *
+ * - If this succeeds, we carry on without returning to the
+ * scheduler.
+ *
+ * - If it fails, we return to the scheduler claiming HeapOverflow
+ * so that a garbage collection can be performed.
+ *
+ * - If Hp <= HpLim, it must have been a stack check that failed. In
+ * which case, we return to the scheduler claiming StackOverflow, the
+ * scheduler will either increase the size of our stack, or raise
+ * an exception if the stack is already too big.
+ *
+ * The effect of checking for context switch only in the heap/stack check
+ * failure code is that we'll switch threads after the current thread has
+ * reached the end of its heap block. If a thread isn't allocating
+ * at all, it won't yield. Hopefully this won't be a problem in practice.
+ */
+
+#define PRE_RETURN(why,what_next) \
+ StgTSO_what_next(CurrentTSO) = what_next::I16; \
+ StgRegTable_rRet(BaseReg) = why; \
+ R1 = BaseReg;
+
+/* Remember that the return address is *removed* when returning to a
+ * ThreadRunGHC thread.
+ */
+
+#define GC_GENERIC \
+ DEBUG_ONLY(foreign "C" heapCheckFail()); \
+ if (Hp > HpLim) { \
+ Hp = Hp - HpAlloc/*in bytes*/; \
+ if (HpAlloc <= BLOCK_SIZE \
+ && bdescr_link(CurrentNursery) != NULL) { \
+ CLOSE_NURSERY(); \
+ CurrentNursery = bdescr_link(CurrentNursery); \
+ OPEN_NURSERY(); \
+ if (CInt[context_switch] != 0 :: CInt) { \
+ R1 = ThreadYielding; \
+ goto sched; \
+ } else { \
+ jump %ENTRY_CODE(Sp(0)); \
+ } \
+ } else { \
+ R1 = HeapOverflow; \
+ goto sched; \
+ } \
+ } else { \
+ R1 = StackOverflow; \
+ } \
+ sched: \
+ PRE_RETURN(R1,ThreadRunGHC); \
+ jump stg_returnToSched;
+
+#define HP_GENERIC \
+ PRE_RETURN(HeapOverflow, ThreadRunGHC) \
+ jump stg_returnToSched;
+
+#define BLOCK_GENERIC \
+ PRE_RETURN(ThreadBlocked, ThreadRunGHC) \
+ jump stg_returnToSched;
+
+#define YIELD_GENERIC \
+ PRE_RETURN(ThreadYielding, ThreadRunGHC) \
+ jump stg_returnToSched;
+
+#define BLOCK_BUT_FIRST(c) \
+ PRE_RETURN(ThreadBlocked, ThreadRunGHC) \
+ R2 = c; \
+ jump stg_returnToSchedButFirst;
+
+#define YIELD_TO_INTERPRETER \
+ PRE_RETURN(ThreadYielding, ThreadInterpret) \
+ jump stg_returnToSchedNotPaused;
+
+/* -----------------------------------------------------------------------------
+ Heap checks in thunks/functions.
+
+ In these cases, node always points to the function closure. This gives
+ us an easy way to return to the function: just leave R1 on the top of
+ the stack, and have the scheduler enter it to return.
+
+ There are canned sequences for 'n' pointer values in registers.
+ -------------------------------------------------------------------------- */
+
+INFO_TABLE_RET( stg_enter, 1/*framesize*/, 0/*bitmap*/, RET_SMALL)  /* return point: re-enter the saved closure */
+{
+    R1 = Sp(1);  /* reload the closure saved by __stg_gc_enter_1 */
+    Sp_adj(2);   /* pop the frame */
+    ENTER();
+}
+
+__stg_gc_enter_1  /* heap-check failure with one live pointer (the closure) in R1 */
+{
+    Sp_adj(-2);
+    Sp(1) = R1;  /* save the closure so stg_enter can re-enter it after GC */
+    Sp(0) = stg_enter_info;
+    GC_GENERIC
+}
+
+#if defined(GRAN)
+/*
+ ToDo: merge the block and yield macros, calling something like BLOCK(N)
+ at the end;
+*/
+
+/*
+ Should we actually ever do a yield in such a case?? -- HWL
+*/
+gran_yield_0
+{
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+gran_yield_1
+{
+ Sp_adj(-1);
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 2 Regs--------------------------------------------------------------------*/
+
+gran_yield_2
+{
+ Sp_adj(-2);
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 3 Regs -------------------------------------------------------------------*/
+
+gran_yield_3
+{
+ Sp_adj(-3);
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 4 Regs -------------------------------------------------------------------*/
+
+gran_yield_4
+{
+ Sp_adj(-4);
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 5 Regs -------------------------------------------------------------------*/
+
+gran_yield_5
+{
+ Sp_adj(-5);
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 6 Regs -------------------------------------------------------------------*/
+
+gran_yield_6
+{
+ Sp_adj(-6);
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 7 Regs -------------------------------------------------------------------*/
+
+gran_yield_7
+{
+ Sp_adj(-7);
+ Sp(6) = R7;
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 8 Regs -------------------------------------------------------------------*/
+
+gran_yield_8
+{
+ Sp_adj(-8);
+ Sp(7) = R8;
+ Sp(6) = R7;
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+// the same routines but with a block rather than a yield
+
+gran_block_1
+{
+ Sp_adj(-1);
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 2 Regs--------------------------------------------------------------------*/
+
+gran_block_2
+{
+ Sp_adj(-2);
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 3 Regs -------------------------------------------------------------------*/
+
+gran_block_3
+{
+ Sp_adj(-3);
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 4 Regs -------------------------------------------------------------------*/
+
+gran_block_4
+{
+ Sp_adj(-4);
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 5 Regs -------------------------------------------------------------------*/
+
+gran_block_5
+{
+ Sp_adj(-5);
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 6 Regs -------------------------------------------------------------------*/
+
+gran_block_6
+{
+ Sp_adj(-6);
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 7 Regs -------------------------------------------------------------------*/
+
+gran_block_7
+{
+ Sp_adj(-7);
+ Sp(6) = R7;
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 8 Regs -------------------------------------------------------------------*/
+
+gran_block_8
+{
+ Sp_adj(-8);
+ Sp(7) = R8;
+ Sp(6) = R7;
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+#endif
+
+#if 0 && defined(PAR)
+
+/*
+ Similar to stg_block_1 (called via StgMacro BLOCK_NP) but separates the
+ saving of the thread state from the actual jump via an StgReturn.
+ We need this separation because we call RTS routines in blocking entry codes
+ before jumping back into the RTS (see parallel/FetchMe.hc).
+*/
+
+par_block_1_no_jump
+{
+ Sp_adj(-1);
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+}
+
+par_jump
+{
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ Heap checks in Primitive case alternatives
+
+ A primitive case alternative is entered with a value either in
+ R1, FloatReg1 or D1 depending on the return convention. All the
+ cases are covered below.
+ -------------------------------------------------------------------------- */
+
+/*-- No Registers live ------------------------------------------------------ */
+
+stg_gc_noregs  /* heap/stack-check failure with no live registers to save */
+{
+    GC_GENERIC
+}
+
+/*-- void return ------------------------------------------------------------ */
+
+INFO_TABLE_RET( stg_gc_void, 0/*framesize*/, 0/*bitmap*/, RET_SMALL)
+{
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(0));
+}
+
+/*-- R1 is boxed/unpointed -------------------------------------------------- */
+
+INFO_TABLE_RET( stg_gc_unpt_r1, 1/*framesize*/, 0/*bitmap*/, RET_SMALL)
+{
+ R1 = Sp(1);
+ Sp_adj(2);
+ jump %ENTRY_CODE(Sp(0));
+}
+
+stg_gc_unpt_r1
+{
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_gc_unpt_r1_info;
+ GC_GENERIC
+}
+
+/*-- R1 is unboxed -------------------------------------------------- */
+
+/* the 1 is a bitmap - i.e. 1 non-pointer word on the stack. */
+INFO_TABLE_RET( stg_gc_unbx_r1, 1/*framesize*/, 1/*bitmap*/, RET_SMALL )
+{
+ R1 = Sp(1);
+ Sp_adj(2);
+ jump %ENTRY_CODE(Sp(0));
+}
+
+stg_gc_unbx_r1
+{
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_gc_unbx_r1_info;
+ GC_GENERIC
+}
+
+/*-- F1 contains a float ------------------------------------------------- */
+
+INFO_TABLE_RET( stg_gc_f1, 1/*framesize*/, 1/*bitmap*/, RET_SMALL )
+{
+ F1 = F_[Sp+WDS(1)];
+ Sp_adj(2);
+ jump %ENTRY_CODE(Sp(0));
+}
+
+stg_gc_f1
+{
+ Sp_adj(-2);
+ F_[Sp + WDS(1)] = F1;
+ Sp(0) = stg_gc_f1_info;
+ GC_GENERIC
+}
+
+/*-- D1 contains a double ------------------------------------------------- */
+
+/* we support doubles of either 1 or 2 words in size */
+
+#if SIZEOF_DOUBLE == SIZEOF_VOID_P
+# define DBL_BITMAP 1
+# define DBL_WORDS 1
+#else
+# define DBL_BITMAP 3
+# define DBL_WORDS 2
+#endif
+
+INFO_TABLE_RET( stg_gc_d1, DBL_WORDS/*framesize*/, DBL_BITMAP/*bitmap*/, RET_SMALL )
+{
+ D1 = D_[Sp + WDS(1)];
+ Sp = Sp + WDS(1) + SIZEOF_StgDouble;
+ jump %ENTRY_CODE(Sp(0));
+}
+
+stg_gc_d1
+{
+ Sp = Sp - WDS(1) - SIZEOF_StgDouble;
+ D_[Sp + WDS(1)] = D1;
+ Sp(0) = stg_gc_d1_info;
+ GC_GENERIC
+}
+
+
+/*-- L1 contains an int64 ------------------------------------------------- */
+
+/* we support int64s of either 1 or 2 words in size */
+
+#if SIZEOF_VOID_P == 8
+# define LLI_BITMAP 1
+# define LLI_WORDS 1
+#else
+# define LLI_BITMAP 3
+# define LLI_WORDS 2
+#endif
+
+INFO_TABLE_RET( stg_gc_l1, LLI_WORDS/*framesize*/, LLI_BITMAP/*bitmap*/, RET_SMALL )
+{
+ L1 = L_[Sp + WDS(1)];
+ Sp_adj(1) + SIZEOF_StgWord64;
+ jump %ENTRY_CODE(Sp(0));
+}
+
+stg_gc_l1
+{
+ Sp_adj(-1) - SIZEOF_StgWord64;
+ L_[Sp + WDS(1)] = L1;
+ Sp(0) = stg_gc_l1_info;
+ GC_GENERIC
+}
+
+/*-- Unboxed tuple return, one pointer (unregisterised build only) ---------- */
+
+INFO_TABLE_RET( stg_ut_1_0_unreg, 1/*size*/, 0/*BITMAP*/, RET_SMALL )
+{
+ Sp_adj(1);
+ // one ptr is on the stack (Sp(0))
+ jump %ENTRY_CODE(Sp(1));
+}
+
+/* -----------------------------------------------------------------------------
+ Generic function entry heap check code.
+
+ At a function entry point, the arguments are as per the calling convention,
+ i.e. some in regs and some on the stack. There may or may not be
+ a pointer to the function closure in R1 - if there isn't, then the heap
+ check failure code in the function will arrange to load it.
+
+ The function's argument types are described in its info table, so we
+ can just jump to this bit of generic code to save away all the
+ registers and return to the scheduler.
+
+ This code arranges the stack like this:
+
+ | .... |
+ | args |
+ +---------------------+
+ | f_closure |
+ +---------------------+
+ | size |
+ +---------------------+
+ | stg_gc_fun_info |
+ +---------------------+
+
+ The size is the number of words of arguments on the stack, and is cached
+ in the frame in order to simplify stack walking: otherwise the size of
+ this stack frame would have to be calculated by looking at f's info table.
+
+ -------------------------------------------------------------------------- */
+
+__stg_gc_fun  /* generic heap-check failure at a function entry point (stack layout described above) */
+{
+    W_ size;
+    W_ info;
+    W_ type;
+
+    info = %GET_FUN_INFO(R1);
+
+    // cache the size
+    type = TO_W_(StgFunInfoExtra_fun_type(info));
+    if (type == ARG_GEN) {
+        size = BITMAP_SIZE(StgFunInfoExtra_bitmap(info));  /* small bitmap: size packed in the word */
+    } else {
+        if (type == ARG_GEN_BIG) {
+#ifdef TABLES_NEXT_TO_CODE
+            // bitmap field holds an offset
+            size = StgLargeBitmap_size( StgFunInfoExtra_bitmap(info)
+                                        + %GET_ENTRY(R1) /* ### */ );
+#else
+            size = StgLargeBitmap_size( StgFunInfoExtra_bitmap(info) );
+#endif
+        } else {
+            size = BITMAP_SIZE(W_[stg_arg_bitmaps + WDS(type)]);  /* standard arg pattern: size from shared table */
+        }
+    }
+
+#ifdef NO_ARG_REGS
+    // we don't have to save any registers away
+    Sp_adj(-3);
+    Sp(2) = R1;
+    Sp(1) = size;  /* cached so stack walkers need not consult f's info table */
+    Sp(0) = stg_gc_fun_info;
+    GC_GENERIC
+#else
+    W_ type;  /* NOTE(review): duplicates the 'type' declared at the top of this procedure -- confirm cmm accepts the redeclaration */
+    type = TO_W_(StgFunInfoExtra_fun_type(info));
+    // cache the size
+    if (type == ARG_GEN || type == ARG_GEN_BIG) {
+        // regs already saved by the heap check code
+        Sp_adj(-3);
+        Sp(2) = R1;
+        Sp(1) = size;
+        Sp(0) = stg_gc_fun_info;
+        // DEBUG_ONLY(foreign "C" debugBelch("stg_fun_gc_gen(ARG_GEN)"););
+        GC_GENERIC
+    } else {
+        jump W_[stg_stack_save_entries + WDS(type)];
+        // jumps to stg_gc_noregs after saving stuff
+    }
+#endif /* !NO_ARG_REGS */
+}
+
+/* -----------------------------------------------------------------------------
+ Generic Apply (return point)
+
+ The dual to stg_fun_gc_gen (above): this fragment returns to the
+ function, passing arguments in the stack and in registers
+ appropriately. The stack layout is given above.
+ -------------------------------------------------------------------------- */
+
+INFO_TABLE_RET( stg_gc_fun, 0/*framesize*/, 0/*bitmap*/, RET_FUN )  /* return point: re-enter f with its args on the stack */
+{
+    R1 = Sp(2);  /* the function closure saved by __stg_gc_fun */
+    Sp_adj(3);
+#ifdef NO_ARG_REGS
+    // Minor optimisation: there are no argument registers to load up,
+    // so we can just jump straight to the function's entry point.
+    jump %GET_ENTRY(R1);
+#else
+    W_ info;
+    W_ type;
+
+    info = %GET_FUN_INFO(R1);
+    type = TO_W_(StgFunInfoExtra_fun_type(info));
+    if (type == ARG_GEN || type == ARG_GEN_BIG) {
+        jump StgFunInfoExtra_slow_apply(info);  /* generic apply reloads arg regs from the stack */
+    } else {
+        if (type == ARG_BCO) {
+            // cover this case just to be on the safe side
+            Sp_adj(-2);
+            Sp(1) = R1;
+            Sp(0) = stg_apply_interp_info;
+            jump stg_yield_to_interpreter;  /* BCOs must be run by the interpreter */
+        } else {
+            jump W_[stg_ap_stack_entries + WDS(type)];  /* canned entry code for a standard arg pattern */
+        }
+    }
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Generic Heap Check Code.
+
+ Called with Liveness mask in R9, Return address in R10.
+ Stack must be consistent (containing all necessary info pointers
+ to relevant SRTs).
+
+ See StgMacros.h for a description of the RET_DYN stack frame.
+
+ We also define an stg_gen_yield here, because it's very similar.
+ -------------------------------------------------------------------------- */
+
+// For simplicity, we assume that SIZEOF_DOUBLE == 2*SIZEOF_VOID_P
+// on a 64-bit machine, we'll end up wasting a couple of words, but
+// it's not a big deal.
+
+#define RESTORE_EVERYTHING \
+ L1 = L_[Sp + WDS(19)]; \
+ D2 = D_[Sp + WDS(17)]; \
+ D1 = D_[Sp + WDS(15)]; \
+ F4 = F_[Sp + WDS(14)]; \
+ F3 = F_[Sp + WDS(13)]; \
+ F2 = F_[Sp + WDS(12)]; \
+ F1 = F_[Sp + WDS(11)]; \
+ R8 = Sp(10); \
+ R7 = Sp(9); \
+ R6 = Sp(8); \
+ R5 = Sp(7); \
+ R4 = Sp(6); \
+ R3 = Sp(5); \
+ R2 = Sp(4); \
+ R1 = Sp(3); \
+ Sp_adj(21);
+
+#define RET_OFFSET (-19)
+
+#define SAVE_EVERYTHING \
+ Sp_adj(-21); \
+ L_[Sp + WDS(19)] = L1; \
+ D_[Sp + WDS(17)] = D2; \
+ D_[Sp + WDS(15)] = D1; \
+ F_[Sp + WDS(14)] = F4; \
+ F_[Sp + WDS(13)] = F3; \
+ F_[Sp + WDS(12)] = F2; \
+ F_[Sp + WDS(11)] = F1; \
+ Sp(10) = R8; \
+ Sp(9) = R7; \
+ Sp(8) = R6; \
+ Sp(7) = R5; \
+ Sp(6) = R4; \
+ Sp(5) = R3; \
+ Sp(4) = R2; \
+ Sp(3) = R1; \
+ Sp(2) = R10; /* return address */ \
+ Sp(1) = R9; /* liveness mask */ \
+ Sp(0) = stg_gc_gen_info;
+
+INFO_TABLE_RET( stg_gc_gen, 0/*framesize*/, 0/*bitmap*/, RET_DYN )  /* return point for the SAVE_EVERYTHING frame */
+/* bitmap in the above info table is unused, the real one is on the stack. */
+{
+    RESTORE_EVERYTHING;  /* reload R1-R8, F1-F4, D1-D2, L1 saved by SAVE_EVERYTHING */
+    jump Sp(RET_OFFSET); /* No %ENTRY_CODE( - this is an actual code ptr */
+}
+
+/* Generic GC entry point: save all registers under a stg_gc_gen frame
+ * (liveness in R9, return address in R10 -- see the comment block
+ * above), then hand control to the garbage collector via GC_GENERIC. */
+stg_gc_gen
+{
+    SAVE_EVERYTHING;
+    GC_GENERIC
+}
+
+// A heap check at an unboxed tuple return point. The return address
+// is on the stack, and we can find it by using the offsets given
+// to us in the liveness mask.
+stg_gc_ut
+{
+    /* The real return address sits on the stack just past the frame's
+       pointer and non-pointer words, whose counts are encoded in the
+       liveness mask R9; recover it into R10 before saving. */
+    R10 = %ENTRY_CODE(Sp(RET_DYN_NONPTRS(R9) + RET_DYN_PTRS(R9)));
+    SAVE_EVERYTHING;
+    GC_GENERIC
+}
+
+/*
+ * stg_gen_hp is used by MAYBE_GC, where we can't use GC_GENERIC
+ * because we've just failed doYouWantToGC(), not a standard heap
+ * check. GC_GENERIC would end up returning StackOverflow.
+ */
+/* Like stg_gc_gen but finishes with HP_GENERIC instead of GC_GENERIC;
+   see the comment block above for why MAYBE_GC needs this variant. */
+stg_gc_gen_hp
+{
+    SAVE_EVERYTHING;
+    HP_GENERIC
+}
+
+/* -----------------------------------------------------------------------------
+ Yields
+ -------------------------------------------------------------------------- */
+
+/* Yield to the scheduler with all registers saved on the stack. */
+stg_gen_yield
+{
+    SAVE_EVERYTHING;
+    YIELD_GENERIC
+}
+
+/* Yield to the scheduler; no registers are live, so nothing is saved. */
+stg_yield_noregs
+{
+    YIELD_GENERIC;
+}
+
+/* -----------------------------------------------------------------------------
+ Yielding to the interpreter... top of stack says what to do next.
+ -------------------------------------------------------------------------- */
+
+/* Hand control to the bytecode interpreter; the frame on top of the
+   stack tells it what to do next (see the section comment above). */
+stg_yield_to_interpreter
+{
+    YIELD_TO_INTERPRETER;
+}
+
+/* -----------------------------------------------------------------------------
+ Blocks
+ -------------------------------------------------------------------------- */
+
+/* Block the current thread with all registers saved on the stack. */
+stg_gen_block
+{
+    SAVE_EVERYTHING;
+    BLOCK_GENERIC;
+}
+
+/* Block the current thread; no registers are live, so nothing is saved. */
+stg_block_noregs
+{
+    BLOCK_GENERIC;
+}
+
+/* Block the current thread with one live pointer (the closure in R1).
+   Push an stg_enter frame so that R1 is re-entered when the thread is
+   eventually resumed. */
+stg_block_1
+{
+    Sp_adj(-2);
+    Sp(1) = R1;
+    Sp(0) = stg_enter_info;
+    BLOCK_GENERIC;
+}
+
+/* -----------------------------------------------------------------------------
+ * takeMVar/putMVar-specific blocks
+ *
+ * Stack layout for a thread blocked in takeMVar:
+ *
+ * ret. addr
+ * ptr to MVar (R1)
+ * stg_block_takemvar_info
+ *
+ * Stack layout for a thread blocked in putMVar:
+ *
+ * ret. addr
+ * ptr to Value (R2)
+ * ptr to MVar (R1)
+ * stg_block_putmvar_info
+ *
+ * See PrimOps.hc for a description of the workings of take/putMVar.
+ *
+ * -------------------------------------------------------------------------- */
+
+/* Return frame for a thread blocked in takeMVar (layout above):
+   reload the MVar pointer into R1, pop the frame, and retry the
+   takeMVar primop. */
+INFO_TABLE_RET( stg_block_takemvar, 1/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+    R1 = Sp(1);
+    Sp_adj(2);
+    jump takeMVarzh_fast;
+}
+
+// code fragment executed just before we return to the scheduler:
+// in the threaded RTS, release the MVar lock taken by takeMVarzh_fast
+// by writing the real (empty) header back.  The MVar was stashed in
+// R3 by stg_block_takemvar below.
+stg_block_takemvar_finally
+{
+#ifdef THREADED_RTS
+    foreign "C" unlockClosure(R3 "ptr", stg_EMPTY_MVAR_info);
+#endif
+    jump StgReturn;
+}
+
+/* Block on an MVar (in R1) during takeMVar: push the retry frame
+   described above, keep the MVar in R3 for the finally fragment,
+   then block. */
+stg_block_takemvar
+{
+    Sp_adj(-2);
+    Sp(1) = R1;
+    Sp(0) = stg_block_takemvar_info;
+    R3 = R1;
+    BLOCK_BUT_FIRST(stg_block_takemvar_finally);
+}
+
+/* Return frame for a thread blocked in putMVar (layout above):
+   reload the value (R2) and MVar (R1), pop the frame, and retry the
+   putMVar primop. */
+INFO_TABLE_RET( stg_block_putmvar, 2/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+    R2 = Sp(2);
+    R1 = Sp(1);
+    Sp_adj(3);
+    jump putMVarzh_fast;
+}
+
+// code fragment executed just before we return to the scheduler:
+// in the threaded RTS, release the MVar lock taken by putMVarzh_fast
+// by writing the real (full) header back.  The MVar was stashed in
+// R3 by stg_block_putmvar below.
+stg_block_putmvar_finally
+{
+#ifdef THREADED_RTS
+    foreign "C" unlockClosure(R3 "ptr", stg_FULL_MVAR_info);
+#endif
+    jump StgReturn;
+}
+
+/* Block on an MVar (in R1, value in R2) during putMVar: push the retry
+   frame described above, keep the MVar in R3 for the finally fragment,
+   then block. */
+stg_block_putmvar
+{
+    Sp_adj(-3);
+    Sp(2) = R2;
+    Sp(1) = R1;
+    Sp(0) = stg_block_putmvar_info;
+    R3 = R1;
+    BLOCK_BUT_FIRST(stg_block_putmvar_finally);
+}
+
+// code fragment executed just before we return to the scheduler
+stg_block_blackhole_finally
+{
+#if defined(THREADED_RTS)
+    // The last thing we do is release sched_lock, which is
+    // preventing other threads from accessing blackhole_queue and
+    // picking up this thread before we are finished with it.
+    foreign "C" RELEASE_LOCK(sched_mutex "ptr");
+#endif
+    jump StgReturn;
+}
+
+/* Block on a black hole (closure in R1): push an stg_enter frame so the
+   closure is re-entered on wake-up, then block, releasing sched_lock
+   last via the finally fragment above. */
+stg_block_blackhole
+{
+    Sp_adj(-2);
+    Sp(1) = R1;
+    Sp(0) = stg_enter_info;
+    BLOCK_BUT_FIRST(stg_block_blackhole_finally);
+}
+
+#ifdef mingw32_HOST_OS
+/* (Windows only) Return frame for a thread blocked on an async I/O
+   request: unpack the StgAsyncIOResult hung off the TSO's block_info,
+   free it, and return the length in R1 with the error code left on the
+   stack for the continuation. */
+INFO_TABLE_RET( stg_block_async, 0/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+    W_ ares;
+    W_ len, errC;
+
+    ares = StgTSO_block_info(CurrentTSO);
+    len = StgAsyncIOResult_len(ares);
+    errC = StgAsyncIOResult_errCode(ares);
+    StgTSO_block_info(CurrentTSO) = NULL;
+    foreign "C" free(ares "ptr");
+    R1 = len;
+    Sp(0) = errC;
+    jump %ENTRY_CODE(Sp(1));
+}
+
+/* (Windows only) Block awaiting an async I/O result; resumption goes
+   through the stg_block_async return frame above. */
+stg_block_async
+{
+    Sp_adj(-1);
+    Sp(0) = stg_block_async_info;
+    BLOCK_GENERIC;
+}
+
+/* Used by threadDelay implementation; it would be desirable to get rid of
+ * this free()'ing void return continuation.
+ */
+/* Used by threadDelay implementation; it would be desirable to get rid of
+ * this free()'ing void return continuation.  Like stg_block_async above
+ * but discards the result: just free the StgAsyncIOResult and return.
+ */
+INFO_TABLE_RET( stg_block_async_void, 0/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+    W_ ares;
+
+    ares = StgTSO_block_info(CurrentTSO);
+    StgTSO_block_info(CurrentTSO) = NULL;
+    foreign "C" free(ares "ptr");
+    Sp_adj(1);
+    jump %ENTRY_CODE(Sp(0));
+}
+
+/* (Windows only) Block awaiting an async I/O completion whose result
+   is discarded; resumption goes through the frame above. */
+stg_block_async_void
+{
+    Sp_adj(-1);
+    Sp(0) = stg_block_async_void_info;
+    BLOCK_GENERIC;
+}
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ STM-specific waiting
+ -------------------------------------------------------------------------- */
+
+/* Just before returning to the scheduler: release the STM wait lock on
+   the TRec in R3 (set up by the caller of stg_block_stmwait). */
+stg_block_stmwait_finally
+{
+    foreign "C" stmWaitUnlock(MyCapability() "ptr", R3 "ptr");
+    jump StgReturn;
+}
+
+/* Block the thread for an STM retry; unlock via the fragment above. */
+stg_block_stmwait
+{
+    BLOCK_BUT_FIRST(stg_block_stmwait_finally);
+}
diff --git a/rts/HsFFI.c b/rts/HsFFI.c
new file mode 100644
index 0000000000..350bcfbdec
--- /dev/null
+++ b/rts/HsFFI.c
@@ -0,0 +1,40 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2005
+ *
+ * RTS entry points as mandated by the FFI addendum to the Haskell 98 report
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "HsFFI.h"
+#include "Rts.h"
+
+// hs_init and hs_exit are defined in RtsStartup.c
+
+/* Record the program arguments on behalf of a foreign main();
+ * simply forwards to the RTS-internal setProgArgv(). */
+void
+hs_set_argv(int argc, char *argv[])
+{
+    setProgArgv(argc,argv);
+}
+
+/* Trigger a garbage collection on request (FFI addendum entry point). */
+void
+hs_perform_gc(void)
+{
+    /* Hmmm, the FFI spec is a bit vague, but it seems to imply a major GC... */
+    performMajorGC();
+}
+
+/* Release a stable pointer previously handed out to foreign code,
+ * making its referent eligible for GC again. */
+void
+hs_free_stable_ptr(HsStablePtr sp)
+{
+    /* The cast is for clarity only, both HsStablePtr and StgStablePtr are
+       typedefs for void*. */
+    freeStablePtr((StgStablePtr)sp);
+}
+
+/* Release a function pointer created by a "foreign export dynamic"
+ * style wrapper (an adjustor); forwards to freeHaskellFunctionPtr(). */
+void
+hs_free_fun_ptr(HsFunPtr fp)
+{
+    /* I simply *love* all these similar names... */
+    freeHaskellFunctionPtr(fp);
+}
diff --git a/rts/Interpreter.c b/rts/Interpreter.c
new file mode 100644
index 0000000000..56e9bb67ce
--- /dev/null
+++ b/rts/Interpreter.c
@@ -0,0 +1,1261 @@
+/* -----------------------------------------------------------------------------
+ * Bytecode interpreter
+ *
+ * Copyright (c) The GHC Team, 1994-2002.
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsAPI.h"
+#include "RtsUtils.h"
+#include "Closures.h"
+#include "TSO.h"
+#include "Schedule.h"
+#include "RtsFlags.h"
+#include "Storage.h"
+#include "LdvProfile.h"
+#include "Updates.h"
+#include "Sanity.h"
+#include "Liveness.h"
+
+#include "Bytecodes.h"
+#include "Printer.h"
+#include "Disassembler.h"
+#include "Interpreter.h"
+
+#include <string.h> /* for memcpy */
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+
+
+/* --------------------------------------------------------------------------
+ * The bytecode interpreter
+ * ------------------------------------------------------------------------*/
+
+/* Gather stats about entry, opcode, opcode-pair frequencies. For
+ tuning the interpreter. */
+
+/* #define INTERP_STATS */
+
+
+/* Sp points to the lowest live word on the stack. */
+
+#define BCO_NEXT instrs[bciPtr++]
+#define BCO_PTR(n) (W_)ptrs[n]
+#define BCO_LIT(n) literals[n]
+#define BCO_ITBL(n) itbls[n]
+
+#define LOAD_STACK_POINTERS \
+ Sp = cap->r.rCurrentTSO->sp; \
+ /* We don't change this ... */ \
+ SpLim = cap->r.rCurrentTSO->stack + RESERVED_STACK_WORDS;
+
+#define SAVE_STACK_POINTERS \
+ cap->r.rCurrentTSO->sp = Sp
+
+#define RETURN_TO_SCHEDULER(todo,retcode) \
+ SAVE_STACK_POINTERS; \
+ cap->r.rCurrentTSO->what_next = (todo); \
+ threadPaused(cap,cap->r.rCurrentTSO); \
+ cap->r.rRet = (retcode); \
+ return cap;
+
+#define RETURN_TO_SCHEDULER_NO_PAUSE(todo,retcode) \
+ SAVE_STACK_POINTERS; \
+ cap->r.rCurrentTSO->what_next = (todo); \
+ cap->r.rRet = (retcode); \
+ return cap;
+
+
+/* Allocate space for a non-updatable closure of n_words, rounding the
+ * request up to the minimum closure size (header + MIN_PAYLOAD_SIZE) --
+ * presumably so every heap object meets the RTS's minimum object size;
+ * confirm against the storage-manager invariants. */
+STATIC_INLINE StgPtr
+allocate_NONUPD (int n_words)
+{
+    return allocate(stg_max(sizeofW(StgHeader)+MIN_PAYLOAD_SIZE, n_words));
+}
+
+
+#ifdef INTERP_STATS
+
+/* Hacky stats, for tuning the interpreter ... */
+int it_unknown_entries[N_CLOSURE_TYPES];
+int it_total_unknown_entries;
+int it_total_entries;
+
+int it_retto_BCO;
+int it_retto_UPDATE;
+int it_retto_other;
+
+int it_slides;
+int it_insns;
+int it_BCO_entries;
+
+int it_ofreq[27];
+int it_oofreq[27][27];
+int it_lastopc;
+
+#define INTERP_TICK(n) (n)++
+
+/* Reset all INTERP_STATS counters (entry counts, opcode frequencies and
+ * opcode-pair frequencies) before interpretation begins.  The constant
+ * 27 matches the opcode-frequency table sizes above -- presumably the
+ * number of bytecode instructions; confirm against Bytecodes.h. */
+void interp_startup ( void )
+{
+    int i, j;
+    it_retto_BCO = it_retto_UPDATE = it_retto_other = 0;
+    it_total_entries = it_total_unknown_entries = 0;
+    for (i = 0; i < N_CLOSURE_TYPES; i++)
+        it_unknown_entries[i] = 0;
+    it_slides = it_insns = it_BCO_entries = 0;
+    for (i = 0; i < 27; i++) it_ofreq[i] = 0;
+    for (i = 0; i < 27; i++)
+        for (j = 0; j < 27; j++)
+            it_oofreq[i][j] = 0;
+    it_lastopc = 0;
+}
+
+/* Dump the INTERP_STATS counters gathered during interpretation:
+ * return-address classes, unknown-closure entries by closure type,
+ * per-opcode frequencies, and finally the 19 most frequent opcode
+ * pairs (found by repeated extract-max over it_oofreq, zeroing each
+ * maximum so the next iteration finds the runner-up). */
+void interp_shutdown ( void )
+{
+    int i, j, k, o_max, i_max, j_max;
+    debugBelch("%d constrs entered -> (%d BCO, %d UPD, %d ??? )\n",
+                   it_retto_BCO + it_retto_UPDATE + it_retto_other,
+                   it_retto_BCO, it_retto_UPDATE, it_retto_other );
+    debugBelch("%d total entries, %d unknown entries \n",
+                   it_total_entries, it_total_unknown_entries);
+    for (i = 0; i < N_CLOSURE_TYPES; i++) {
+        if (it_unknown_entries[i] == 0) continue;
+        debugBelch("   type %2d: unknown entries (%4.1f%%) == %d\n",
+                i, 100.0 * ((double)it_unknown_entries[i]) /
+                       ((double)it_total_unknown_entries),
+                it_unknown_entries[i]);
+    }
+    debugBelch("%d insns, %d slides, %d BCO_entries\n",
+                   it_insns, it_slides, it_BCO_entries);
+    for (i = 0; i < 27; i++)
+        debugBelch("opcode %2d got %d\n", i, it_ofreq[i] );
+
+    /* Report the top 19 opcode pairs by frequency. */
+    for (k = 1; k < 20; k++) {
+        o_max = 0;
+        i_max = j_max = 0;
+        for (i = 0; i < 27; i++) {
+            for (j = 0; j < 27; j++) {
+                if (it_oofreq[i][j] > o_max) {
+                    o_max = it_oofreq[i][j];
+                    i_max = i; j_max = j;
+                }
+            }
+        }
+
+        debugBelch("%d:  count (%4.1f%%) %6d   is %d then %d\n",
+                k, ((double)o_max) * 100.0 / ((double)it_insns), o_max,
+                   i_max, j_max );
+        it_oofreq[i_max][j_max] = 0;
+
+    }
+}
+
+#else // !INTERP_STATS
+
+#define INTERP_TICK(n) /* nothing */
+
+#endif
+
+/* Info tables for the all-pointer apply frames, indexed by
+ * (number of pointer arguments - 1).  Used in do_apply when an
+ * over-saturated call leaves excess pointer args on the stack and
+ * the matching stg_ap_p..pppppp frame must be inserted. */
+static StgWord app_ptrs_itbl[] = {
+    (W_)&stg_ap_p_info,
+    (W_)&stg_ap_pp_info,
+    (W_)&stg_ap_ppp_info,
+    (W_)&stg_ap_pppp_info,
+    (W_)&stg_ap_ppppp_info,
+    (W_)&stg_ap_pppppp_info,
+};
+
+Capability *
+interpretBCO (Capability* cap)
+{
+ // Use of register here is primarily to make it clear to compilers
+ // that these entities are non-aliasable.
+ register StgPtr Sp; // local state -- stack pointer
+ register StgPtr SpLim; // local state -- stack lim pointer
+ register StgClosure* obj;
+ nat n, m;
+
+ LOAD_STACK_POINTERS;
+
+ // ------------------------------------------------------------------------
+ // Case 1:
+ //
+ // We have a closure to evaluate. Stack looks like:
+ //
+ // | XXXX_info |
+ // +---------------+
+ // Sp | -------------------> closure
+ // +---------------+
+ //
+ if (Sp[0] == (W_)&stg_enter_info) {
+ Sp++;
+ goto eval;
+ }
+
+ // ------------------------------------------------------------------------
+ // Case 2:
+ //
+ // We have a BCO application to perform. Stack looks like:
+ //
+ // | .... |
+ // +---------------+
+ // | arg1 |
+ // +---------------+
+ // | BCO |
+ // +---------------+
+ // Sp | RET_BCO |
+ // +---------------+
+ //
+ else if (Sp[0] == (W_)&stg_apply_interp_info) {
+ obj = (StgClosure *)Sp[1];
+ Sp += 2;
+ goto run_BCO_fun;
+ }
+
+ // ------------------------------------------------------------------------
+ // Case 3:
+ //
+ // We have an unboxed value to return. See comment before
+ // do_return_unboxed, below.
+ //
+ else {
+ goto do_return_unboxed;
+ }
+
+ // Evaluate the object on top of the stack.
+eval:
+ obj = (StgClosure*)Sp[0]; Sp++;
+
+eval_obj:
+ INTERP_TICK(it_total_evals);
+
+ IF_DEBUG(interpreter,
+ debugBelch(
+ "\n---------------------------------------------------------------\n");
+ debugBelch("Evaluating: "); printObj(obj);
+ debugBelch("Sp = %p\n", Sp);
+ debugBelch("\n" );
+
+ printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size);
+ debugBelch("\n\n");
+ );
+
+ IF_DEBUG(sanity,checkStackChunk(Sp, cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size));
+
+ switch ( get_itbl(obj)->type ) {
+
+ case IND:
+ case IND_OLDGEN:
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_STATIC:
+ {
+ obj = ((StgInd*)obj)->indirectee;
+ goto eval_obj;
+ }
+
+ case CONSTR:
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_2_0:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:
+ case FUN:
+ case FUN_1_0:
+ case FUN_0_1:
+ case FUN_2_0:
+ case FUN_1_1:
+ case FUN_0_2:
+ case FUN_STATIC:
+ case PAP:
+ // already in WHNF
+ break;
+
+ case BCO:
+ ASSERT(((StgBCO *)obj)->arity > 0);
+ break;
+
+ case AP: /* Copied from stg_AP_entry. */
+ {
+ nat i, words;
+ StgAP *ap;
+
+ ap = (StgAP*)obj;
+ words = ap->n_args;
+
+ // Stack check
+ if (Sp - (words+sizeofW(StgUpdateFrame)) < SpLim) {
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER(ThreadInterpret, StackOverflow);
+ }
+
+ /* Ok; we're safe. Party on. Push an update frame. */
+ Sp -= sizeofW(StgUpdateFrame);
+ {
+ StgUpdateFrame *__frame;
+ __frame = (StgUpdateFrame *)Sp;
+ SET_INFO(__frame, (StgInfoTable *)&stg_upd_frame_info);
+ __frame->updatee = (StgClosure *)(ap);
+ }
+
+ /* Reload the stack */
+ Sp -= words;
+ for (i=0; i < words; i++) {
+ Sp[i] = (W_)ap->payload[i];
+ }
+
+ obj = (StgClosure*)ap->fun;
+ ASSERT(get_itbl(obj)->type == BCO);
+ goto run_BCO_fun;
+ }
+
+ default:
+#ifdef INTERP_STATS
+ {
+ int j;
+
+ j = get_itbl(obj)->type;
+ ASSERT(j >= 0 && j < N_CLOSURE_TYPES);
+ it_unknown_entries[j]++;
+ it_total_unknown_entries++;
+ }
+#endif
+ {
+ // Can't handle this object; yield to scheduler
+ IF_DEBUG(interpreter,
+ debugBelch("evaluating unknown closure -- yielding to sched\n");
+ printObj(obj);
+ );
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ // We now have an evaluated object (obj). The next thing to
+ // do is return it to the stack frame on top of the stack.
+do_return:
+ ASSERT(closure_HNF(obj));
+
+ IF_DEBUG(interpreter,
+ debugBelch(
+ "\n---------------------------------------------------------------\n");
+ debugBelch("Returning: "); printObj(obj);
+ debugBelch("Sp = %p\n", Sp);
+ debugBelch("\n" );
+ printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size);
+ debugBelch("\n\n");
+ );
+
+ IF_DEBUG(sanity,checkStackChunk(Sp, cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size));
+
+ switch (get_itbl((StgClosure *)Sp)->type) {
+
+ case RET_SMALL: {
+ const StgInfoTable *info;
+
+ // NOTE: not using get_itbl().
+ info = ((StgClosure *)Sp)->header.info;
+ if (info == (StgInfoTable *)&stg_ap_v_info) {
+ n = 1; m = 0; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_f_info) {
+ n = 1; m = 1; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_d_info) {
+ n = 1; m = sizeofW(StgDouble); goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_l_info) {
+ n = 1; m = sizeofW(StgInt64); goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_n_info) {
+ n = 1; m = 1; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_p_info) {
+ n = 1; m = 1; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_pp_info) {
+ n = 2; m = 2; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_ppp_info) {
+ n = 3; m = 3; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_pppp_info) {
+ n = 4; m = 4; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_ppppp_info) {
+ n = 5; m = 5; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_pppppp_info) {
+ n = 6; m = 6; goto do_apply;
+ }
+ goto do_return_unrecognised;
+ }
+
+ case UPDATE_FRAME:
+ // Returning to an update frame: do the update, pop the update
+ // frame, and continue with the next stack frame.
+ INTERP_TICK(it_retto_UPDATE);
+ UPD_IND(((StgUpdateFrame *)Sp)->updatee, obj);
+ Sp += sizeofW(StgUpdateFrame);
+ goto do_return;
+
+ case RET_BCO:
+ // Returning to an interpreted continuation: put the object on
+ // the stack, and start executing the BCO.
+ INTERP_TICK(it_retto_BCO);
+ Sp--;
+ Sp[0] = (W_)obj;
+ obj = (StgClosure*)Sp[2];
+ ASSERT(get_itbl(obj)->type == BCO);
+ goto run_BCO_return;
+
+ default:
+ do_return_unrecognised:
+ {
+ // Can't handle this return address; yield to scheduler
+ INTERP_TICK(it_retto_other);
+ IF_DEBUG(interpreter,
+ debugBelch("returning to unknown frame -- yielding to sched\n");
+ printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size);
+ );
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
+ }
+ }
+
+ // -------------------------------------------------------------------------
+ // Returning an unboxed value. The stack looks like this:
+ //
+ // | .... |
+ // +---------------+
+ // | fv2 |
+ // +---------------+
+ // | fv1 |
+ // +---------------+
+ // | BCO |
+ // +---------------+
+ // | stg_ctoi_ret_ |
+ // +---------------+
+ // | retval |
+ // +---------------+
+ // | XXXX_info |
+ // +---------------+
+ //
+ // where XXXX_info is one of the stg_gc_unbx_r1_info family.
+ //
+ // We're only interested in the case when the real return address
+ // is a BCO; otherwise we'll return to the scheduler.
+
+do_return_unboxed:
+ {
+ int offset;
+
+ ASSERT( Sp[0] == (W_)&stg_gc_unbx_r1_info
+ || Sp[0] == (W_)&stg_gc_unpt_r1_info
+ || Sp[0] == (W_)&stg_gc_f1_info
+ || Sp[0] == (W_)&stg_gc_d1_info
+ || Sp[0] == (W_)&stg_gc_l1_info
+ || Sp[0] == (W_)&stg_gc_void_info // VoidRep
+ );
+
+ // get the offset of the stg_ctoi_ret_XXX itbl
+ offset = stack_frame_sizeW((StgClosure *)Sp);
+
+ switch (get_itbl((StgClosure *)Sp+offset)->type) {
+
+ case RET_BCO:
+ // Returning to an interpreted continuation: put the object on
+ // the stack, and start executing the BCO.
+ INTERP_TICK(it_retto_BCO);
+ obj = (StgClosure*)Sp[offset+1];
+ ASSERT(get_itbl(obj)->type == BCO);
+ goto run_BCO_return_unboxed;
+
+ default:
+ {
+ // Can't handle this return address; yield to scheduler
+ INTERP_TICK(it_retto_other);
+ IF_DEBUG(interpreter,
+ debugBelch("returning to unknown frame -- yielding to sched\n");
+ printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size);
+ );
+ RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
+ }
+ }
+ }
+ // not reached.
+
+
+ // -------------------------------------------------------------------------
+ // Application...
+
+do_apply:
+ // we have a function to apply (obj), and n arguments taking up m
+ // words on the stack. The info table (stg_ap_pp_info or whatever)
+ // is on top of the arguments on the stack.
+ {
+ switch (get_itbl(obj)->type) {
+
+ case PAP: {
+ StgPAP *pap;
+ nat i, arity;
+
+ pap = (StgPAP *)obj;
+
+ // we only cope with PAPs whose function is a BCO
+ if (get_itbl(pap->fun)->type != BCO) {
+ goto defer_apply_to_sched;
+ }
+
+ Sp++;
+ arity = pap->arity;
+ ASSERT(arity > 0);
+ if (arity < n) {
+ // n must be greater than 1, and the only kinds of
+ // application we support with more than one argument
+ // are all pointers...
+ //
+ // Shuffle the args for this function down, and put
+ // the appropriate info table in the gap.
+ for (i = 0; i < arity; i++) {
+ Sp[(int)i-1] = Sp[i];
+		    // ^^^^^ careful, i-1 might be negative, but i is unsigned
+ }
+ Sp[arity-1] = app_ptrs_itbl[n-arity-1];
+ Sp--;
+ // unpack the PAP's arguments onto the stack
+ Sp -= pap->n_args;
+ for (i = 0; i < pap->n_args; i++) {
+ Sp[i] = (W_)pap->payload[i];
+ }
+ obj = pap->fun;
+ goto run_BCO_fun;
+ }
+ else if (arity == n) {
+ Sp -= pap->n_args;
+ for (i = 0; i < pap->n_args; i++) {
+ Sp[i] = (W_)pap->payload[i];
+ }
+ obj = pap->fun;
+ goto run_BCO_fun;
+ }
+ else /* arity > n */ {
+ // build a new PAP and return it.
+ StgPAP *new_pap;
+ new_pap = (StgPAP *)allocate(PAP_sizeW(pap->n_args + m));
+ SET_HDR(new_pap,&stg_PAP_info,CCCS);
+ new_pap->arity = pap->arity - n;
+ new_pap->n_args = pap->n_args + m;
+ new_pap->fun = pap->fun;
+ for (i = 0; i < pap->n_args; i++) {
+ new_pap->payload[i] = pap->payload[i];
+ }
+ for (i = 0; i < m; i++) {
+ new_pap->payload[pap->n_args + i] = (StgClosure *)Sp[i];
+ }
+ obj = (StgClosure *)new_pap;
+ Sp += m;
+ goto do_return;
+ }
+ }
+
+ case BCO: {
+ nat arity, i;
+
+ Sp++;
+ arity = ((StgBCO *)obj)->arity;
+ ASSERT(arity > 0);
+ if (arity < n) {
+ // n must be greater than 1, and the only kinds of
+ // application we support with more than one argument
+ // are all pointers...
+ //
+ // Shuffle the args for this function down, and put
+ // the appropriate info table in the gap.
+ for (i = 0; i < arity; i++) {
+ Sp[(int)i-1] = Sp[i];
+		    // ^^^^^ careful, i-1 might be negative, but i is unsigned
+ }
+ Sp[arity-1] = app_ptrs_itbl[n-arity-1];
+ Sp--;
+ goto run_BCO_fun;
+ }
+ else if (arity == n) {
+ goto run_BCO_fun;
+ }
+ else /* arity > n */ {
+ // build a PAP and return it.
+ StgPAP *pap;
+ nat i;
+ pap = (StgPAP *)allocate(PAP_sizeW(m));
+ SET_HDR(pap, &stg_PAP_info,CCCS);
+ pap->arity = arity - n;
+ pap->fun = obj;
+ pap->n_args = m;
+ for (i = 0; i < m; i++) {
+ pap->payload[i] = (StgClosure *)Sp[i];
+ }
+ obj = (StgClosure *)pap;
+ Sp += m;
+ goto do_return;
+ }
+ }
+
+ // No point in us applying machine-code functions
+ default:
+ defer_apply_to_sched:
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
+ }
+
+ // ------------------------------------------------------------------------
+ // Ok, we now have a bco (obj), and its arguments are all on the
+ // stack. We can start executing the byte codes.
+ //
+ // The stack is in one of two states. First, if this BCO is a
+ // function:
+ //
+ // | .... |
+ // +---------------+
+ // | arg2 |
+ // +---------------+
+ // | arg1 |
+ // +---------------+
+ //
+ // Second, if this BCO is a continuation:
+ //
+ // | .... |
+ // +---------------+
+ // | fv2 |
+ // +---------------+
+ // | fv1 |
+ // +---------------+
+ // | BCO |
+ // +---------------+
+ // | stg_ctoi_ret_ |
+ // +---------------+
+ // | retval |
+ // +---------------+
+ //
+ // where retval is the value being returned to this continuation.
+ // In the event of a stack check, heap check, or context switch,
+ // we need to leave the stack in a sane state so the garbage
+ // collector can find all the pointers.
+ //
+ // (1) BCO is a function: the BCO's bitmap describes the
+ // pointerhood of the arguments.
+ //
+ // (2) BCO is a continuation: BCO's bitmap describes the
+ // pointerhood of the free variables.
+ //
+ // Sadly we have three different kinds of stack/heap/cswitch check
+ // to do:
+
+run_BCO_return:
+ // Heap check
+ if (doYouWantToGC()) {
+ Sp--; Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER(ThreadInterpret, HeapOverflow);
+ }
+ // Stack checks aren't necessary at return points, the stack use
+ // is aggregated into the enclosing function entry point.
+ goto run_BCO;
+
+run_BCO_return_unboxed:
+ // Heap check
+ if (doYouWantToGC()) {
+ RETURN_TO_SCHEDULER(ThreadInterpret, HeapOverflow);
+ }
+ // Stack checks aren't necessary at return points, the stack use
+ // is aggregated into the enclosing function entry point.
+ goto run_BCO;
+
+run_BCO_fun:
+ IF_DEBUG(sanity,
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_apply_interp_info;
+ checkStackChunk(Sp,SpLim);
+ Sp += 2;
+ );
+
+ // Heap check
+ if (doYouWantToGC()) {
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_apply_interp_info; // placeholder, really
+ RETURN_TO_SCHEDULER(ThreadInterpret, HeapOverflow);
+ }
+
+ // Stack check
+ if (Sp - INTERP_STACK_CHECK_THRESH < SpLim) {
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_apply_interp_info; // placeholder, really
+ RETURN_TO_SCHEDULER(ThreadInterpret, StackOverflow);
+ }
+ goto run_BCO;
+
+ // Now, actually interpret the BCO... (no returning to the
+ // scheduler again until the stack is in an orderly state).
+run_BCO:
+ INTERP_TICK(it_BCO_entries);
+ {
+ register int bciPtr = 1; /* instruction pointer */
+ register StgBCO* bco = (StgBCO*)obj;
+ register StgWord16* instrs = (StgWord16*)(bco->instrs->payload);
+ register StgWord* literals = (StgWord*)(&bco->literals->payload[0]);
+ register StgPtr* ptrs = (StgPtr*)(&bco->ptrs->payload[0]);
+ register StgInfoTable** itbls = (StgInfoTable**)
+ (&bco->itbls->payload[0]);
+
+#ifdef INTERP_STATS
+ it_lastopc = 0; /* no opcode */
+#endif
+
+ nextInsn:
+ ASSERT(bciPtr <= instrs[0]);
+ IF_DEBUG(interpreter,
+ //if (do_print_stack) {
+ //debugBelch("\n-- BEGIN stack\n");
+ //printStack(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size,iSu);
+ //debugBelch("-- END stack\n\n");
+ //}
+ debugBelch("Sp = %p pc = %d ", Sp, bciPtr);
+ disInstr(bco,bciPtr);
+ if (0) { int i;
+ debugBelch("\n");
+ for (i = 8; i >= 0; i--) {
+ debugBelch("%d %p\n", i, (StgPtr)(*(Sp+i)));
+ }
+ debugBelch("\n");
+ }
+ //if (do_print_stack) checkStack(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size,iSu);
+ );
+
+ INTERP_TICK(it_insns);
+
+#ifdef INTERP_STATS
+ ASSERT( (int)instrs[bciPtr] >= 0 && (int)instrs[bciPtr] < 27 );
+ it_ofreq[ (int)instrs[bciPtr] ] ++;
+ it_oofreq[ it_lastopc ][ (int)instrs[bciPtr] ] ++;
+ it_lastopc = (int)instrs[bciPtr];
+#endif
+
+ switch (BCO_NEXT) {
+
+ case bci_STKCHECK: {
+ // Explicit stack check at the beginning of a function
+ // *only* (stack checks in case alternatives are
+ // propagated to the enclosing function).
+ int stk_words_reqd = BCO_NEXT + 1;
+ if (Sp - stk_words_reqd < SpLim) {
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_apply_interp_info;
+ RETURN_TO_SCHEDULER(ThreadInterpret, StackOverflow);
+ } else {
+ goto nextInsn;
+ }
+ }
+
+ case bci_PUSH_L: {
+ int o1 = BCO_NEXT;
+ Sp[-1] = Sp[o1];
+ Sp--;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_LL: {
+ int o1 = BCO_NEXT;
+ int o2 = BCO_NEXT;
+ Sp[-1] = Sp[o1];
+ Sp[-2] = Sp[o2];
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_LLL: {
+ int o1 = BCO_NEXT;
+ int o2 = BCO_NEXT;
+ int o3 = BCO_NEXT;
+ Sp[-1] = Sp[o1];
+ Sp[-2] = Sp[o2];
+ Sp[-3] = Sp[o3];
+ Sp -= 3;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_G: {
+ int o1 = BCO_NEXT;
+ Sp[-1] = BCO_PTR(o1);
+ Sp -= 1;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_R1p_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_P: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_R1unpt_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_N: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_R1n_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_F: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_F1_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_D: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_D1_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_L: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_L1_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_V: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_V_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_APPLY_N:
+ Sp--; Sp[0] = (W_)&stg_ap_n_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_V:
+ Sp--; Sp[0] = (W_)&stg_ap_v_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_F:
+ Sp--; Sp[0] = (W_)&stg_ap_f_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_D:
+ Sp--; Sp[0] = (W_)&stg_ap_d_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_L:
+ Sp--; Sp[0] = (W_)&stg_ap_l_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_P:
+ Sp--; Sp[0] = (W_)&stg_ap_p_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_PP:
+ Sp--; Sp[0] = (W_)&stg_ap_pp_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_PPP:
+ Sp--; Sp[0] = (W_)&stg_ap_ppp_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_PPPP:
+ Sp--; Sp[0] = (W_)&stg_ap_pppp_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_PPPPP:
+ Sp--; Sp[0] = (W_)&stg_ap_ppppp_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_PPPPPP:
+ Sp--; Sp[0] = (W_)&stg_ap_pppppp_info;
+ goto nextInsn;
+
+ case bci_PUSH_UBX: {
+ int i;
+ int o_lits = BCO_NEXT;
+ int n_words = BCO_NEXT;
+ Sp -= n_words;
+ for (i = 0; i < n_words; i++) {
+ Sp[i] = (W_)BCO_LIT(o_lits+i);
+ }
+ goto nextInsn;
+ }
+
+ case bci_SLIDE: {
+ int n = BCO_NEXT;
+ int by = BCO_NEXT;
+ /* a_1, .. a_n, b_1, .. b_by, s => a_1, .. a_n, s */
+ while(--n >= 0) {
+ Sp[n+by] = Sp[n];
+ }
+ Sp += by;
+ INTERP_TICK(it_slides);
+ goto nextInsn;
+ }
+
+ case bci_ALLOC_AP: {
+ StgAP* ap;
+ int n_payload = BCO_NEXT;
+ ap = (StgAP*)allocate(AP_sizeW(n_payload));
+ Sp[-1] = (W_)ap;
+ ap->n_args = n_payload;
+ SET_HDR(ap, &stg_AP_info, CCS_SYSTEM/*ToDo*/)
+ Sp --;
+ goto nextInsn;
+ }
+
+ case bci_ALLOC_PAP: {
+ StgPAP* pap;
+ int arity = BCO_NEXT;
+ int n_payload = BCO_NEXT;
+ pap = (StgPAP*)allocate(PAP_sizeW(n_payload));
+ Sp[-1] = (W_)pap;
+ pap->n_args = n_payload;
+ pap->arity = arity;
+ SET_HDR(pap, &stg_PAP_info, CCS_SYSTEM/*ToDo*/)
+ Sp --;
+ goto nextInsn;
+ }
+
+ case bci_MKAP: {
+ int i;
+ int stkoff = BCO_NEXT;
+ int n_payload = BCO_NEXT;
+ StgAP* ap = (StgAP*)Sp[stkoff];
+ ASSERT((int)ap->n_args == n_payload);
+ ap->fun = (StgClosure*)Sp[0];
+
+ // The function should be a BCO, and its bitmap should
+ // cover the payload of the AP correctly.
+ ASSERT(get_itbl(ap->fun)->type == BCO
+ && BCO_BITMAP_SIZE(ap->fun) == ap->n_args);
+
+ for (i = 0; i < n_payload; i++)
+ ap->payload[i] = (StgClosure*)Sp[i+1];
+ Sp += n_payload+1;
+ IF_DEBUG(interpreter,
+ debugBelch("\tBuilt ");
+ printObj((StgClosure*)ap);
+ );
+ goto nextInsn;
+ }
+
+ case bci_MKPAP: {
+ int i;
+ int stkoff = BCO_NEXT;
+ int n_payload = BCO_NEXT;
+ StgPAP* pap = (StgPAP*)Sp[stkoff];
+ ASSERT((int)pap->n_args == n_payload);
+ pap->fun = (StgClosure*)Sp[0];
+
+ // The function should be a BCO
+ ASSERT(get_itbl(pap->fun)->type == BCO);
+
+ for (i = 0; i < n_payload; i++)
+ pap->payload[i] = (StgClosure*)Sp[i+1];
+ Sp += n_payload+1;
+ IF_DEBUG(interpreter,
+ debugBelch("\tBuilt ");
+ printObj((StgClosure*)pap);
+ );
+ goto nextInsn;
+ }
+
+ case bci_UNPACK: {
+ /* Unpack N ptr words from t.o.s constructor */
+ int i;
+ int n_words = BCO_NEXT;
+ StgClosure* con = (StgClosure*)Sp[0];
+ Sp -= n_words;
+ for (i = 0; i < n_words; i++) {
+ Sp[i] = (W_)con->payload[i];
+ }
+ goto nextInsn;
+ }
+
+ case bci_PACK: {
+ int i;
+ int o_itbl = BCO_NEXT;
+ int n_words = BCO_NEXT;
+ StgInfoTable* itbl = INFO_PTR_TO_STRUCT(BCO_ITBL(o_itbl));
+ int request = CONSTR_sizeW( itbl->layout.payload.ptrs,
+ itbl->layout.payload.nptrs );
+ StgClosure* con = (StgClosure*)allocate_NONUPD(request);
+ ASSERT( itbl->layout.payload.ptrs + itbl->layout.payload.nptrs > 0);
+ SET_HDR(con, BCO_ITBL(o_itbl), CCS_SYSTEM/*ToDo*/);
+ for (i = 0; i < n_words; i++) {
+ con->payload[i] = (StgClosure*)Sp[i];
+ }
+ Sp += n_words;
+ Sp --;
+ Sp[0] = (W_)con;
+ IF_DEBUG(interpreter,
+ debugBelch("\tBuilt ");
+ printObj((StgClosure*)con);
+ );
+ goto nextInsn;
+ }
+
+ case bci_TESTLT_P: {
+ unsigned int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgClosure* con = (StgClosure*)Sp[0];
+ if (GET_TAG(con) >= discr) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTEQ_P: {
+ unsigned int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgClosure* con = (StgClosure*)Sp[0];
+ if (GET_TAG(con) != discr) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTLT_I: {
+ // There should be an Int at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ I_ stackInt = (I_)Sp[1];
+ if (stackInt >= (I_)BCO_LIT(discr))
+ bciPtr = failto;
+ goto nextInsn;
+ }
+
+ case bci_TESTEQ_I: {
+ // There should be an Int at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ I_ stackInt = (I_)Sp[1];
+ if (stackInt != (I_)BCO_LIT(discr)) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTLT_D: {
+ // There should be a Double at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgDouble stackDbl, discrDbl;
+ stackDbl = PK_DBL( & Sp[1] );
+ discrDbl = PK_DBL( & BCO_LIT(discr) );
+ if (stackDbl >= discrDbl) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTEQ_D: {
+ // There should be a Double at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgDouble stackDbl, discrDbl;
+ stackDbl = PK_DBL( & Sp[1] );
+ discrDbl = PK_DBL( & BCO_LIT(discr) );
+ if (stackDbl != discrDbl) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTLT_F: {
+ // There should be a Float at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgFloat stackFlt, discrFlt;
+ stackFlt = PK_FLT( & Sp[1] );
+ discrFlt = PK_FLT( & BCO_LIT(discr) );
+ if (stackFlt >= discrFlt) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTEQ_F: {
+ // There should be a Float at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgFloat stackFlt, discrFlt;
+ stackFlt = PK_FLT( & Sp[1] );
+ discrFlt = PK_FLT( & BCO_LIT(discr) );
+ if (stackFlt != discrFlt) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ // Control-flow ish things
+ case bci_ENTER:
+ // Context-switch check. We put it here to ensure that
+ // the interpreter has done at least *some* work before
+ // context switching: sometimes the scheduler can invoke
+ // the interpreter with context_switch == 1, particularly
+ // if the -C0 flag has been given on the cmd line.
+ if (context_switch) {
+ Sp--; Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER(ThreadInterpret, ThreadYielding);
+ }
+ goto eval;
+
+ case bci_RETURN:
+ obj = (StgClosure *)Sp[0];
+ Sp++;
+ goto do_return;
+
+ case bci_RETURN_P:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_unpt_r1_info;
+ goto do_return_unboxed;
+ case bci_RETURN_N:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_unbx_r1_info;
+ goto do_return_unboxed;
+ case bci_RETURN_F:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_f1_info;
+ goto do_return_unboxed;
+ case bci_RETURN_D:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_d1_info;
+ goto do_return_unboxed;
+ case bci_RETURN_L:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_l1_info;
+ goto do_return_unboxed;
+ case bci_RETURN_V:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_void_info;
+ goto do_return_unboxed;
+
+ case bci_SWIZZLE: {
+ int stkoff = BCO_NEXT;
+ signed short n = (signed short)(BCO_NEXT);
+ Sp[stkoff] += (W_)n;
+ goto nextInsn;
+ }
+
+ case bci_CCALL: {
+ void *tok;
+ int stk_offset = BCO_NEXT;
+ int o_itbl = BCO_NEXT;
+ void(*marshall_fn)(void*) = (void (*)(void*))BCO_LIT(o_itbl);
+ int ret_dyn_size =
+ RET_DYN_BITMAP_SIZE + RET_DYN_NONPTR_REGS_SIZE
+ + sizeofW(StgRetDyn);
+
+#ifdef THREADED_RTS
+ // Threaded RTS:
+ // Arguments on the TSO stack are not good, because garbage
+ // collection might move the TSO as soon as we call
+ // suspendThread below.
+
+ W_ arguments[stk_offset];
+
+ memcpy(arguments, Sp, sizeof(W_) * stk_offset);
+#endif
+
+ // Restore the Haskell thread's current value of errno
+ errno = cap->r.rCurrentTSO->saved_errno;
+
+ // There are a bunch of non-ptr words on the stack (the
+ // ccall args, the ccall fun address and space for the
+ // result), which we need to cover with an info table
+ // since we might GC during this call.
+ //
+ // We know how many (non-ptr) words there are before the
+ // next valid stack frame: it is the stk_offset arg to the
+ // CCALL instruction. So we build a RET_DYN stack frame
+ // on the stack frame to describe this chunk of stack.
+ //
+ Sp -= ret_dyn_size;
+ ((StgRetDyn *)Sp)->liveness = NO_PTRS | N_NONPTRS(stk_offset);
+ ((StgRetDyn *)Sp)->info = (StgInfoTable *)&stg_gc_gen_info;
+
+ SAVE_STACK_POINTERS;
+ tok = suspendThread(&cap->r);
+
+#ifndef THREADED_RTS
+ // Careful:
+ // suspendThread might have shifted the stack
+ // around (stack squeezing), so we have to grab the real
+ // Sp out of the TSO to find the ccall args again.
+
+ marshall_fn ( (void*)(cap->r.rCurrentTSO->sp + ret_dyn_size) );
+#else
+ // Threaded RTS:
+ // We already made a copy of the arguments above.
+
+ marshall_fn ( arguments );
+#endif
+
+ // And restart the thread again, popping the RET_DYN frame.
+ cap = (Capability *)((void *)((unsigned char*)resumeThread(tok) - sizeof(StgFunTable)));
+ LOAD_STACK_POINTERS;
+ Sp += ret_dyn_size;
+
+ // Save the Haskell thread's current value of errno
+ cap->r.rCurrentTSO->saved_errno = errno;
+
+#ifdef THREADED_RTS
+ // Threaded RTS:
+ // Copy the "arguments", which might include a return value,
+ // back to the TSO stack. It would of course be enough to
+ // just copy the return value, but we don't know the offset.
+ memcpy(Sp, arguments, sizeof(W_) * stk_offset);
+#endif
+
+ goto nextInsn;
+ }
+
+ case bci_JMP: {
+ /* BCO_NEXT modifies bciPtr, so be conservative. */
+ int nextpc = BCO_NEXT;
+ bciPtr = nextpc;
+ goto nextInsn;
+ }
+
+ case bci_CASEFAIL:
+ barf("interpretBCO: hit a CASEFAIL");
+
+ // Errors
+ default:
+ barf("interpretBCO: unknown or unimplemented opcode");
+
+ } /* switch on opcode */
+ }
+ }
+
+ barf("interpretBCO: fell off end of the interpreter");
+}
diff --git a/rts/Interpreter.h b/rts/Interpreter.h
new file mode 100644
index 0000000000..d66e636084
--- /dev/null
+++ b/rts/Interpreter.h
@@ -0,0 +1,14 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2002.
+ *
+ * Prototypes for functions in Interpreter.c
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef INTERPRETER_H
+#define INTERPRETER_H
+
+// Entry point of the byte-code interpreter; takes the Capability whose
+// current TSO is to be interpreted and returns a Capability when the
+// thread yields or finishes.  (NOTE(review): exact contract inferred from
+// the prototype only — confirm against Interpreter.c.)
+extern Capability *interpretBCO (Capability* cap);
+
+#endif /* INTERPRETER_H */
diff --git a/rts/LdvProfile.c b/rts/LdvProfile.c
new file mode 100644
index 0000000000..19ebe426d3
--- /dev/null
+++ b/rts/LdvProfile.c
@@ -0,0 +1,342 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Lag/Drag/Void profiling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef PROFILING
+
+#include "Rts.h"
+#include "LdvProfile.h"
+#include "RtsFlags.h"
+#include "Profiling.h"
+#include "Stats.h"
+#include "Storage.h"
+#include "RtsUtils.h"
+#include "Schedule.h"
+
+/* --------------------------------------------------------------------------
+ * Fills in the slop when a *dynamic* closure changes its type.
+ * First calls LDV_recordDead() to declare the closure is dead, and then
+ * fills in the slop.
+ *
+ * Invoked when:
+ * 1) blackholing, UPD_BH_UPDATABLE() and UPD_BH_SINGLE_ENTRY (in
+ * includes/StgMacros.h), threadLazyBlackHole() and
+ * threadSqueezeStack() (in GC.c).
+ * 2) updating with indirection closures, updateWithIndirection()
+ * and updateWithPermIndirection() (in Storage.h).
+ *
+ * LDV_recordDead_FILL_SLOP_DYNAMIC() is not called on 'inherently used'
+ * closures such as TSO. It is not called on PAP because PAP is not updatable.
+ * ----------------------------------------------------------------------- */
+// Declare the dynamic closure p dead to the LDV profiler, then zero the
+// words of its payload beyond a thunk header ("the slop"), so heap walks
+// can skip them.  See the header comment above for the call sites.
+void
+LDV_recordDead_FILL_SLOP_DYNAMIC( StgClosure *p )
+{
+    nat size, i;
+
+#if defined(__GNUC__) && __GNUC__ < 3 && defined(DEBUG)
+#error Please use gcc 3.0+ to compile this file with DEBUG; gcc < 3.0 miscompiles it
+#endif
+
+    // era == 0 means LDV profiling is currently switched off.
+    if (era > 0) {
+	// very like FILL_SLOP(), except that we call LDV_recordDead().
+	size = closure_sizeW(p);
+
+	LDV_recordDead((StgClosure *)(p), size);
+
+	if (size > sizeofW(StgThunkHeader)) {
+	    // Zero everything after the thunk header.
+	    for (i = 0; i < size - sizeofW(StgThunkHeader); i++) {
+		((StgThunk *)(p))->payload[i] = 0;
+	    }
+	}
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * This function is called eventually on every object destroyed during
+ * a garbage collection, whether it is a major garbage collection or
+ * not. If c is an 'inherently used' closure, nothing happens. If c
+ * is an ordinary closure, LDV_recordDead() is called on c with its
+ * proper size which excludes the profiling header portion in the
+ * closure. Returns the size of the closure, including the profiling
+ * header portion, so that the caller can find the next closure.
+ * ----------------------------------------------------------------------- */
+STATIC_INLINE nat
+processHeapClosureForDead( StgClosure *c )
+{
+    nat size;
+    StgInfoTable *info;
+
+    info = get_itbl(c);
+
+    // Sanity check: for a closure that has not been evacuated, the LDV
+    // word must carry a plausible creation era and state (layout is
+    // described in includes/StgLdvProf.h).
+    if (info->type != EVACUATED) {
+	ASSERT(((LDVW(c) & LDV_CREATE_MASK) >> LDV_SHIFT) <= era &&
+	       ((LDVW(c) & LDV_CREATE_MASK) >> LDV_SHIFT) > 0);
+	ASSERT(((LDVW(c) & LDV_STATE_MASK) == LDV_STATE_CREATE) ||
+	       (
+		   (LDVW(c) & LDV_LAST_MASK) <= era &&
+		   (LDVW(c) & LDV_LAST_MASK) > 0
+		   ));
+    }
+
+    if (info->type == EVACUATED) {
+	// The size of the evacuated closure is currently stored in
+	// the LDV field.  See SET_EVACUAEE_FOR_LDV() in
+	// includes/StgLdvProf.h.
+	return LDVW(c);
+    }
+
+    size = closure_sizeW(c);
+
+    switch (info->type) {
+	/*
+	  'inherently used' cases: do nothing.
+	*/
+    case TSO:
+    case MVAR:
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    case MUT_ARR_PTRS_FROZEN:
+    case MUT_ARR_PTRS_FROZEN0:
+    case ARR_WORDS:
+    case WEAK:
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY:
+    case BCO:
+    case STABLE_NAME:
+    case TVAR_WAIT_QUEUE:
+    case TVAR:
+    case TREC_HEADER:
+    case TREC_CHUNK:
+	return size;
+
+	/*
+	  ordinary cases: call LDV_recordDead().
+	*/
+    case THUNK:
+    case THUNK_1_0:
+    case THUNK_0_1:
+    case THUNK_SELECTOR:
+    case THUNK_2_0:
+    case THUNK_1_1:
+    case THUNK_0_2:
+    case AP:
+    case PAP:
+    case AP_STACK:
+    case CONSTR:
+    case CONSTR_1_0:
+    case CONSTR_0_1:
+    case CONSTR_2_0:
+    case CONSTR_1_1:
+    case CONSTR_0_2:
+    case FUN:
+    case FUN_1_0:
+    case FUN_0_1:
+    case FUN_2_0:
+    case FUN_1_1:
+    case FUN_0_2:
+    case BLACKHOLE:
+    case SE_BLACKHOLE:
+    case CAF_BLACKHOLE:
+    case SE_CAF_BLACKHOLE:
+    case IND_PERM:
+    case IND_OLDGEN_PERM:
+	/*
+	  'Ignore' cases
+	*/
+	// Why can we ignore IND/IND_OLDGEN closures? We assume that
+	// any census is preceded by a major garbage collection, which
+	// IND/IND_OLDGEN closures cannot survive. Therefore, it is no
+	// use considering IND/IND_OLDGEN closures in the meanwhile
+	// because they will perish before the next census at any
+	// rate.
+    case IND:
+    case IND_OLDGEN:
+	// Found a dead closure: record its size
+	LDV_recordDead(c, size);
+	return size;
+
+	/*
+	  Error case
+	*/
+	// static objects
+    case IND_STATIC:
+    case CONSTR_STATIC:
+    case FUN_STATIC:
+    case THUNK_STATIC:
+    case CONSTR_INTLIKE:
+    case CONSTR_CHARLIKE:
+    case CONSTR_NOCAF_STATIC:
+	// stack objects
+    case UPDATE_FRAME:
+    case CATCH_FRAME:
+    case STOP_FRAME:
+    case RET_DYN:
+    case RET_BCO:
+    case RET_SMALL:
+    case RET_VEC_SMALL:
+    case RET_BIG:
+    case RET_VEC_BIG:
+	// others
+    case BLOCKED_FETCH:
+    case FETCH_ME:
+    case FETCH_ME_BQ:
+    case RBH:
+    case REMOTE_REF:
+    case INVALID_OBJECT:
+    default:
+	barf("Invalid object in processHeapClosureForDead(): %d", info->type);
+	return 0;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Calls processHeapClosureForDead() on every *dead* closures in the
+ * heap blocks starting at bd.
+ * ----------------------------------------------------------------------- */
+// Walk every closure in the block chain starting at bd, calling
+// processHeapClosureForDead() on each; zero words between closures are
+// slop and are skipped.
+static void
+processHeapForDead( bdescr *bd )
+{
+    StgPtr p;
+
+    while (bd != NULL) {
+	p = bd->start;
+	while (p < bd->free) {
+	    p += processHeapClosureForDead((StgClosure *)p);
+	    while (p < bd->free && !*p)   // skip slop
+		p++;
+	}
+	// The walk must land exactly on the free pointer.
+	ASSERT(p == bd->free);
+	bd = bd->link;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Calls processHeapClosureForDead() on every *dead* closures in the nursery.
+ * ----------------------------------------------------------------------- */
+// Walk the nursery blocks and call processHeapClosureForDead() on each
+// closure found.  NOTE(review): reads MainCapability.r.rNursery only —
+// presumably assumes a single nursery; confirm behaviour under SMP.
+static void
+processNurseryForDead( void )
+{
+    StgPtr p, bdLimit;
+    bdescr *bd;
+
+    bd = MainCapability.r.rNursery->blocks;
+    while (bd->start < bd->free) {
+	p = bd->start;
+	bdLimit = bd->start + BLOCK_SIZE_W;
+	while (p < bd->free && p < bdLimit) {
+	    p += processHeapClosureForDead((StgClosure *)p);
+	    while (p < bd->free && p < bdLimit && !*p)   // skip slop
+		p++;
+	}
+	bd = bd->link;
+	if (bd == NULL)
+	    break;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Calls processHeapClosureForDead() on every *dead* closures in the
+ * small object pool.
+ * ----------------------------------------------------------------------- */
+// Walk the small-object allocation area (small_alloc_list).  The first
+// block is only filled up to alloc_Hp, so it is handled separately from
+// the fully-filled blocks that follow it on the chain.
+static void
+processSmallObjectPoolForDead( void )
+{
+    bdescr *bd;
+    StgPtr p;
+
+    bd = small_alloc_list;
+
+    // first block
+    if (bd == NULL)
+	return;
+
+    // First block: valid data ends at alloc_Hp, not bd->free.
+    p = bd->start;
+    while (p < alloc_Hp) {
+	p += processHeapClosureForDead((StgClosure *)p);
+	while (p < alloc_Hp && !*p)   // skip slop
+	    p++;
+    }
+    ASSERT(p == alloc_Hp);
+
+    // Remaining blocks: valid data ends at bd->free.
+    bd = bd->link;
+    while (bd != NULL) {
+	p = bd->start;
+	while (p < bd->free) {
+	    p += processHeapClosureForDead((StgClosure *)p);
+	    while (p < bd->free && !*p)   // skip slop
+		p++;
+	}
+	ASSERT(p == bd->free);
+	bd = bd->link;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Calls processHeapClosureForDead() on every *dead* closures in the closure
+ * chain.
+ * ----------------------------------------------------------------------- */
+// Process a chain of large-object blocks: each block holds exactly one
+// closure at bd->start, and anything still on this chain after GC is dead.
+static void
+processChainForDead( bdescr *bd )
+{
+    // Any object still in the chain is dead!
+    while (bd != NULL) {
+	processHeapClosureForDead((StgClosure *)bd->start);
+	bd = bd->link;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Start a census for *dead* closures, and calls
+ * processHeapClosureForDead() on every closure which died in the
+ * current garbage collection. This function is called from a garbage
+ * collector right before tidying up, when all dead closures are still
+ * stored in the heap and easy to identify. Generations 0 through N
+ * have just been garbage collected.
+ * ----------------------------------------------------------------------- */
+// Run the dead-closure census over generations 0..N (the generations
+// just collected).  See the header comment above for when this is called.
+void
+LdvCensusForDead( nat N )
+{
+    nat g, s;
+
+    // era == 0 means that LDV profiling is currently turned off.
+    if (era == 0)
+	return;
+
+    if (RtsFlags.GcFlags.generations == 1) {
+	//
+	// Todo: support LDV for two-space garbage collection.
+	//
+	barf("Lag/Drag/Void profiling not supported with -G1");
+    } else {
+	for (g = 0; g <= N; g++)
+	    for (s = 0; s < generations[g].n_steps; s++) {
+		if (g == 0 && s == 0) {
+		    // Generation 0, step 0 lives in the allocation areas,
+		    // not in old_blocks.
+		    processSmallObjectPoolForDead();
+		    processNurseryForDead();
+		    processChainForDead(generations[g].steps[s].large_objects);
+		} else{
+		    processHeapForDead(generations[g].steps[s].old_blocks);
+		    processChainForDead(generations[g].steps[s].large_objects);
+		}
+	    }
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Regard any closure in the current heap as dead or moribund and update
+ * LDV statistics accordingly.
+ * Called from shutdownHaskell() in RtsStartup.c.
+ * Also, stops LDV profiling by resetting ldvTime to 0.
+ * ----------------------------------------------------------------------- */
+// Final census at shutdown: treat every generation as just-collected so
+// that all remaining closures are recorded.
+void
+LdvCensusKillAll( void )
+{
+    LdvCensusForDead(RtsFlags.GcFlags.generations - 1);
+}
+
+#endif /* PROFILING */
diff --git a/rts/LdvProfile.h b/rts/LdvProfile.h
new file mode 100644
index 0000000000..d85b95cd6a
--- /dev/null
+++ b/rts/LdvProfile.h
@@ -0,0 +1,42 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Lag/Drag/Void profiling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef LDVPROFILE_H
+#define LDVPROFILE_H
+
+#ifdef PROFILING
+
+#include "ProfHeap.h"
+
+extern void LDV_recordDead_FILL_SLOP_DYNAMIC( StgClosure *p );
+extern void LdvCensusForDead ( nat );
+extern void LdvCensusKillAll ( void );
+
+// Creates a 0-filled slop of size 'howManyBackwards' backwards from the
+// address 'from'.
+//
+// Invoked when:
+//   1) Hp is incremented and exceeds HpLim (in Updates.hc).
+//   2) copypart() is called (in GC.c).
+//
+// NOTE(review): the macro expands to a bare `if` with no
+// do { ... } while (0) wrapper, so an `else` following a use of this
+// macro would bind to the macro's `if` — confirm all call sites.
+#define LDV_FILL_SLOP(from, howManyBackwards) \
+  if (era > 0) { \
+    int i; \
+    for (i = 0;i < (howManyBackwards); i++) \
+      ((StgWord *)(from))[-i] = 0; \
+  }
+
+// Informs the LDV profiler that closure c has just been evacuated.
+// Evacuated objects are no longer needed, so we just store its original size in
+// the LDV field.
+#define SET_EVACUAEE_FOR_LDV(c, size) \
+    LDVW((c)) = (size)
+
+#endif /* PROFILING */
+
+#endif /* LDVPROFILE_H */
diff --git a/rts/Linker.c b/rts/Linker.c
new file mode 100644
index 0000000000..92d0106def
--- /dev/null
+++ b/rts/Linker.c
@@ -0,0 +1,4315 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2000-2004
+ *
+ * RTS Object Linker
+ *
+ * ---------------------------------------------------------------------------*/
+
+#if 0
+#include "PosixSource.h"
+#endif
+
+/* Linux needs _GNU_SOURCE to get RTLD_DEFAULT from <dlfcn.h> and
+ MREMAP_MAYMOVE from <sys/mman.h>.
+ */
+#ifdef __linux__
+#define _GNU_SOURCE
+#endif
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "HsFFI.h"
+#include "Hash.h"
+#include "Linker.h"
+#include "LinkerInternals.h"
+#include "RtsUtils.h"
+#include "Schedule.h"
+#include "Storage.h"
+#include "Sparks.h"
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+
+#if defined(HAVE_DLFCN_H)
+#include <dlfcn.h>
+#endif
+
+#if defined(cygwin32_HOST_OS)
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#include <regex.h>
+#include <sys/fcntl.h>
+#include <sys/termios.h>
+#include <sys/utime.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
+#endif
+
+#if defined(ia64_HOST_ARCH) || defined(openbsd_HOST_OS) || defined(linux_HOST_OS) || defined(freebsd_HOST_OS)
+#define USE_MMAP
+#include <fcntl.h>
+#include <sys/mman.h>
+
+#if defined(openbsd_HOST_OS) || defined(linux_HOST_OS) || defined(freebsd_HOST_OS)
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#endif
+
+#endif
+
+#if defined(linux_HOST_OS) || defined(solaris2_HOST_OS) || defined(freebsd_HOST_OS) || defined(netbsd_HOST_OS) || defined(openbsd_HOST_OS)
+# define OBJFORMAT_ELF
+#elif defined(cygwin32_HOST_OS) || defined (mingw32_HOST_OS)
+# define OBJFORMAT_PEi386
+# include <windows.h>
+# include <math.h>
+#elif defined(darwin_HOST_OS)
+# define OBJFORMAT_MACHO
+# include <mach-o/loader.h>
+# include <mach-o/nlist.h>
+# include <mach-o/reloc.h>
+# include <mach-o/dyld.h>
+#if defined(powerpc_HOST_ARCH)
+# include <mach-o/ppc/reloc.h>
+#endif
+#endif
+
+/* Hash table mapping symbol names to Symbol */
+static /*Str*/HashTable *symhash;
+
+/* List of currently loaded objects */
+ObjectCode *objects = NULL; /* initially empty */
+
+#if defined(OBJFORMAT_ELF)
+static int ocVerifyImage_ELF ( ObjectCode* oc );
+static int ocGetNames_ELF ( ObjectCode* oc );
+static int ocResolve_ELF ( ObjectCode* oc );
+#if defined(powerpc_HOST_ARCH)
+static int ocAllocateJumpIslands_ELF ( ObjectCode* oc );
+#endif
+#elif defined(OBJFORMAT_PEi386)
+static int ocVerifyImage_PEi386 ( ObjectCode* oc );
+static int ocGetNames_PEi386 ( ObjectCode* oc );
+static int ocResolve_PEi386 ( ObjectCode* oc );
+#elif defined(OBJFORMAT_MACHO)
+static int ocVerifyImage_MachO ( ObjectCode* oc );
+static int ocGetNames_MachO ( ObjectCode* oc );
+static int ocResolve_MachO ( ObjectCode* oc );
+
+static int machoGetMisalignment( FILE * );
+#ifdef powerpc_HOST_ARCH
+static int ocAllocateJumpIslands_MachO ( ObjectCode* oc );
+static void machoInitSymbolsWithoutUnderscore( void );
+#endif
+#endif
+
+#if defined(x86_64_HOST_ARCH)
+static void*x86_64_high_symbol( char *lbl, void *addr );
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Built-in symbols from the RTS
+ */
+
+// A (symbol-name, address) pair for a built-in RTS symbol; tables of
+// these seed the linker's symbol hash (symhash, above).
+typedef struct _RtsSymbolVal {
+    char   *lbl;      // symbol name as it appears in object files
+    void   *addr;     // address of the symbol in the running RTS
+} RtsSymbolVal;
+
+
+#if !defined(PAR)
+#define Maybe_Stable_Names SymX(mkWeakzh_fast) \
+ SymX(makeStableNamezh_fast) \
+ SymX(finalizzeWeakzh_fast)
+#else
+/* These are not available in GUM!!! -- HWL */
+#define Maybe_Stable_Names
+#endif
+
+#if !defined (mingw32_HOST_OS)
+#define RTS_POSIX_ONLY_SYMBOLS \
+ SymX(signal_handlers) \
+ SymX(stg_sig_install) \
+ Sym(nocldstop)
+#endif
+
+#if defined (cygwin32_HOST_OS)
+#define RTS_MINGW_ONLY_SYMBOLS /**/
+/* Don't have the ability to read import libs / archives, so
+ * we have to stupidly list a lot of what libcygwin.a
+ * exports; sigh.
+ */
+#define RTS_CYGWIN_ONLY_SYMBOLS \
+ SymX(regfree) \
+ SymX(regexec) \
+ SymX(regerror) \
+ SymX(regcomp) \
+ SymX(__errno) \
+ SymX(access) \
+ SymX(chmod) \
+ SymX(chdir) \
+ SymX(close) \
+ SymX(creat) \
+ SymX(dup) \
+ SymX(dup2) \
+ SymX(fstat) \
+ SymX(fcntl) \
+ SymX(getcwd) \
+ SymX(getenv) \
+ SymX(lseek) \
+ SymX(open) \
+ SymX(fpathconf) \
+ SymX(pathconf) \
+ SymX(stat) \
+ SymX(pow) \
+ SymX(tanh) \
+ SymX(cosh) \
+ SymX(sinh) \
+ SymX(atan) \
+ SymX(acos) \
+ SymX(asin) \
+ SymX(tan) \
+ SymX(cos) \
+ SymX(sin) \
+ SymX(exp) \
+ SymX(log) \
+ SymX(sqrt) \
+ SymX(localtime_r) \
+ SymX(gmtime_r) \
+ SymX(mktime) \
+ Sym(_imp___tzname) \
+ SymX(gettimeofday) \
+ SymX(timezone) \
+ SymX(tcgetattr) \
+ SymX(tcsetattr) \
+ SymX(memcpy) \
+ SymX(memmove) \
+ SymX(realloc) \
+ SymX(malloc) \
+ SymX(free) \
+ SymX(fork) \
+ SymX(lstat) \
+ SymX(isatty) \
+ SymX(mkdir) \
+ SymX(opendir) \
+ SymX(readdir) \
+ SymX(rewinddir) \
+ SymX(closedir) \
+ SymX(link) \
+ SymX(mkfifo) \
+ SymX(pipe) \
+ SymX(read) \
+ SymX(rename) \
+ SymX(rmdir) \
+ SymX(select) \
+ SymX(system) \
+ SymX(write) \
+ SymX(strcmp) \
+ SymX(strcpy) \
+ SymX(strncpy) \
+ SymX(strerror) \
+ SymX(sigaddset) \
+ SymX(sigemptyset) \
+ SymX(sigprocmask) \
+ SymX(umask) \
+ SymX(uname) \
+ SymX(unlink) \
+ SymX(utime) \
+ SymX(waitpid)
+
+#elif !defined(mingw32_HOST_OS)
+#define RTS_MINGW_ONLY_SYMBOLS /**/
+#define RTS_CYGWIN_ONLY_SYMBOLS /**/
+#else /* defined(mingw32_HOST_OS) */
+#define RTS_POSIX_ONLY_SYMBOLS /**/
+#define RTS_CYGWIN_ONLY_SYMBOLS /**/
+
+/* Extra syms gen'ed by mingw-2's gcc-3.2: */
+#if __GNUC__>=3
+#define RTS_MINGW_EXTRA_SYMS \
+ Sym(_imp____mb_cur_max) \
+ Sym(_imp___pctype)
+#else
+#define RTS_MINGW_EXTRA_SYMS
+#endif
+
+/* These are statically linked from the mingw libraries into the ghc
+ executable, so we have to employ this hack. */
+#define RTS_MINGW_ONLY_SYMBOLS \
+ SymX(asyncReadzh_fast) \
+ SymX(asyncWritezh_fast) \
+ SymX(asyncDoProczh_fast) \
+ SymX(memset) \
+ SymX(inet_ntoa) \
+ SymX(inet_addr) \
+ SymX(htonl) \
+ SymX(recvfrom) \
+ SymX(listen) \
+ SymX(bind) \
+ SymX(shutdown) \
+ SymX(connect) \
+ SymX(htons) \
+ SymX(ntohs) \
+ SymX(getservbyname) \
+ SymX(getservbyport) \
+ SymX(getprotobynumber) \
+ SymX(getprotobyname) \
+ SymX(gethostbyname) \
+ SymX(gethostbyaddr) \
+ SymX(gethostname) \
+ SymX(strcpy) \
+ SymX(strncpy) \
+ SymX(abort) \
+ Sym(_alloca) \
+ Sym(isxdigit) \
+ Sym(isupper) \
+ Sym(ispunct) \
+ Sym(islower) \
+ Sym(isspace) \
+ Sym(isprint) \
+ Sym(isdigit) \
+ Sym(iscntrl) \
+ Sym(isalpha) \
+ Sym(isalnum) \
+ SymX(strcmp) \
+ SymX(memmove) \
+ SymX(realloc) \
+ SymX(malloc) \
+ SymX(pow) \
+ SymX(tanh) \
+ SymX(cosh) \
+ SymX(sinh) \
+ SymX(atan) \
+ SymX(acos) \
+ SymX(asin) \
+ SymX(tan) \
+ SymX(cos) \
+ SymX(sin) \
+ SymX(exp) \
+ SymX(log) \
+ SymX(sqrt) \
+ SymX(powf) \
+ SymX(tanhf) \
+ SymX(coshf) \
+ SymX(sinhf) \
+ SymX(atanf) \
+ SymX(acosf) \
+ SymX(asinf) \
+ SymX(tanf) \
+ SymX(cosf) \
+ SymX(sinf) \
+ SymX(expf) \
+ SymX(logf) \
+ SymX(sqrtf) \
+ SymX(memcpy) \
+ SymX(rts_InstallConsoleEvent) \
+ SymX(rts_ConsoleHandlerDone) \
+ Sym(mktime) \
+ Sym(_imp___timezone) \
+ Sym(_imp___tzname) \
+ Sym(_imp___iob) \
+ Sym(_imp___osver) \
+ Sym(localtime) \
+ Sym(gmtime) \
+ Sym(opendir) \
+ Sym(readdir) \
+ Sym(rewinddir) \
+ RTS_MINGW_EXTRA_SYMS \
+ Sym(closedir)
+#endif
+
+#if defined(darwin_TARGET_OS) && HAVE_PRINTF_LDBLSTUB
+#define RTS_DARWIN_ONLY_SYMBOLS \
+ Sym(asprintf$LDBLStub) \
+ Sym(err$LDBLStub) \
+ Sym(errc$LDBLStub) \
+ Sym(errx$LDBLStub) \
+ Sym(fprintf$LDBLStub) \
+ Sym(fscanf$LDBLStub) \
+ Sym(fwprintf$LDBLStub) \
+ Sym(fwscanf$LDBLStub) \
+ Sym(printf$LDBLStub) \
+ Sym(scanf$LDBLStub) \
+ Sym(snprintf$LDBLStub) \
+ Sym(sprintf$LDBLStub) \
+ Sym(sscanf$LDBLStub) \
+ Sym(strtold$LDBLStub) \
+ Sym(swprintf$LDBLStub) \
+ Sym(swscanf$LDBLStub) \
+ Sym(syslog$LDBLStub) \
+ Sym(vasprintf$LDBLStub) \
+ Sym(verr$LDBLStub) \
+ Sym(verrc$LDBLStub) \
+ Sym(verrx$LDBLStub) \
+ Sym(vfprintf$LDBLStub) \
+ Sym(vfscanf$LDBLStub) \
+ Sym(vfwprintf$LDBLStub) \
+ Sym(vfwscanf$LDBLStub) \
+ Sym(vprintf$LDBLStub) \
+ Sym(vscanf$LDBLStub) \
+ Sym(vsnprintf$LDBLStub) \
+ Sym(vsprintf$LDBLStub) \
+ Sym(vsscanf$LDBLStub) \
+ Sym(vswprintf$LDBLStub) \
+ Sym(vswscanf$LDBLStub) \
+ Sym(vsyslog$LDBLStub) \
+ Sym(vwarn$LDBLStub) \
+ Sym(vwarnc$LDBLStub) \
+ Sym(vwarnx$LDBLStub) \
+ Sym(vwprintf$LDBLStub) \
+ Sym(vwscanf$LDBLStub) \
+ Sym(warn$LDBLStub) \
+ Sym(warnc$LDBLStub) \
+ Sym(warnx$LDBLStub) \
+ Sym(wcstold$LDBLStub) \
+ Sym(wprintf$LDBLStub) \
+ Sym(wscanf$LDBLStub)
+#else
+#define RTS_DARWIN_ONLY_SYMBOLS
+#endif
+
+#ifndef SMP
+# define MAIN_CAP_SYM SymX(MainCapability)
+#else
+# define MAIN_CAP_SYM
+#endif
+
+#if !defined(mingw32_HOST_OS)
+#define RTS_USER_SIGNALS_SYMBOLS \
+ SymX(setIOManagerPipe)
+#else
+#define RTS_USER_SIGNALS_SYMBOLS /* nothing */
+#endif
+
+#ifdef TABLES_NEXT_TO_CODE
+#define RTS_RET_SYMBOLS /* nothing */
+#else
+#define RTS_RET_SYMBOLS \
+ SymX(stg_enter_ret) \
+ SymX(stg_gc_fun_ret) \
+ SymX(stg_ap_v_ret) \
+ SymX(stg_ap_f_ret) \
+ SymX(stg_ap_d_ret) \
+ SymX(stg_ap_l_ret) \
+ SymX(stg_ap_n_ret) \
+ SymX(stg_ap_p_ret) \
+ SymX(stg_ap_pv_ret) \
+ SymX(stg_ap_pp_ret) \
+ SymX(stg_ap_ppv_ret) \
+ SymX(stg_ap_ppp_ret) \
+ SymX(stg_ap_pppv_ret) \
+ SymX(stg_ap_pppp_ret) \
+ SymX(stg_ap_ppppp_ret) \
+ SymX(stg_ap_pppppp_ret)
+#endif
+
+#define RTS_SYMBOLS \
+ Maybe_Stable_Names \
+ Sym(StgReturn) \
+ SymX(stg_enter_info) \
+ SymX(stg_gc_void_info) \
+ SymX(__stg_gc_enter_1) \
+ SymX(stg_gc_noregs) \
+ SymX(stg_gc_unpt_r1_info) \
+ SymX(stg_gc_unpt_r1) \
+ SymX(stg_gc_unbx_r1_info) \
+ SymX(stg_gc_unbx_r1) \
+ SymX(stg_gc_f1_info) \
+ SymX(stg_gc_f1) \
+ SymX(stg_gc_d1_info) \
+ SymX(stg_gc_d1) \
+ SymX(stg_gc_l1_info) \
+ SymX(stg_gc_l1) \
+ SymX(__stg_gc_fun) \
+ SymX(stg_gc_fun_info) \
+ SymX(stg_gc_gen) \
+ SymX(stg_gc_gen_info) \
+ SymX(stg_gc_gen_hp) \
+ SymX(stg_gc_ut) \
+ SymX(stg_gen_yield) \
+ SymX(stg_yield_noregs) \
+ SymX(stg_yield_to_interpreter) \
+ SymX(stg_gen_block) \
+ SymX(stg_block_noregs) \
+ SymX(stg_block_1) \
+ SymX(stg_block_takemvar) \
+ SymX(stg_block_putmvar) \
+ SymX(stg_seq_frame_info) \
+ MAIN_CAP_SYM \
+ SymX(MallocFailHook) \
+ SymX(OnExitHook) \
+ SymX(OutOfHeapHook) \
+ SymX(StackOverflowHook) \
+ SymX(__encodeDouble) \
+ SymX(__encodeFloat) \
+ SymX(addDLL) \
+ SymX(__gmpn_gcd_1) \
+ SymX(__gmpz_cmp) \
+ SymX(__gmpz_cmp_si) \
+ SymX(__gmpz_cmp_ui) \
+ SymX(__gmpz_get_si) \
+ SymX(__gmpz_get_ui) \
+ SymX(__int_encodeDouble) \
+ SymX(__int_encodeFloat) \
+ SymX(andIntegerzh_fast) \
+ SymX(atomicallyzh_fast) \
+ SymX(barf) \
+ SymX(debugBelch) \
+ SymX(errorBelch) \
+ SymX(blockAsyncExceptionszh_fast) \
+ SymX(catchzh_fast) \
+ SymX(catchRetryzh_fast) \
+ SymX(catchSTMzh_fast) \
+ SymX(closure_flags) \
+ SymX(cmp_thread) \
+ SymX(cmpIntegerzh_fast) \
+ SymX(cmpIntegerIntzh_fast) \
+ SymX(complementIntegerzh_fast) \
+ SymX(createAdjustor) \
+ SymX(decodeDoublezh_fast) \
+ SymX(decodeFloatzh_fast) \
+ SymX(defaultsHook) \
+ SymX(delayzh_fast) \
+ SymX(deRefWeakzh_fast) \
+ SymX(deRefStablePtrzh_fast) \
+ SymX(dirty_MUT_VAR) \
+ SymX(divExactIntegerzh_fast) \
+ SymX(divModIntegerzh_fast) \
+ SymX(forkzh_fast) \
+ SymX(forkOnzh_fast) \
+ SymX(forkProcess) \
+ SymX(forkOS_createThread) \
+ SymX(freeHaskellFunctionPtr) \
+ SymX(freeStablePtr) \
+ SymX(gcdIntegerzh_fast) \
+ SymX(gcdIntegerIntzh_fast) \
+ SymX(gcdIntzh_fast) \
+ SymX(genSymZh) \
+ SymX(genericRaise) \
+ SymX(getProgArgv) \
+ SymX(getStablePtr) \
+ SymX(hs_init) \
+ SymX(hs_exit) \
+ SymX(hs_set_argv) \
+ SymX(hs_add_root) \
+ SymX(hs_perform_gc) \
+ SymX(hs_free_stable_ptr) \
+ SymX(hs_free_fun_ptr) \
+ SymX(initLinker) \
+ SymX(int2Integerzh_fast) \
+ SymX(integer2Intzh_fast) \
+ SymX(integer2Wordzh_fast) \
+ SymX(isCurrentThreadBoundzh_fast) \
+ SymX(isDoubleDenormalized) \
+ SymX(isDoubleInfinite) \
+ SymX(isDoubleNaN) \
+ SymX(isDoubleNegativeZero) \
+ SymX(isEmptyMVarzh_fast) \
+ SymX(isFloatDenormalized) \
+ SymX(isFloatInfinite) \
+ SymX(isFloatNaN) \
+ SymX(isFloatNegativeZero) \
+ SymX(killThreadzh_fast) \
+ SymX(loadObj) \
+ SymX(lookupSymbol) \
+ SymX(makeStablePtrzh_fast) \
+ SymX(minusIntegerzh_fast) \
+ SymX(mkApUpd0zh_fast) \
+ SymX(myThreadIdzh_fast) \
+ SymX(labelThreadzh_fast) \
+ SymX(newArrayzh_fast) \
+ SymX(newBCOzh_fast) \
+ SymX(newByteArrayzh_fast) \
+ SymX_redirect(newCAF, newDynCAF) \
+ SymX(newMVarzh_fast) \
+ SymX(newMutVarzh_fast) \
+ SymX(newTVarzh_fast) \
+ SymX(atomicModifyMutVarzh_fast) \
+ SymX(newPinnedByteArrayzh_fast) \
+ SymX(newSpark) \
+ SymX(orIntegerzh_fast) \
+ SymX(performGC) \
+ SymX(performMajorGC) \
+ SymX(plusIntegerzh_fast) \
+ SymX(prog_argc) \
+ SymX(prog_argv) \
+ SymX(putMVarzh_fast) \
+ SymX(quotIntegerzh_fast) \
+ SymX(quotRemIntegerzh_fast) \
+ SymX(raisezh_fast) \
+ SymX(raiseIOzh_fast) \
+ SymX(readTVarzh_fast) \
+ SymX(remIntegerzh_fast) \
+ SymX(resetNonBlockingFd) \
+ SymX(resumeThread) \
+ SymX(resolveObjs) \
+ SymX(retryzh_fast) \
+ SymX(rts_apply) \
+ SymX(rts_checkSchedStatus) \
+ SymX(rts_eval) \
+ SymX(rts_evalIO) \
+ SymX(rts_evalLazyIO) \
+ SymX(rts_evalStableIO) \
+ SymX(rts_eval_) \
+ SymX(rts_getBool) \
+ SymX(rts_getChar) \
+ SymX(rts_getDouble) \
+ SymX(rts_getFloat) \
+ SymX(rts_getInt) \
+ SymX(rts_getInt32) \
+ SymX(rts_getPtr) \
+ SymX(rts_getFunPtr) \
+ SymX(rts_getStablePtr) \
+ SymX(rts_getThreadId) \
+ SymX(rts_getWord) \
+ SymX(rts_getWord32) \
+ SymX(rts_lock) \
+ SymX(rts_mkBool) \
+ SymX(rts_mkChar) \
+ SymX(rts_mkDouble) \
+ SymX(rts_mkFloat) \
+ SymX(rts_mkInt) \
+ SymX(rts_mkInt16) \
+ SymX(rts_mkInt32) \
+ SymX(rts_mkInt64) \
+ SymX(rts_mkInt8) \
+ SymX(rts_mkPtr) \
+ SymX(rts_mkFunPtr) \
+ SymX(rts_mkStablePtr) \
+ SymX(rts_mkString) \
+ SymX(rts_mkWord) \
+ SymX(rts_mkWord16) \
+ SymX(rts_mkWord32) \
+ SymX(rts_mkWord64) \
+ SymX(rts_mkWord8) \
+ SymX(rts_unlock) \
+ SymX(rtsSupportsBoundThreads) \
+ SymX(__hscore_get_saved_termios) \
+ SymX(__hscore_set_saved_termios) \
+ SymX(setProgArgv) \
+ SymX(startupHaskell) \
+ SymX(shutdownHaskell) \
+ SymX(shutdownHaskellAndExit) \
+ SymX(stable_ptr_table) \
+ SymX(stackOverflow) \
+ SymX(stg_CAF_BLACKHOLE_info) \
+ SymX(awakenBlockedQueue) \
+ SymX(stg_CHARLIKE_closure) \
+ SymX(stg_EMPTY_MVAR_info) \
+ SymX(stg_IND_STATIC_info) \
+ SymX(stg_INTLIKE_closure) \
+ SymX(stg_MUT_ARR_PTRS_DIRTY_info) \
+ SymX(stg_MUT_ARR_PTRS_FROZEN_info) \
+ SymX(stg_MUT_ARR_PTRS_FROZEN0_info) \
+ SymX(stg_WEAK_info) \
+ SymX(stg_ap_v_info) \
+ SymX(stg_ap_f_info) \
+ SymX(stg_ap_d_info) \
+ SymX(stg_ap_l_info) \
+ SymX(stg_ap_n_info) \
+ SymX(stg_ap_p_info) \
+ SymX(stg_ap_pv_info) \
+ SymX(stg_ap_pp_info) \
+ SymX(stg_ap_ppv_info) \
+ SymX(stg_ap_ppp_info) \
+ SymX(stg_ap_pppv_info) \
+ SymX(stg_ap_pppp_info) \
+ SymX(stg_ap_ppppp_info) \
+ SymX(stg_ap_pppppp_info) \
+ SymX(stg_ap_0_fast) \
+ SymX(stg_ap_v_fast) \
+ SymX(stg_ap_f_fast) \
+ SymX(stg_ap_d_fast) \
+ SymX(stg_ap_l_fast) \
+ SymX(stg_ap_n_fast) \
+ SymX(stg_ap_p_fast) \
+ SymX(stg_ap_pv_fast) \
+ SymX(stg_ap_pp_fast) \
+ SymX(stg_ap_ppv_fast) \
+ SymX(stg_ap_ppp_fast) \
+ SymX(stg_ap_pppv_fast) \
+ SymX(stg_ap_pppp_fast) \
+ SymX(stg_ap_ppppp_fast) \
+ SymX(stg_ap_pppppp_fast) \
+ SymX(stg_ap_1_upd_info) \
+ SymX(stg_ap_2_upd_info) \
+ SymX(stg_ap_3_upd_info) \
+ SymX(stg_ap_4_upd_info) \
+ SymX(stg_ap_5_upd_info) \
+ SymX(stg_ap_6_upd_info) \
+ SymX(stg_ap_7_upd_info) \
+ SymX(stg_exit) \
+ SymX(stg_sel_0_upd_info) \
+ SymX(stg_sel_10_upd_info) \
+ SymX(stg_sel_11_upd_info) \
+ SymX(stg_sel_12_upd_info) \
+ SymX(stg_sel_13_upd_info) \
+ SymX(stg_sel_14_upd_info) \
+ SymX(stg_sel_15_upd_info) \
+ SymX(stg_sel_1_upd_info) \
+ SymX(stg_sel_2_upd_info) \
+ SymX(stg_sel_3_upd_info) \
+ SymX(stg_sel_4_upd_info) \
+ SymX(stg_sel_5_upd_info) \
+ SymX(stg_sel_6_upd_info) \
+ SymX(stg_sel_7_upd_info) \
+ SymX(stg_sel_8_upd_info) \
+ SymX(stg_sel_9_upd_info) \
+ SymX(stg_upd_frame_info) \
+ SymX(suspendThread) \
+ SymX(takeMVarzh_fast) \
+ SymX(timesIntegerzh_fast) \
+ SymX(tryPutMVarzh_fast) \
+ SymX(tryTakeMVarzh_fast) \
+ SymX(unblockAsyncExceptionszh_fast) \
+ SymX(unloadObj) \
+ SymX(unsafeThawArrayzh_fast) \
+ SymX(waitReadzh_fast) \
+ SymX(waitWritezh_fast) \
+ SymX(word2Integerzh_fast) \
+ SymX(writeTVarzh_fast) \
+ SymX(xorIntegerzh_fast) \
+ SymX(yieldzh_fast) \
+ SymX(stg_interp_constr_entry) \
+ SymX(stg_interp_constr1_entry) \
+ SymX(stg_interp_constr2_entry) \
+ SymX(stg_interp_constr3_entry) \
+ SymX(stg_interp_constr4_entry) \
+ SymX(stg_interp_constr5_entry) \
+ SymX(stg_interp_constr6_entry) \
+ SymX(stg_interp_constr7_entry) \
+ SymX(stg_interp_constr8_entry) \
+ SymX(stgMallocBytesRWX) \
+ SymX(getAllocations) \
+ SymX(revertCAFs) \
+ SymX(RtsFlags) \
+ RTS_USER_SIGNALS_SYMBOLS
+
/* Int64#/Word64# conversion primops are only present on platforms
   whose native word is narrower than 64 bits. */
#ifdef SUPPORT_LONG_LONGS
#define RTS_LONG_LONG_SYMS \
      SymX(int64ToIntegerzh_fast) \
      SymX(word64ToIntegerzh_fast)
#else
#define RTS_LONG_LONG_SYMS /* nothing */
#endif

// 64-bit support functions in libgcc.a
// (on 32-bit targets gcc compiles 64-bit arithmetic into calls to
// these helpers, so loaded object code may reference them)
#if defined(__GNUC__) && SIZEOF_VOID_P <= 4
#define RTS_LIBGCC_SYMBOLS \
      Sym(__divdi3) \
      Sym(__udivdi3) \
      Sym(__moddi3) \
      Sym(__umoddi3) \
      Sym(__muldi3) \
      Sym(__ashldi3) \
      Sym(__ashrdi3) \
      Sym(__lshrdi3) \
      Sym(__eprintf)
#elif defined(ia64_HOST_ARCH)
/* ia64 is 64-bit, but gcc still emits calls for integer division and
   some float divisions. */
#define RTS_LIBGCC_SYMBOLS \
      Sym(__divdi3) \
      Sym(__udivdi3) \
      Sym(__moddi3) \
      Sym(__umoddi3) \
      Sym(__divsf3) \
      Sym(__divdf3)
#else
#define RTS_LIBGCC_SYMBOLS
#endif

#if defined(darwin_HOST_OS) && defined(powerpc_HOST_ARCH)
      // Symbols that don't have a leading underscore
      // on Mac OS X. They have to receive special treatment,
      // see machoInitSymbolsWithoutUnderscore()
#define RTS_MACHO_NOUNDERLINE_SYMBOLS \
      Sym(saveFP) \
      Sym(restFP)
#endif
+
/* entirely bogus claims about types of these symbols */
/* First expansion of the symbol lists: declare every Sym() so its
   address can be taken in the table below.  The 'void(void)'
   prototype is a lie, but only the address is ever used.  SymX() and
   SymX_redirect() name symbols already declared elsewhere, so they
   expand to nothing here. */
#define Sym(vvv)  extern void vvv(void);
#define SymX(vvv) /**/
#define SymX_redirect(vvv,xxx) /**/
RTS_SYMBOLS
RTS_RET_SYMBOLS
RTS_LONG_LONG_SYMS
RTS_POSIX_ONLY_SYMBOLS
RTS_MINGW_ONLY_SYMBOLS
RTS_CYGWIN_ONLY_SYMBOLS
RTS_DARWIN_ONLY_SYMBOLS
RTS_LIBGCC_SYMBOLS
#undef Sym
#undef SymX
#undef SymX_redirect

/* Object-file symbol names carry a leading '_' on some platforms. */
#ifdef LEADING_UNDERSCORE
#define MAYBE_LEADING_UNDERSCORE_STR(s) ("_" s)
#else
#define MAYBE_LEADING_UNDERSCORE_STR(s) (s)
#endif

/* Second expansion: build { name, address } initialisers for the
   rtsSyms table. */
#define Sym(vvv) { MAYBE_LEADING_UNDERSCORE_STR(#vvv), \
                    (void*)(&(vvv)) },
#define SymX(vvv) Sym(vvv)

// SymX_redirect allows us to redirect references to one symbol to
// another symbol. See newCAF/newDynCAF for an example.
#define SymX_redirect(vvv,xxx) \
    { MAYBE_LEADING_UNDERSCORE_STR(#vvv), \
      (void*)(&(xxx)) },

/* The RTS symbols baked into the linker, terminated by a NULL
   sentinel entry.
   NOTE(review): RTS_DARWIN_ONLY_SYMBOLS is declared in the first
   expansion above but not included in this table -- confirm whether
   that is intentional. */
static RtsSymbolVal rtsSyms[] = {
      RTS_SYMBOLS
      RTS_RET_SYMBOLS
      RTS_LONG_LONG_SYMS
      RTS_POSIX_ONLY_SYMBOLS
      RTS_MINGW_ONLY_SYMBOLS
      RTS_CYGWIN_ONLY_SYMBOLS
      RTS_LIBGCC_SYMBOLS
#if defined(darwin_HOST_OS) && defined(i386_HOST_ARCH)
      // dyld stub code contains references to this,
      // but it should never be called because we treat
      // lazy pointers as nonlazy.
      { "dyld_stub_binding_helper", (void*)0xDEADBEEF },
#endif
      { 0, 0 } /* sentinel */
};
+
+/* -----------------------------------------------------------------------------
+ * Insert symbols into hash tables, checking for duplicates.
+ */
+static void ghciInsertStrHashTable ( char* obj_name,
+ HashTable *table,
+ char* key,
+ void *data
+ )
+{
+ if (lookupHashTable(table, (StgWord)key) == NULL)
+ {
+ insertStrHashTable(table, (StgWord)key, data);
+ return;
+ }
+ debugBelch(
+ "\n\n"
+ "GHCi runtime linker: fatal error: I found a duplicate definition for symbol\n"
+ " %s\n"
+ "whilst processing object file\n"
+ " %s\n"
+ "This could be caused by:\n"
+ " * Loading two different object files which export the same symbol\n"
+ " * Specifying the same object file twice on the GHCi command line\n"
+ " * An incorrect `package.conf' entry, causing some object to be\n"
+ " loaded twice.\n"
+ "GHCi cannot safely continue in this situation. Exiting now. Sorry.\n"
+ "\n",
+ (char*)key,
+ obj_name
+ );
+ exit(1);
+}
+
+
+/* -----------------------------------------------------------------------------
+ * initialize the object linker
+ */
+
+
/* Set once initLinker() has run; makes it idempotent. */
static int linker_init_done = 0 ;

#if defined(OBJFORMAT_ELF) || defined(OBJFORMAT_MACHO)
/* dl-handle used to search the program's own dynamic symbols. */
static void *dl_prog_handle;
#endif

/* dlopen(NULL,..) doesn't work so we grab libc explicitly */
#if defined(openbsd_HOST_OS)
static void *dl_libc_handle;
#endif

/* Initialise the runtime linker: allocate the global symbol table,
   seed it with the RTS's own exported symbols (rtsSyms), and obtain
   dl-handles for dynamic lookup where applicable.  Safe to call
   repeatedly; only the first call does any work. */
void
initLinker( void )
{
    RtsSymbolVal *sym;

    /* Make initLinker idempotent, so we can call it
       before every relevant operation; that means we
       don't need to initialise the linker separately */
    if (linker_init_done == 1) { return; } else {
        linker_init_done = 1;
    }

    symhash = allocStrHashTable();

    /* populate the symbol table with stuff from the RTS */
    for (sym = rtsSyms; sym->lbl != NULL; sym++) {
        ghciInsertStrHashTable("(GHCi built-in symbols)",
                               symhash, sym->lbl, sym->addr);
    }
#   if defined(OBJFORMAT_MACHO) && defined(powerpc_HOST_ARCH)
    machoInitSymbolsWithoutUnderscore();
#   endif

#   if defined(OBJFORMAT_ELF) || defined(OBJFORMAT_MACHO)
#   if defined(RTLD_DEFAULT)
    /* RTLD_DEFAULT searches every object in the process -- preferred
       when available. */
    dl_prog_handle = RTLD_DEFAULT;
#   else
    dl_prog_handle = dlopen(NULL, RTLD_LAZY);
#   if defined(openbsd_HOST_OS)
    /* see dl_libc_handle above: the program handle misses libc */
    dl_libc_handle = dlopen("libc.so", RTLD_LAZY);
#   endif
#   endif /* RTLD_DEFAULT */
#   endif
}
+
+/* -----------------------------------------------------------------------------
+ * Loading DLL or .so dynamic libraries
+ * -----------------------------------------------------------------------------
+ *
+ * Add a DLL from which symbols may be found. In the ELF case, just
+ * do RTLD_GLOBAL-style add, so no further messing around needs to
+ * happen in order that symbols in the loaded .so are findable --
+ * lookupSymbol() will subsequently see them by dlsym on the program's
+ * dl-handle. Returns NULL if success, otherwise ptr to an err msg.
+ *
+ * In the PEi386 case, open the DLLs and put handles to them in a
+ * linked list. When looking for a symbol, try all handles in the
+ * list. This means that we need to load even DLLs that are guaranteed
+ * to be in the ghc.exe image already, just so we can get a handle
+ * to give to loadSymbol, so that we can find the symbols. For such
+ * libraries, the LoadLibrary call should be a no-op except for returning
+ * the handle.
+ *
+ */
+
#if defined(OBJFORMAT_PEi386)
/* A record for storing handles into DLLs. */

typedef
   struct _OpenedDLL {
      char*              name;      /* name as passed to addDLL (no extension) */
      struct _OpenedDLL* next;
      HINSTANCE          instance;  /* handle returned by LoadLibrary */
   }
   OpenedDLL;

/* A list thereof. */
static OpenedDLL* opened_dlls = NULL;
#endif

/* Add a dynamic library from which lookupSymbol may subsequently find
   symbols; per-platform behaviour is described in the comment block
   above.  Returns NULL on success, otherwise a pointer to an error
   message (static/ dl-owned storage -- caller must not free it). */
char *
addDLL( char *dll_name )
{
#  if defined(OBJFORMAT_ELF) || defined(OBJFORMAT_MACHO)
   /* ------------------- ELF DLL loader ------------------- */
   void *hdl;
   char *errmsg;

   initLinker();

   /* RTLD_GLOBAL makes the library's symbols visible to later dlsym()
      calls on the program handle, so no bookkeeping is needed here. */
   hdl= dlopen(dll_name, RTLD_NOW | RTLD_GLOBAL);

   if (hdl == NULL) {
      /* dlopen failed; return a ptr to the error msg. */
      errmsg = dlerror();
      if (errmsg == NULL) errmsg = "addDLL: unknown error";
      return errmsg;
   } else {
      return NULL;
   }
   /*NOTREACHED*/

#  elif defined(OBJFORMAT_PEi386)
   /* ------------------- Win32 DLL loader ------------------- */

   char*      buf;
   OpenedDLL* o_dll;
   HINSTANCE  instance;

   initLinker();

   /* debugBelch("\naddDLL; dll_name = `%s'\n", dll_name); */

   /* See if we've already got it, and ignore if so. */
   for (o_dll = opened_dlls; o_dll != NULL; o_dll = o_dll->next) {
      if (0 == strcmp(o_dll->name, dll_name))
         return NULL;
   }

   /* The file name has no suffix (yet) so that we can try
      both foo.dll and foo.drv

      The documentation for LoadLibrary says:
        If no file name extension is specified in the lpFileName
        parameter, the default library extension .dll is
        appended. However, the file name string can include a trailing
        point character (.) to indicate that the module name has no
        extension. */

   /* +10 comfortably covers the 4-byte ".DLL"/".DRV" suffix plus the
      terminating NUL. */
   buf = stgMallocBytes(strlen(dll_name) + 10, "addDLL");
   sprintf(buf, "%s.DLL", dll_name);
   instance = LoadLibrary(buf);
   if (instance == NULL) {
      sprintf(buf, "%s.DRV", dll_name); // KAA: allow loading of drivers (like winspool.drv)
      instance = LoadLibrary(buf);
      if (instance == NULL) {
         stgFree(buf);

         /* LoadLibrary failed; return a ptr to the error msg. */
         return "addDLL: unknown error";
      }
   }
   stgFree(buf);

   /* Add this DLL to the list of DLLs in which to search for symbols. */
   o_dll = stgMallocBytes( sizeof(OpenedDLL), "addDLL" );
   o_dll->name     = stgMallocBytes(1+strlen(dll_name), "addDLL");
   strcpy(o_dll->name, dll_name);
   o_dll->instance = instance;
   o_dll->next     = opened_dlls;
   opened_dlls     = o_dll;

   return NULL;
#  else
   barf("addDLL: not implemented on this platform");
#  endif
}
+
+/* -----------------------------------------------------------------------------
+ * lookup a symbol in the hash table
+ */
/* Look 'lbl' up in the global symbol table; if absent, fall back to
   the platform's dynamic lookup (dlsym / dyld / GetProcAddress over
   every opened DLL).  Returns the symbol's address, or NULL if it
   cannot be found anywhere. */
void *
lookupSymbol( char *lbl )
{
    void *val;
    initLinker() ;
    ASSERT(symhash != NULL);
    val = lookupStrHashTable(symhash, lbl);

    if (val == NULL) {
#       if defined(OBJFORMAT_ELF)
#       if defined(openbsd_HOST_OS)
        val = dlsym(dl_prog_handle, lbl);
        /* the program handle doesn't cover libc on OpenBSD; try the
           explicitly-opened libc handle as a second chance */
        return (val != NULL) ? val : dlsym(dl_libc_handle,lbl);
#       elif defined(x86_64_HOST_ARCH)
        val = dlsym(dl_prog_handle, lbl);
        if (val >= (void *)0x80000000) {
            /* GHC assumes the small memory model on x86_64, so a
               dynamic symbol above 2Gb must be redirected through a
               low-memory jump table. */
            void *new_val;
            new_val = x86_64_high_symbol(lbl, val);
            IF_DEBUG(linker,debugBelch("lookupSymbol: relocating out of range symbol: %s = %p, now %p\n", lbl, val, new_val));
            return new_val;
        } else {
            return val;
        }
#       else /* not openbsd */
        return dlsym(dl_prog_handle, lbl);
#       endif
#       elif defined(OBJFORMAT_MACHO)
        if(NSIsSymbolNameDefined(lbl)) {
            NSSymbol symbol = NSLookupAndBindSymbol(lbl);
            return NSAddressOfSymbol(symbol);
        } else {
            return NULL;
        }
#       elif defined(OBJFORMAT_PEi386)
        OpenedDLL* o_dll;
        void* sym;
        for (o_dll = opened_dlls; o_dll != NULL; o_dll = o_dll->next) {
          /* debugBelch("look in %s for %s\n", o_dll->name, lbl); */
          if (lbl[0] == '_') {
              /* HACK: if the name has an initial underscore, try stripping
                 it off & look that up first. I've yet to verify whether there's
                 a Rule that governs whether an initial '_' *should always* be
                 stripped off when mapping from import lib name to the DLL name.
              */
              sym = GetProcAddress(o_dll->instance, (lbl+1));
              if (sym != NULL) {
                /*debugBelch("found %s in %s\n", lbl+1,o_dll->name);*/
                return sym;
              }
          }
          sym = GetProcAddress(o_dll->instance, lbl);
          if (sym != NULL) {
            /*debugBelch("found %s in %s\n", lbl,o_dll->name);*/
            return sym;
          }
        }
        return NULL;
#       else
        ASSERT(2+2 == 5);   /* deliberately-failing assert: unsupported format */
        return NULL;
#       endif
    } else {
        return val;
    }
}
+
+static
+__attribute((unused))
+void *
+lookupLocalSymbol( ObjectCode* oc, char *lbl )
+{
+ void *val;
+ initLinker() ;
+ val = lookupStrHashTable(oc->lochash, lbl);
+
+ if (val == NULL) {
+ return NULL;
+ } else {
+ return val;
+ }
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Debugging aid: look in GHCi's object symbol tables for symbols
+ * within DELTA bytes of the specified address, and show their names.
+ */
#ifdef DEBUG
/* Debugging aid: print the name of every loaded-object symbol whose
   address lies within DELTA (64) bytes of 'addr'. */
void ghci_enquire ( char* addr );

void ghci_enquire ( char* addr )
{
   int i;
   char* sym;
   char* a;
   const int DELTA = 64;
   ObjectCode* oc;

   initLinker();

   for (oc = objects; oc; oc = oc->next) {
      for (i = 0; i < oc->n_symbols; i++) {
         sym = oc->symbols[i];
         if (sym == NULL) continue;
         // debugBelch("enquire %p %p\n", sym, oc->lochash);
         a = NULL;
         /* try the object's local symbols first, then the globals */
         if (oc->lochash != NULL) {
            a = lookupStrHashTable(oc->lochash, sym);
         }
         if (a == NULL) {
            a = lookupStrHashTable(symhash, sym);
         }
         if (a == NULL) {
             // debugBelch("ghci_enquire: can't find %s\n", sym);
         }
         else if (addr-DELTA <= a && a <= addr+DELTA) {
            debugBelch("%p + %3d == `%s'\n", addr, (int)(a - addr), sym);
         }
      }
   }
}
#endif
+
+#ifdef ia64_HOST_ARCH
+static unsigned int PLTSize(void);
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Load an obj (populate the global symbol table, but don't resolve yet)
+ *
+ * Returns: 1 if ok, 0 on error.
+ */
+HsInt
+loadObj( char *path )
+{
+ ObjectCode* oc;
+ struct stat st;
+ int r, n;
+#ifdef USE_MMAP
+ int fd, pagesize;
+ void *map_addr = NULL;
+#else
+ FILE *f;
+ int misalignment;
+#endif
+ initLinker();
+
+ /* debugBelch("loadObj %s\n", path ); */
+
+ /* Check that we haven't already loaded this object.
+ Ignore requests to load multiple times */
+ {
+ ObjectCode *o;
+ int is_dup = 0;
+ for (o = objects; o; o = o->next) {
+ if (0 == strcmp(o->fileName, path)) {
+ is_dup = 1;
+ break; /* don't need to search further */
+ }
+ }
+ if (is_dup) {
+ IF_DEBUG(linker, debugBelch(
+ "GHCi runtime linker: warning: looks like you're trying to load the\n"
+ "same object file twice:\n"
+ " %s\n"
+ "GHCi will ignore this, but be warned.\n"
+ , path));
+ return 1; /* success */
+ }
+ }
+
+ oc = stgMallocBytes(sizeof(ObjectCode), "loadObj(oc)");
+
+# if defined(OBJFORMAT_ELF)
+ oc->formatName = "ELF";
+# elif defined(OBJFORMAT_PEi386)
+ oc->formatName = "PEi386";
+# elif defined(OBJFORMAT_MACHO)
+ oc->formatName = "Mach-O";
+# else
+ stgFree(oc);
+ barf("loadObj: not implemented on this platform");
+# endif
+
+ r = stat(path, &st);
+ if (r == -1) { return 0; }
+
+ /* sigh, strdup() isn't a POSIX function, so do it the long way */
+ oc->fileName = stgMallocBytes( strlen(path)+1, "loadObj" );
+ strcpy(oc->fileName, path);
+
+ oc->fileSize = st.st_size;
+ oc->symbols = NULL;
+ oc->sections = NULL;
+ oc->lochash = allocStrHashTable();
+ oc->proddables = NULL;
+
+ /* chain it onto the list of objects */
+ oc->next = objects;
+ objects = oc;
+
+#ifdef USE_MMAP
+#define ROUND_UP(x,size) ((x + size - 1) & ~(size - 1))
+
+ /* On many architectures malloc'd memory isn't executable, so we need to use mmap. */
+
+#if defined(openbsd_HOST_OS)
+ fd = open(path, O_RDONLY, S_IRUSR);
+#else
+ fd = open(path, O_RDONLY);
+#endif
+ if (fd == -1)
+ barf("loadObj: can't open `%s'", path);
+
+ pagesize = getpagesize();
+
+#ifdef ia64_HOST_ARCH
+ /* The PLT needs to be right before the object */
+ n = ROUND_UP(PLTSize(), pagesize);
+ oc->plt = mmap(NULL, n, PROT_EXEC|PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ if (oc->plt == MAP_FAILED)
+ barf("loadObj: can't allocate PLT");
+
+ oc->pltIndex = 0;
+ map_addr = oc->plt + n;
+#endif
+
+ n = ROUND_UP(oc->fileSize, pagesize);
+
+ /* Link objects into the lower 2Gb on x86_64. GHC assumes the
+ * small memory model on this architecture (see gcc docs,
+ * -mcmodel=small).
+ */
+#ifdef x86_64_HOST_ARCH
+#define EXTRA_MAP_FLAGS MAP_32BIT
+#else
+#define EXTRA_MAP_FLAGS 0
+#endif
+
+ oc->image = mmap(map_addr, n, PROT_EXEC|PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|EXTRA_MAP_FLAGS, fd, 0);
+ if (oc->image == MAP_FAILED)
+ barf("loadObj: can't map `%s'", path);
+
+ close(fd);
+
+#else /* !USE_MMAP */
+
+ /* load the image into memory */
+ f = fopen(path, "rb");
+ if (!f)
+ barf("loadObj: can't read `%s'", path);
+
+#ifdef darwin_HOST_OS
+ // In a Mach-O .o file, all sections can and will be misaligned
+ // if the total size of the headers is not a multiple of the
+ // desired alignment. This is fine for .o files that only serve
+ // as input for the static linker, but it's not fine for us,
+ // as SSE (used by gcc for floating point) and Altivec require
+ // 16-byte alignment.
+ // We calculate the correct alignment from the header before
+ // reading the file, and then we misalign oc->image on purpose so
+ // that the actual sections end up aligned again.
+ misalignment = machoGetMisalignment(f);
+ oc->misalignment = misalignment;
+#else
+ misalignment = 0;
+#endif
+
+ oc->image = stgMallocBytes(oc->fileSize + misalignment, "loadObj(image)");
+ oc->image += misalignment;
+
+ n = fread ( oc->image, 1, oc->fileSize, f );
+ if (n != oc->fileSize)
+ barf("loadObj: error whilst reading `%s'", path);
+
+ fclose(f);
+
+#endif /* USE_MMAP */
+
+# if defined(OBJFORMAT_MACHO) && defined(powerpc_HOST_ARCH)
+ r = ocAllocateJumpIslands_MachO ( oc );
+ if (!r) { return r; }
+# elif defined(OBJFORMAT_ELF) && defined(powerpc_HOST_ARCH)
+ r = ocAllocateJumpIslands_ELF ( oc );
+ if (!r) { return r; }
+#endif
+
+ /* verify the in-memory image */
+# if defined(OBJFORMAT_ELF)
+ r = ocVerifyImage_ELF ( oc );
+# elif defined(OBJFORMAT_PEi386)
+ r = ocVerifyImage_PEi386 ( oc );
+# elif defined(OBJFORMAT_MACHO)
+ r = ocVerifyImage_MachO ( oc );
+# else
+ barf("loadObj: no verify method");
+# endif
+ if (!r) { return r; }
+
+ /* build the symbol list for this image */
+# if defined(OBJFORMAT_ELF)
+ r = ocGetNames_ELF ( oc );
+# elif defined(OBJFORMAT_PEi386)
+ r = ocGetNames_PEi386 ( oc );
+# elif defined(OBJFORMAT_MACHO)
+ r = ocGetNames_MachO ( oc );
+# else
+ barf("loadObj: no getNames method");
+# endif
+ if (!r) { return r; }
+
+ /* loaded, but not resolved yet */
+ oc->status = OBJECT_LOADED;
+
+ return 1;
+}
+
+/* -----------------------------------------------------------------------------
+ * resolve all the currently unlinked objects in memory
+ *
+ * Returns: 1 if ok, 0 on error.
+ */
/* Relocate every loaded-but-unresolved object, marking each one
   OBJECT_RESOLVED on success.  Returns 1 if ok, 0 on the first
   failure (leaving any remaining objects unresolved). */
HsInt
resolveObjs( void )
{
    ObjectCode *oc;
    int r;

    initLinker();

    for (oc = objects; oc; oc = oc->next) {
        if (oc->status != OBJECT_RESOLVED) {
#           if defined(OBJFORMAT_ELF)
            r = ocResolve_ELF ( oc );
#           elif defined(OBJFORMAT_PEi386)
            r = ocResolve_PEi386 ( oc );
#           elif defined(OBJFORMAT_MACHO)
            r = ocResolve_MachO ( oc );
#           else
            barf("resolveObjs: not implemented on this platform");
#           endif
            if (!r) { return r; }
            oc->status = OBJECT_RESOLVED;
        }
    }
    return 1;
}
+
+/* -----------------------------------------------------------------------------
+ * delete an object from the pool
+ */
+HsInt
+unloadObj( char *path )
+{
+ ObjectCode *oc, *prev;
+
+ ASSERT(symhash != NULL);
+ ASSERT(objects != NULL);
+
+ initLinker();
+
+ prev = NULL;
+ for (oc = objects; oc; prev = oc, oc = oc->next) {
+ if (!strcmp(oc->fileName,path)) {
+
+ /* Remove all the mappings for the symbols within this
+ * object..
+ */
+ {
+ int i;
+ for (i = 0; i < oc->n_symbols; i++) {
+ if (oc->symbols[i] != NULL) {
+ removeStrHashTable(symhash, oc->symbols[i], NULL);
+ }
+ }
+ }
+
+ if (prev == NULL) {
+ objects = oc->next;
+ } else {
+ prev->next = oc->next;
+ }
+
+ /* We're going to leave this in place, in case there are
+ any pointers from the heap into it: */
+ /* stgFree(oc->image); */
+ stgFree(oc->fileName);
+ stgFree(oc->symbols);
+ stgFree(oc->sections);
+ /* The local hash table should have been freed at the end
+ of the ocResolve_ call on it. */
+ ASSERT(oc->lochash == NULL);
+ stgFree(oc);
+ return 1;
+ }
+ }
+
+ errorBelch("unloadObj: can't find `%s' to unload", path);
+ return 0;
+}
+
+/* -----------------------------------------------------------------------------
+ * Sanity checking. For each ObjectCode, maintain a list of address ranges
+ * which may be prodded during relocation, and abort if we try and write
+ * outside any of these.
+ */
+static void addProddableBlock ( ObjectCode* oc, void* start, int size )
+{
+ ProddableBlock* pb
+ = stgMallocBytes(sizeof(ProddableBlock), "addProddableBlock");
+ /* debugBelch("aPB %p %p %d\n", oc, start, size); */
+ ASSERT(size > 0);
+ pb->start = start;
+ pb->size = size;
+ pb->next = oc->proddables;
+ oc->proddables = pb;
+}
+
+static void checkProddableBlock ( ObjectCode* oc, void* addr )
+{
+ ProddableBlock* pb;
+ for (pb = oc->proddables; pb != NULL; pb = pb->next) {
+ char* s = (char*)(pb->start);
+ char* e = s + pb->size - 1;
+ char* a = (char*)addr;
+ /* Assumes that the biggest fixup involves a 4-byte write. This
+ probably needs to be changed to 8 (ie, +7) on 64-bit
+ plats. */
+ if (a >= s && (a+3) <= e) return;
+ }
+ barf("checkProddableBlock: invalid fixup in runtime linker");
+}
+
+/* -----------------------------------------------------------------------------
+ * Section management.
+ */
+static void addSection ( ObjectCode* oc, SectionKind kind,
+ void* start, void* end )
+{
+ Section* s = stgMallocBytes(sizeof(Section), "addSection");
+ s->start = start;
+ s->end = end;
+ s->kind = kind;
+ s->next = oc->sections;
+ oc->sections = s;
+ /*
+ debugBelch("addSection: %p-%p (size %d), kind %d\n",
+ start, ((char*)end)-1, end - start + 1, kind );
+ */
+}
+
+
+/* --------------------------------------------------------------------------
+ * PowerPC specifics (jump islands)
+ * ------------------------------------------------------------------------*/
+
+#if defined(powerpc_HOST_ARCH)
+
+/*
+ ocAllocateJumpIslands
+
+ Allocate additional space at the end of the object file image to make room
+ for jump islands.
+
+ PowerPC relative branch instructions have a 24 bit displacement field.
+ As PPC code is always 4-byte-aligned, this yields a +-32MB range.
+ If a particular imported symbol is outside this range, we have to redirect
+ the jump to a short piece of new code that just loads the 32bit absolute
+ address and jumps there.
+ This function just allocates space for one 16 byte ppcJumpIsland for every
+ undefined symbol in the object file. The code for the islands is filled in by
+ makeJumpIsland below.
+*/
+
/* Grow the object image by 'count' zero-initialised ppcJumpIsland
   slots (see the comment block above); 'first' is the symbol index of
   the first island.  Returns 1 on success, 0 on failure. */
static int ocAllocateJumpIslands( ObjectCode* oc, int count, int first )
{
#ifdef USE_MMAP
  int pagesize, n, m;
#endif
  int aligned;
  int misalignment = 0;
#if darwin_HOST_OS
  misalignment = oc->misalignment;
#endif

  if( count > 0 )
  {
    // round up to the nearest 4
    aligned = (oc->fileSize + 3) & ~3;

#ifdef USE_MMAP
    #ifndef linux_HOST_OS /* mremap is a linux extension */
        #error ocAllocateJumpIslands doesnt want USE_MMAP to be defined
    #endif

    pagesize = getpagesize();
    n = ROUND_UP( oc->fileSize, pagesize );
    m = ROUND_UP( aligned + sizeof (ppcJumpIsland) * count, pagesize );

    /* If we have a half-page-size file and map one page of it then
     * the part of the page after the size of the file remains accessible.
     * If, however, we map in 2 pages, the 2nd page is not accessible
     * and will give a "Bus Error" on access. To get around this, we check
     * if we need any extra pages for the jump islands and map them in
     * anonymously. We must check that we actually require extra pages
     * otherwise the attempt to mmap 0 pages of anonymous memory will
     * fail -EINVAL.
     */

    if( m > n )
    {
      /* The effect of this mremap() call is only the ensure that we have
       * a sufficient number of virtually contiguous pages. As returned from
       * mremap, the pages past the end of the file are not backed. We give
       * them a backing by using MAP_FIXED to map in anonymous pages.
       */
      oc->image = mremap( oc->image, n, m, MREMAP_MAYMOVE );

      if( oc->image == MAP_FAILED )
      {
        errorBelch( "Unable to mremap for Jump Islands\n" );
        return 0;
      }

      if( mmap( oc->image + n, m - n, PROT_READ | PROT_WRITE | PROT_EXEC,
                MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, 0, 0 ) == MAP_FAILED )
      {
        errorBelch( "Unable to mmap( MAP_FIXED ) for Jump Islands\n" );
        return 0;
      }
    }

#else
    /* realloc path: temporarily undo the deliberate Mach-O
       misalignment (see loadObj) so the allocator sees the true
       allocation start, then re-apply it. */
    oc->image -= misalignment;
    oc->image = stgReallocBytes( oc->image,
                                 misalignment +
                                 aligned + sizeof (ppcJumpIsland) * count,
                                 "ocAllocateJumpIslands" );
    oc->image += misalignment;
#endif /* USE_MMAP */

    /* islands live just past the (4-byte-aligned) end of the image */
    oc->jump_islands = (ppcJumpIsland *) (oc->image + aligned);
    memset( oc->jump_islands, 0, sizeof (ppcJumpIsland) * count );
  }
  else
     oc->jump_islands = NULL;

  oc->island_start_symbol = first;
  oc->n_islands = count;

  return 1;
}
+
+static unsigned long makeJumpIsland( ObjectCode* oc,
+ unsigned long symbolNumber,
+ unsigned long target )
+{
+ ppcJumpIsland *island;
+
+ if( symbolNumber < oc->island_start_symbol ||
+ symbolNumber - oc->island_start_symbol > oc->n_islands)
+ return 0;
+
+ island = &oc->jump_islands[symbolNumber - oc->island_start_symbol];
+
+ // lis r12, hi16(target)
+ island->lis_r12 = 0x3d80;
+ island->hi_addr = target >> 16;
+
+ // ori r12, r12, lo16(target)
+ island->ori_r12_r12 = 0x618c;
+ island->lo_addr = target & 0xffff;
+
+ // mtctr r12
+ island->mtctr_r12 = 0x7d8903a6;
+
+ // bctr
+ island->bctr = 0x4e800420;
+
+ return (unsigned long) island;
+}
+
+/*
+ ocFlushInstructionCache
+
+ Flush the data & instruction caches.
+ Because the PPC has split data/instruction caches, we have to
+ do that whenever we modify code at runtime.
+ */
+
static void ocFlushInstructionCache( ObjectCode *oc )
{
    /* number of 4-byte words covering the image plus its jump islands */
    int n = (oc->fileSize + sizeof( ppcJumpIsland ) * oc->n_islands + 3) / 4;
    unsigned long *p = (unsigned long *) oc->image;

    while( n-- )
    {
        /* dcbf: flush the data-cache block to memory;
           icbi: invalidate the matching instruction-cache block */
        __asm__ volatile ( "dcbf 0,%0\n\t"
                           "sync\n\t"
                           "icbi 0,%0"
                           :
                           : "r" (p)
                         );
        p++;
    }
    /* final sync/isync so subsequent instruction fetches see the new
       code */
    __asm__ volatile ( "sync\n\t"
                       "isync"
                     );
}
+#endif
+
+/* --------------------------------------------------------------------------
+ * PEi386 specifics (Win32 targets)
+ * ------------------------------------------------------------------------*/
+
+/* The information for this linker comes from
+ Microsoft Portable Executable
+ and Common Object File Format Specification
+ revision 5.1 January 1998
+ which SimonM says comes from the MS Developer Network CDs.
+
+ It can be found there (on older CDs), but can also be found
+ online at:
+
+ http://www.microsoft.com/hwdev/hardware/PECOFF.asp
+
+ (this is Rev 6.0 from February 1999).
+
+ Things move, so if that fails, try searching for it via
+
+ http://www.google.com/search?q=PE+COFF+specification
+
+ The ultimate reference for the PE format is the Winnt.h
+ header file that comes with the Platform SDKs; as always,
+ implementations will drift wrt their documentation.
+
+ A good background article on the PE format is Matt Pietrek's
+ March 1994 article in Microsoft System Journal (MSJ)
+ (Vol.9, No. 3): "Peering Inside the PE: A Tour of the
+ Win32 Portable Executable File Format." The info in there
+ has recently been updated in a two part article in
+ MSDN magazine, issues Feb and March 2002,
+ "Inside Windows: An In-Depth Look into the Win32 Portable
+ Executable File Format"
+
+ John Levine's book "Linkers and Loaders" contains useful
+ info on PE too.
+*/
+
+
+#if defined(OBJFORMAT_PEi386)
+
+
+
/* Fixed-width scalar types for describing on-disk COFF structures. */
typedef unsigned char  UChar;
typedef unsigned short UInt16;
typedef unsigned int   UInt32;
typedef          int   Int32;


/* COFF file header.  NB: the on-disk sizes of these structs are given
   by the sizeof_COFF_* constants below; they can differ from sizeof()
   because gcc pads the structs to a whole number of words -- hence
   all array indexing goes through myindex(). */
typedef
   struct {
      UInt16 Machine;
      UInt16 NumberOfSections;
      UInt32 TimeDateStamp;
      UInt32 PointerToSymbolTable;
      UInt32 NumberOfSymbols;
      UInt16 SizeOfOptionalHeader;
      UInt16 Characteristics;
   }
   COFF_header;

#define sizeof_COFF_header 20


/* Section header; Name is NUL-padded to 8 bytes, or a string-table
   reference for longer names (see printName/copyName). */
typedef
   struct {
      UChar  Name[8];
      UInt32 VirtualSize;
      UInt32 VirtualAddress;
      UInt32 SizeOfRawData;
      UInt32 PointerToRawData;
      UInt32 PointerToRelocations;
      UInt32 PointerToLinenumbers;
      UInt16 NumberOfRelocations;
      UInt16 NumberOfLineNumbers;
      UInt32 Characteristics;
   }
   COFF_section;

#define sizeof_COFF_section 40


/* Symbol-table entry: 18 bytes on disk, so not naturally aligned. */
typedef
   struct {
      UChar  Name[8];
      UInt32 Value;
      UInt16 SectionNumber;
      UInt16 Type;
      UChar  StorageClass;
      UChar  NumberOfAuxSymbols;
   }
   COFF_symbol;

#define sizeof_COFF_symbol 18


/* Relocation entry: 10 bytes on disk. */
typedef
   struct {
      UInt32 VirtualAddress;
      UInt32 SymbolTableIndex;
      UInt16 Type;
   }
   COFF_reloc;

#define sizeof_COFF_reloc 10
+
+
/* From PE spec doc, section 3.3.2 */
/* Note use of MYIMAGE_* since IMAGE_* are already defined in
   windows.h -- for the same purpose, but I want to know what I'm
   getting, here. */
/* COFF_header.Characteristics flags */
#define MYIMAGE_FILE_RELOCS_STRIPPED     0x0001
#define MYIMAGE_FILE_EXECUTABLE_IMAGE    0x0002
#define MYIMAGE_FILE_DLL                 0x2000
#define MYIMAGE_FILE_SYSTEM              0x1000
#define MYIMAGE_FILE_BYTES_REVERSED_HI   0x8000
#define MYIMAGE_FILE_BYTES_REVERSED_LO   0x0080
#define MYIMAGE_FILE_32BIT_MACHINE       0x0100

/* From PE spec doc, section 5.4.2 and 5.4.4 */
/* COFF_symbol.StorageClass / SectionNumber values */
#define MYIMAGE_SYM_CLASS_EXTERNAL       2
#define MYIMAGE_SYM_CLASS_STATIC         3
#define MYIMAGE_SYM_UNDEFINED            0

/* From PE spec doc, section 4.1 */
/* COFF_section.Characteristics flags */
#define MYIMAGE_SCN_CNT_CODE             0x00000020
#define MYIMAGE_SCN_CNT_INITIALIZED_DATA 0x00000040
#define MYIMAGE_SCN_LNK_NRELOC_OVFL      0x01000000

/* From PE spec doc, section 5.2.1 */
/* COFF_reloc.Type values for i386 */
#define MYIMAGE_REL_I386_DIR32           0x0006
#define MYIMAGE_REL_I386_REL32           0x0014
+
+
+/* We use myindex to calculate array addresses, rather than
+ simply doing the normal subscript thing. That's because
+ some of the above structs have sizes which are not
+ a whole number of words. GCC rounds their sizes up to a
+ whole number of words, which means that the address calcs
+ arising from using normal C indexing or pointer arithmetic
+ are just plain wrong. Sigh.
+*/
+static UChar *
+myindex ( int scale, void* base, int index )
+{
+ return
+ ((UChar*)base) + scale * index;
+}
+
+
+static void
+printName ( UChar* name, UChar* strtab )
+{
+ if (name[0]==0 && name[1]==0 && name[2]==0 && name[3]==0) {
+ UInt32 strtab_offset = * (UInt32*)(name+4);
+ debugBelch("%s", strtab + strtab_offset );
+ } else {
+ int i;
+ for (i = 0; i < 8; i++) {
+ if (name[i] == 0) break;
+ debugBelch("%c", name[i] );
+ }
+ }
+}
+
+
+static void
+copyName ( UChar* name, UChar* strtab, UChar* dst, int dstSize )
+{
+ if (name[0]==0 && name[1]==0 && name[2]==0 && name[3]==0) {
+ UInt32 strtab_offset = * (UInt32*)(name+4);
+ strncpy ( dst, strtab+strtab_offset, dstSize );
+ dst[dstSize-1] = 0;
+ } else {
+ int i = 0;
+ while (1) {
+ if (i >= 8) break;
+ if (name[i] == 0) break;
+ dst[i] = name[i];
+ i++;
+ }
+ dst[i] = 0;
+ }
+}
+
+
+static UChar *
+cstring_from_COFF_symbol_name ( UChar* name, UChar* strtab )
+{
+ UChar* newstr;
+ /* If the string is longer than 8 bytes, look in the
+ string table for it -- this will be correctly zero terminated.
+ */
+ if (name[0]==0 && name[1]==0 && name[2]==0 && name[3]==0) {
+ UInt32 strtab_offset = * (UInt32*)(name+4);
+ return ((UChar*)strtab) + strtab_offset;
+ }
+ /* Otherwise, if shorter than 8 bytes, return the original,
+ which by defn is correctly terminated.
+ */
+ if (name[7]==0) return name;
+ /* The annoying case: 8 bytes. Copy into a temporary
+ (which is never freed ...)
+ */
+ newstr = stgMallocBytes(9, "cstring_from_COFF_symbol_name");
+ ASSERT(newstr);
+ strncpy(newstr,name,8);
+ newstr[8] = 0;
+ return newstr;
+}
+
+
+/* Just compares the short names (first 8 chars) */
+static COFF_section *
+findPEi386SectionCalled ( ObjectCode* oc, char* name )
+{
+ int i;
+ COFF_header* hdr
+ = (COFF_header*)(oc->image);
+ COFF_section* sectab
+ = (COFF_section*) (
+ ((UChar*)(oc->image))
+ + sizeof_COFF_header + hdr->SizeOfOptionalHeader
+ );
+ for (i = 0; i < hdr->NumberOfSections; i++) {
+ UChar* n1;
+ UChar* n2;
+ COFF_section* section_i
+ = (COFF_section*)
+ myindex ( sizeof_COFF_section, sectab, i );
+ n1 = (UChar*) &(section_i->Name);
+ n2 = name;
+ if (n1[0]==n2[0] && n1[1]==n2[1] && n1[2]==n2[2] &&
+ n1[3]==n2[3] && n1[4]==n2[4] && n1[5]==n2[5] &&
+ n1[6]==n2[6] && n1[7]==n2[7])
+ return section_i;
+ }
+
+ return NULL;
+}
+
+
+static void
+zapTrailingAtSign ( UChar* sym )
+{
+# define my_isdigit(c) ((c) >= '0' && (c) <= '9')
+ int i, j;
+ if (sym[0] == 0) return;
+ i = 0;
+ while (sym[i] != 0) i++;
+ i--;
+ j = i;
+ while (j > 0 && my_isdigit(sym[j])) j--;
+ if (j > 0 && sym[j] == '@' && j != i) sym[j] = 0;
+# undef my_isdigit
+}
+
+
+/* Verify that oc->image is a loadable x86 PEi386/COFF object file:
+   check the machine type, reject images with a nonempty optional
+   header, and reject executables/DLLs/system images.  When the
+   'linker' debug flag is set, additionally dump the COFF header,
+   section table, relocations, string table and symbol table.
+   Returns 1 if the image looks usable, 0 (after errorBelch)
+   otherwise. */
+static int
+ocVerifyImage_PEi386 ( ObjectCode* oc )
+{
+   int i;
+   UInt32 j, noRelocs;
+   COFF_header* hdr;
+   COFF_section* sectab;
+   COFF_symbol* symtab;
+   UChar* strtab;
+   /* debugBelch("\nLOADING %s\n", oc->fileName); */
+   hdr = (COFF_header*)(oc->image);
+   /* The section table follows the COFF header and the (here, empty)
+      optional header. */
+   sectab = (COFF_section*) (
+               ((UChar*)(oc->image))
+               + sizeof_COFF_header + hdr->SizeOfOptionalHeader
+            );
+   symtab = (COFF_symbol*) (
+               ((UChar*)(oc->image))
+               + hdr->PointerToSymbolTable
+            );
+   /* The COFF string table begins immediately after the symbol table;
+      its first 4 bytes hold its own total size. */
+   strtab = ((UChar*)symtab)
+            + hdr->NumberOfSymbols * sizeof_COFF_symbol;
+
+   /* 0x14c == IMAGE_FILE_MACHINE_I386: only x86 objects are handled. */
+   if (hdr->Machine != 0x14c) {
+      errorBelch("%s: Not x86 PEi386", oc->fileName);
+      return 0;
+   }
+   /* Relocatable object files carry no optional header. */
+   if (hdr->SizeOfOptionalHeader != 0) {
+      errorBelch("%s: PEi386 with nonempty optional header", oc->fileName);
+      return 0;
+   }
+   if ( /* (hdr->Characteristics & MYIMAGE_FILE_RELOCS_STRIPPED) || */
+        (hdr->Characteristics & MYIMAGE_FILE_EXECUTABLE_IMAGE) ||
+        (hdr->Characteristics & MYIMAGE_FILE_DLL) ||
+        (hdr->Characteristics & MYIMAGE_FILE_SYSTEM) ) {
+      errorBelch("%s: Not a PEi386 object file", oc->fileName);
+      return 0;
+   }
+   if ( (hdr->Characteristics & MYIMAGE_FILE_BYTES_REVERSED_HI)
+        /* || !(hdr->Characteristics & MYIMAGE_FILE_32BIT_MACHINE) */ ) {
+      errorBelch("%s: Invalid PEi386 word size or endiannness: %d",
+                 oc->fileName,
+                 (int)(hdr->Characteristics));
+      return 0;
+   }
+   /* If the string table size is way crazy, this might indicate that
+      there are more than 64k relocations, despite claims to the
+      contrary. Hence this test. */
+   /* debugBelch("strtab size %d\n", * (UInt32*)strtab); */
+#if 0
+   if ( (*(UInt32*)strtab) > 600000 ) {
+      /* Note that 600k has no special significance other than being
+         big enough to handle the almost-2MB-sized lumps that
+         constitute HSwin32*.o. */
+      debugBelch("PEi386 object has suspiciously large string table; > 64k relocs?");
+      return 0;
+   }
+#endif
+
+   /* No further verification after this point; only debug printing. */
+   i = 0;
+   IF_DEBUG(linker, i=1);
+   if (i == 0) return 1;
+
+   debugBelch( "sectab offset = %d\n", ((UChar*)sectab) - ((UChar*)hdr) );
+   debugBelch( "symtab offset = %d\n", ((UChar*)symtab) - ((UChar*)hdr) );
+   debugBelch( "strtab offset = %d\n", ((UChar*)strtab) - ((UChar*)hdr) );
+
+   debugBelch("\n" );
+   debugBelch( "Machine: 0x%x\n", (UInt32)(hdr->Machine) );
+   debugBelch( "# sections: %d\n", (UInt32)(hdr->NumberOfSections) );
+   debugBelch( "time/date: 0x%x\n", (UInt32)(hdr->TimeDateStamp) );
+   debugBelch( "symtab offset: %d\n", (UInt32)(hdr->PointerToSymbolTable) );
+   debugBelch( "# symbols: %d\n", (UInt32)(hdr->NumberOfSymbols) );
+   debugBelch( "sz of opt hdr: %d\n", (UInt32)(hdr->SizeOfOptionalHeader) );
+   debugBelch( "characteristics: 0x%x\n", (UInt32)(hdr->Characteristics) );
+
+   /* Print the section table. */
+   debugBelch("\n" );
+   for (i = 0; i < hdr->NumberOfSections; i++) {
+      COFF_reloc* reltab;
+      COFF_section* sectab_i
+         = (COFF_section*)
+           myindex ( sizeof_COFF_section, sectab, i );
+      debugBelch(
+                "\n"
+                "section %d\n"
+                " name `",
+                i
+              );
+      printName ( sectab_i->Name, strtab );
+      debugBelch(
+                "'\n"
+                " vsize %d\n"
+                " vaddr %d\n"
+                " data sz %d\n"
+                " data off %d\n"
+                " num rel %d\n"
+                " off rel %d\n"
+                " ptr raw 0x%x\n",
+                sectab_i->VirtualSize,
+                sectab_i->VirtualAddress,
+                sectab_i->SizeOfRawData,
+                sectab_i->PointerToRawData,
+                sectab_i->NumberOfRelocations,
+                sectab_i->PointerToRelocations,
+                sectab_i->PointerToRawData
+              );
+      reltab = (COFF_reloc*) (
+                  ((UChar*)(oc->image)) + sectab_i->PointerToRelocations
+               );
+
+      if ( sectab_i->Characteristics & MYIMAGE_SCN_LNK_NRELOC_OVFL ) {
+         /* If the relocation field (a short) has overflowed, the
+          * real count can be found in the first reloc entry.
+          *
+          * See Section 4.1 (last para) of the PE spec (rev6.0).
+          */
+         COFF_reloc* rel = (COFF_reloc*)
+                            myindex ( sizeof_COFF_reloc, reltab, 0 );
+         noRelocs = rel->VirtualAddress;
+         j = 1;
+      } else {
+         noRelocs = sectab_i->NumberOfRelocations;
+         j = 0;
+      }
+
+      for (; j < noRelocs; j++) {
+         COFF_symbol* sym;
+         COFF_reloc* rel = (COFF_reloc*)
+                           myindex ( sizeof_COFF_reloc, reltab, j );
+         debugBelch(
+                   " type 0x%-4x vaddr 0x%-8x name `",
+                   (UInt32)rel->Type,
+                   rel->VirtualAddress );
+         sym = (COFF_symbol*)
+               myindex ( sizeof_COFF_symbol, symtab, rel->SymbolTableIndex );
+         /* Hmm..mysterious looking offset - what's it for? SOF */
+         printName ( sym->Name, strtab -10 );
+         debugBelch("'\n" );
+      }
+
+      debugBelch("\n" );
+   }
+   debugBelch("\n" );
+   debugBelch("string table has size 0x%x\n", * (UInt32*)strtab );
+   debugBelch("---START of string table---\n");
+   /* Start at 4: the first 4 bytes of the string table are its size
+      field, not string data. */
+   for (i = 4; i < *(Int32*)strtab; i++) {
+      if (strtab[i] == 0)
+         debugBelch("\n"); else
+         debugBelch("%c", strtab[i] );
+   }
+   debugBelch("--- END of string table---\n");
+
+   debugBelch("\n" );
+   i = 0;
+   while (1) {
+      COFF_symbol* symtab_i;
+      if (i >= (Int32)(hdr->NumberOfSymbols)) break;
+      symtab_i = (COFF_symbol*)
+                 myindex ( sizeof_COFF_symbol, symtab, i );
+      debugBelch(
+                "symbol %d\n"
+                " name `",
+                i
+              );
+      printName ( symtab_i->Name, strtab );
+      debugBelch(
+                "'\n"
+                " value 0x%x\n"
+                " 1+sec# %d\n"
+                " type 0x%x\n"
+                " sclass 0x%x\n"
+                " nAux %d\n",
+                symtab_i->Value,
+                (Int32)(symtab_i->SectionNumber),
+                (UInt32)symtab_i->Type,
+                (UInt32)symtab_i->StorageClass,
+                (UInt32)symtab_i->NumberOfAuxSymbols
+              );
+      /* Auxiliary records immediately follow their symbol; skip them. */
+      i += symtab_i->NumberOfAuxSymbols;
+      i++;
+   }
+
+   debugBelch("\n" );
+   return 1;
+}
+
+
+/* Walk a verified PEi386 image and (1) allocate zeroed storage for
+   .bss-style data, (2) register each interesting section with the
+   ObjectCode via addSection/addProddableBlock, and (3) enter every
+   defined external symbol into oc->symbols and the global symhash.
+   Returns 1 on success, 0 (after errorBelch) on failure. */
+static int
+ocGetNames_PEi386 ( ObjectCode* oc )
+{
+   COFF_header* hdr;
+   COFF_section* sectab;
+   COFF_symbol* symtab;
+   UChar* strtab;
+
+   UChar* sname;
+   void* addr;
+   int i;
+
+   hdr = (COFF_header*)(oc->image);
+   sectab = (COFF_section*) (
+               ((UChar*)(oc->image))
+               + sizeof_COFF_header + hdr->SizeOfOptionalHeader
+            );
+   symtab = (COFF_symbol*) (
+               ((UChar*)(oc->image))
+               + hdr->PointerToSymbolTable
+            );
+   /* String table sits directly after the symbol table. */
+   strtab = ((UChar*)(oc->image))
+            + hdr->PointerToSymbolTable
+            + hdr->NumberOfSymbols * sizeof_COFF_symbol;
+
+   /* Allocate space for any (local, anonymous) .bss sections. */
+
+   for (i = 0; i < hdr->NumberOfSections; i++) {
+      UInt32 bss_sz;
+      UChar* zspace;
+      COFF_section* sectab_i
+         = (COFF_section*)
+           myindex ( sizeof_COFF_section, sectab, i );
+      /* NOTE(review): Name is a fixed 8-byte field that is only
+         NUL-terminated when the name is shorter than 8 chars; strcmp
+         here relies on ".bss" (4 chars) never matching the prefix of
+         an unterminated 8-char name -- confirm. */
+      if (0 != strcmp(sectab_i->Name, ".bss")) continue;
+      /* sof 10/05: the PE spec text isn't too clear regarding what
+       * the SizeOfRawData field is supposed to hold for object
+       * file sections containing just uninitialized data -- for executables,
+       * it is supposed to be zero; unclear what it's supposed to be
+       * for object files. However, VirtualSize is guaranteed to be
+       * zero for object files, which definitely suggests that SizeOfRawData
+       * will be non-zero (where else would the size of this .bss section be
+       * stored?) Looking at the COFF_section info for incoming object files,
+       * this certainly appears to be the case.
+       *
+       * => I suspect we've been incorrectly handling .bss sections in (relocatable)
+       * object files up until now. This turned out to bite us with ghc-6.4.1's use
+       * of gcc-3.4.x, which has started to emit initially-zeroed-out local 'static'
+       * variable decls into to the .bss section. (The specific function in Q which
+       * triggered this is libraries/base/cbits/dirUtils.c:__hscore_getFolderPath())
+       */
+      if (sectab_i->VirtualSize == 0 && sectab_i->SizeOfRawData == 0) continue;
+      /* This is a non-empty .bss section. Allocate zeroed space for
+         it, and set its PointerToRawData field such that oc->image +
+         PointerToRawData == addr_of_zeroed_space. */
+      bss_sz = sectab_i->VirtualSize;
+      if ( bss_sz < sectab_i->SizeOfRawData) { bss_sz = sectab_i->SizeOfRawData; }
+      zspace = stgCallocBytes(1, bss_sz, "ocGetNames_PEi386(anonymous bss)");
+      /* May be "negative" as an offset, but oc->image + it lands back
+         on zspace; later code only ever uses the sum. */
+      sectab_i->PointerToRawData = ((UChar*)zspace) - ((UChar*)(oc->image));
+      addProddableBlock(oc, zspace, bss_sz);
+      /* debugBelch("BSS anon section at 0x%x\n", zspace); */
+   }
+
+   /* Copy section information into the ObjectCode. */
+
+   for (i = 0; i < hdr->NumberOfSections; i++) {
+      UChar* start;
+      UChar* end;
+      UInt32 sz;
+
+      SectionKind kind
+         = SECTIONKIND_OTHER;
+      COFF_section* sectab_i
+         = (COFF_section*)
+           myindex ( sizeof_COFF_section, sectab, i );
+      IF_DEBUG(linker, debugBelch("section name = %s\n", sectab_i->Name ));
+
+# if 0
+      /* I'm sure this is the Right Way to do it. However, the
+         alternative of testing the sectab_i->Name field seems to
+         work ok with Cygwin.
+      */
+      if (sectab_i->Characteristics & MYIMAGE_SCN_CNT_CODE ||
+          sectab_i->Characteristics & MYIMAGE_SCN_CNT_INITIALIZED_DATA)
+         kind = SECTIONKIND_CODE_OR_RODATA;
+# endif
+
+      /* Classify by well-known section names rather than by the
+         Characteristics flags (see #if 0 block above). */
+      if (0==strcmp(".text",sectab_i->Name) ||
+          0==strcmp(".rdata",sectab_i->Name)||
+          0==strcmp(".rodata",sectab_i->Name))
+         kind = SECTIONKIND_CODE_OR_RODATA;
+      if (0==strcmp(".data",sectab_i->Name) ||
+          0==strcmp(".bss",sectab_i->Name))
+         kind = SECTIONKIND_RWDATA;
+
+      /* For object files at most one of these is nonzero; take the
+         larger as the section size. */
+      ASSERT(sectab_i->SizeOfRawData == 0 || sectab_i->VirtualSize == 0);
+      sz = sectab_i->SizeOfRawData;
+      if (sz < sectab_i->VirtualSize) sz = sectab_i->VirtualSize;
+
+      start = ((UChar*)(oc->image)) + sectab_i->PointerToRawData;
+      end = start + sz - 1;
+
+      if (kind == SECTIONKIND_OTHER
+          /* Ignore sections called which contain stabs debugging
+             information. */
+          && 0 != strcmp(".stab", sectab_i->Name)
+          && 0 != strcmp(".stabstr", sectab_i->Name)
+          /* ignore constructor section for now */
+          && 0 != strcmp(".ctors", sectab_i->Name)
+         ) {
+         errorBelch("Unknown PEi386 section name `%s' (while processing: %s)", sectab_i->Name, oc->fileName);
+         return 0;
+      }
+
+      if (kind != SECTIONKIND_OTHER && end >= start) {
+         addSection(oc, kind, start, end);
+         addProddableBlock(oc, start, end - start + 1);
+      }
+   }
+
+   /* Copy exported symbols into the ObjectCode. */
+
+   oc->n_symbols = hdr->NumberOfSymbols;
+   oc->symbols = stgMallocBytes(oc->n_symbols * sizeof(char*),
+                                "ocGetNames_PEi386(oc->symbols)");
+   /* Call me paranoid; I don't care. */
+   for (i = 0; i < oc->n_symbols; i++)
+      oc->symbols[i] = NULL;
+
+   i = 0;
+   while (1) {
+      COFF_symbol* symtab_i;
+      if (i >= (Int32)(hdr->NumberOfSymbols)) break;
+      symtab_i = (COFF_symbol*)
+                 myindex ( sizeof_COFF_symbol, symtab, i );
+
+      addr = NULL;
+
+      if (symtab_i->StorageClass == MYIMAGE_SYM_CLASS_EXTERNAL
+          && symtab_i->SectionNumber != MYIMAGE_SYM_UNDEFINED) {
+         /* This symbol is global and defined, viz, exported */
+         /* for MYIMAGE_SYMCLASS_EXTERNAL
+            && !MYIMAGE_SYM_UNDEFINED,
+            the address of the symbol is:
+            address of relevant section + offset in section
+         */
+         COFF_section* sectabent
+            = (COFF_section*) myindex ( sizeof_COFF_section,
+                                        sectab,
+                                        symtab_i->SectionNumber-1 );
+         addr = ((UChar*)(oc->image))
+                + (sectabent->PointerToRawData
+                   + symtab_i->Value);
+      }
+      else
+      if (symtab_i->SectionNumber == MYIMAGE_SYM_UNDEFINED
+          && symtab_i->Value > 0) {
+         /* This symbol isn't in any section at all, ie, global bss.
+            Allocate zeroed space for it. */
+         addr = stgCallocBytes(1, symtab_i->Value,
+                               "ocGetNames_PEi386(non-anonymous bss)");
+         addSection(oc, SECTIONKIND_RWDATA, addr,
+                        ((UChar*)addr) + symtab_i->Value - 1);
+         addProddableBlock(oc, addr, symtab_i->Value);
+         /* debugBelch("BSS section at 0x%x\n", addr); */
+      }
+
+      if (addr != NULL ) {
+         sname = cstring_from_COFF_symbol_name ( symtab_i->Name, strtab );
+         /* debugBelch("addSymbol %p `%s \n", addr,sname); */
+         IF_DEBUG(linker, debugBelch("addSymbol %p `%s'\n", addr,sname);)
+         ASSERT(i >= 0 && i < oc->n_symbols);
+         /* cstring_from_COFF_symbol_name always succeeds. */
+         oc->symbols[i] = sname;
+         ghciInsertStrHashTable(oc->fileName, symhash, sname, addr);
+      } else {
+# if 0
+         debugBelch(
+                   "IGNORING symbol %d\n"
+                   " name `",
+                   i
+                 );
+         printName ( symtab_i->Name, strtab );
+         debugBelch(
+                   "'\n"
+                   " value 0x%x\n"
+                   " 1+sec# %d\n"
+                   " type 0x%x\n"
+                   " sclass 0x%x\n"
+                   " nAux %d\n",
+                   symtab_i->Value,
+                   (Int32)(symtab_i->SectionNumber),
+                   (UInt32)symtab_i->Type,
+                   (UInt32)symtab_i->StorageClass,
+                   (UInt32)symtab_i->NumberOfAuxSymbols
+                 );
+# endif
+      }
+
+      /* Skip this symbol's auxiliary records. */
+      i += symtab_i->NumberOfAuxSymbols;
+      i++;
+   }
+
+   return 1;
+}
+
+
+/* Resolve all relocations in a PEi386 object.  For every reloc in
+   every non-debug section: locate the 32-bit field to patch (pP),
+   compute the target symbol's address (S) -- either from the symbol's
+   own section (STATIC symbols) or via the symbol tables -- and patch
+   pP absolutely (DIR32) or PC-relatively (REL32).
+   Returns 1 on success, 0 (after errorBelch) on failure. */
+static int
+ocResolve_PEi386 ( ObjectCode* oc )
+{
+   COFF_header* hdr;
+   COFF_section* sectab;
+   COFF_symbol* symtab;
+   UChar* strtab;
+
+   UInt32 A;
+   UInt32 S;
+   UInt32* pP;
+
+   int i;
+   UInt32 j, noRelocs;
+
+   /* ToDo: should be variable-sized? But is at least safe in the
+      sense of buffer-overrun-proof. */
+   char symbol[1000];
+   /* debugBelch("resolving for %s\n", oc->fileName); */
+
+   hdr = (COFF_header*)(oc->image);
+   sectab = (COFF_section*) (
+               ((UChar*)(oc->image))
+               + sizeof_COFF_header + hdr->SizeOfOptionalHeader
+            );
+   symtab = (COFF_symbol*) (
+               ((UChar*)(oc->image))
+               + hdr->PointerToSymbolTable
+            );
+   strtab = ((UChar*)(oc->image))
+            + hdr->PointerToSymbolTable
+            + hdr->NumberOfSymbols * sizeof_COFF_symbol;
+
+   for (i = 0; i < hdr->NumberOfSections; i++) {
+      COFF_section* sectab_i
+         = (COFF_section*)
+           myindex ( sizeof_COFF_section, sectab, i );
+      COFF_reloc* reltab
+         = (COFF_reloc*) (
+              ((UChar*)(oc->image)) + sectab_i->PointerToRelocations
+           );
+
+      /* Ignore sections called which contain stabs debugging
+         information. */
+      if (0 == strcmp(".stab", sectab_i->Name)
+          || 0 == strcmp(".stabstr", sectab_i->Name)
+          || 0 == strcmp(".ctors", sectab_i->Name))
+         continue;
+
+      if ( sectab_i->Characteristics & MYIMAGE_SCN_LNK_NRELOC_OVFL ) {
+         /* If the relocation field (a short) has overflowed, the
+          * real count can be found in the first reloc entry.
+          *
+          * See Section 4.1 (last para) of the PE spec (rev6.0).
+          *
+          * Nov2003 update: the GNU linker still doesn't correctly
+          * handle the generation of relocatable object files with
+          * overflown relocations. Hence the output to warn of potential
+          * troubles.
+          */
+         COFF_reloc* rel = (COFF_reloc*)
+                            myindex ( sizeof_COFF_reloc, reltab, 0 );
+         noRelocs = rel->VirtualAddress;
+
+         /* 10/05: we now assume (and check for) a GNU ld that is capable
+          * of handling object files with (>2^16) of relocs.
+          */
+#if 0
+         debugBelch("WARNING: Overflown relocation field (# relocs found: %u)\n",
+                    noRelocs);
+#endif
+         j = 1;
+      } else {
+         noRelocs = sectab_i->NumberOfRelocations;
+         j = 0;
+      }
+
+
+      for (; j < noRelocs; j++) {
+         COFF_symbol* sym;
+         COFF_reloc* reltab_j
+            = (COFF_reloc*)
+              myindex ( sizeof_COFF_reloc, reltab, j );
+
+         /* the location to patch */
+         pP = (UInt32*)(
+                 ((UChar*)(oc->image))
+                 + (sectab_i->PointerToRawData
+                    + reltab_j->VirtualAddress
+                    - sectab_i->VirtualAddress )
+              );
+         /* the existing contents of pP */
+         A = *pP;
+         /* the symbol to connect to */
+         sym = (COFF_symbol*)
+               myindex ( sizeof_COFF_symbol,
+                         symtab, reltab_j->SymbolTableIndex );
+         IF_DEBUG(linker,
+                  debugBelch(
+                            "reloc sec %2d num %3d: type 0x%-4x "
+                            "vaddr 0x%-8x name `",
+                            i, j,
+                            (UInt32)reltab_j->Type,
+                            reltab_j->VirtualAddress );
+                            printName ( sym->Name, strtab );
+                            debugBelch("'\n" ));
+
+         if (sym->StorageClass == MYIMAGE_SYM_CLASS_STATIC) {
+            /* Section-local symbol: address is section base + value. */
+            COFF_section* section_sym
+               = findPEi386SectionCalled ( oc, sym->Name );
+            if (!section_sym) {
+               errorBelch("%s: can't find section `%s'", oc->fileName, sym->Name);
+               return 0;
+            }
+            S = ((UInt32)(oc->image))
+                + (section_sym->PointerToRawData
+                   + sym->Value);
+         } else {
+            copyName ( sym->Name, strtab, symbol, 1000-1 );
+            /* FIX: the original wrote "(void*)S = lookup...".  A cast
+               is not an lvalue in ISO C, and the GCC extension that
+               allowed assigning through one was removed in GCC 4.0;
+               assign to S directly instead.  (UInt32 and pointers are
+               the same width on this x86-only code path.) */
+            S = (UInt32) lookupLocalSymbol( oc, symbol );
+            if (S != 0) goto foundit;
+            S = (UInt32) lookupSymbol( symbol );
+            if (S != 0) goto foundit;
+            /* Retry with any stdcall "@NN" decoration stripped. */
+            zapTrailingAtSign ( symbol );
+            S = (UInt32) lookupLocalSymbol( oc, symbol );
+            if (S != 0) goto foundit;
+            S = (UInt32) lookupSymbol( symbol );
+            if (S != 0) goto foundit;
+            /* Newline first because the interactive linker has printed "linking..." */
+            errorBelch("\n%s: unknown symbol `%s'", oc->fileName, symbol);
+            return 0;
+           foundit:;
+         }
+         checkProddableBlock(oc, pP);
+         switch (reltab_j->Type) {
+            case MYIMAGE_REL_I386_DIR32:
+               *pP = A + S;
+               break;
+            case MYIMAGE_REL_I386_REL32:
+               /* Tricky. We have to insert a displacement at
+                  pP which, when added to the PC for the _next_
+                  insn, gives the address of the target (S).
+                  Problem is to know the address of the next insn
+                  when we only know pP. We assume that this
+                  literal field is always the last in the insn,
+                  so that the address of the next insn is pP+4
+                  -- hence the constant 4.
+                  Also I don't know if A should be added, but so
+                  far it has always been zero.
+
+                  SOF 05/2005: 'A' (old contents of *pP) have been observed
+                  to contain values other than zero (the 'wx' object file
+                  that came with wxhaskell-0.9.4; dunno how it was compiled..).
+                  So, add displacement to old value instead of asserting
+                  A to be zero. Fixes wxhaskell-related crashes, and no other
+                  ill effects have been observed.
+
+                  Update: the reason why we're seeing these more elaborate
+                  relocations is due to a switch in how the NCG compiles SRTs
+                  and offsets to them from info tables. SRTs live in .(ro)data,
+                  while info tables live in .text, causing GAS to emit REL32/DISP32
+                  relocations with non-zero values. Adding the displacement is
+                  the right thing to do.
+               */
+               *pP = S - ((UInt32)pP) - 4 + A;
+               break;
+            default:
+               debugBelch("%s: unhandled PEi386 relocation type %d",
+                     oc->fileName, reltab_j->Type);
+               return 0;
+         }
+
+      }
+   }
+
+   IF_DEBUG(linker, debugBelch("completed %s", oc->fileName));
+   return 1;
+}
+
+#endif /* defined(OBJFORMAT_PEi386) */
+
+
+/* --------------------------------------------------------------------------
+ * ELF specifics
+ * ------------------------------------------------------------------------*/
+
+#if defined(OBJFORMAT_ELF)
+
+#define FALSE 0
+#define TRUE 1
+
+#if defined(sparc_HOST_ARCH)
+# define ELF_TARGET_SPARC /* Used inside <elf.h> */
+#elif defined(i386_HOST_ARCH)
+# define ELF_TARGET_386 /* Used inside <elf.h> */
+#elif defined(x86_64_HOST_ARCH)
+# define ELF_TARGET_X64_64
+# define ELF_64BIT
+#elif defined (ia64_HOST_ARCH)
+# define ELF_TARGET_IA64 /* Used inside <elf.h> */
+# define ELF_64BIT
+# define ELF_FUNCTION_DESC /* calling convention uses function descriptors */
+# define ELF_NEED_GOT /* needs Global Offset Table */
+# define ELF_NEED_PLT /* needs Procedure Linkage Tables */
+#endif
+
+#if !defined(openbsd_HOST_OS)
+#include <elf.h>
+#else
+/* openbsd elf has things in different places, with diff names */
+#include <elf_abi.h>
+#include <machine/reloc.h>
+#define R_386_32 RELOC_32
+#define R_386_PC32 RELOC_PC32
+#endif
+
+/*
+ * Define a set of types which can be used for both ELF32 and ELF64
+ */
+
+#ifdef ELF_64BIT
+#define ELFCLASS ELFCLASS64
+#define Elf_Addr Elf64_Addr
+#define Elf_Word Elf64_Word
+#define Elf_Sword Elf64_Sword
+#define Elf_Ehdr Elf64_Ehdr
+#define Elf_Phdr Elf64_Phdr
+#define Elf_Shdr Elf64_Shdr
+#define Elf_Sym Elf64_Sym
+#define Elf_Rel Elf64_Rel
+#define Elf_Rela Elf64_Rela
+#define ELF_ST_TYPE ELF64_ST_TYPE
+#define ELF_ST_BIND ELF64_ST_BIND
+#define ELF_R_TYPE ELF64_R_TYPE
+#define ELF_R_SYM ELF64_R_SYM
+#else
+#define ELFCLASS ELFCLASS32
+#define Elf_Addr Elf32_Addr
+#define Elf_Word Elf32_Word
+#define Elf_Sword Elf32_Sword
+#define Elf_Ehdr Elf32_Ehdr
+#define Elf_Phdr Elf32_Phdr
+#define Elf_Shdr Elf32_Shdr
+#define Elf_Sym Elf32_Sym
+#define Elf_Rel Elf32_Rel
+#define Elf_Rela Elf32_Rela
+#ifndef ELF_ST_TYPE
+#define ELF_ST_TYPE ELF32_ST_TYPE
+#endif
+#ifndef ELF_ST_BIND
+#define ELF_ST_BIND ELF32_ST_BIND
+#endif
+#ifndef ELF_R_TYPE
+#define ELF_R_TYPE ELF32_R_TYPE
+#endif
+#ifndef ELF_R_SYM
+#define ELF_R_SYM ELF32_R_SYM
+#endif
+#endif
+
+
+/*
+ * Functions to allocate entries in dynamic sections. Currently we simply
+ * preallocate a large number, and we don't check if a entry for the given
+ * target already exists (a linear search is too slow). Ideally these
+ * entries would be associated with symbols.
+ */
+
+/* These sizes sufficient to load HSbase + HShaskell98 + a few modules */
+#define GOT_SIZE 0x20000
+#define FUNCTION_TABLE_SIZE 0x10000
+#define PLT_SIZE 0x08000
+
+#ifdef ELF_NEED_GOT
+static Elf_Addr got[GOT_SIZE];
+static unsigned int gotIndex;
+static Elf_Addr gp_val = (Elf_Addr)got;
+
+/* Claim the next free slot in the static GOT, store 'target' in it,
+   and return the slot's address.  Aborts via barf() when the table
+   is exhausted. */
+static Elf_Addr
+allocateGOTEntry(Elf_Addr target)
+{
+    Elf_Addr *slot;
+
+    if (gotIndex >= GOT_SIZE)
+        barf("Global offset table overflow");
+
+    slot = got + gotIndex;
+    gotIndex++;
+    *slot = target;
+    return (Elf_Addr)slot;
+}
+#endif
+
+#ifdef ELF_FUNCTION_DESC
+typedef struct {
+ Elf_Addr ip;
+ Elf_Addr gp;
+} FunctionDesc;
+
+static FunctionDesc functionTable[FUNCTION_TABLE_SIZE];
+static unsigned int functionTableIndex;
+
+/* Claim the next free function descriptor, point its ip at 'target'
+   and its gp at the module's GOT base, and return the descriptor's
+   address.  Aborts via barf() when the table is exhausted. */
+static Elf_Addr
+allocateFunctionDesc(Elf_Addr target)
+{
+    FunctionDesc *fd;
+
+    if (functionTableIndex >= FUNCTION_TABLE_SIZE)
+        barf("Function table overflow");
+
+    fd = functionTable + functionTableIndex;
+    functionTableIndex++;
+    fd->ip = target;
+    fd->gp = (Elf_Addr)gp_val;
+    return (Elf_Addr)fd;
+}
+
+/* Duplicate the function descriptor at 'target' into our own table
+   (both ip and gp fields) and return the copy's address. */
+static Elf_Addr
+copyFunctionDesc(Elf_Addr target)
+{
+    FunctionDesc *src = (FunctionDesc *)target;
+    FunctionDesc *dst;
+
+    dst = (FunctionDesc *)allocateFunctionDesc(src->ip);
+    dst->gp = src->gp;          /* keep the original's gp, not ours */
+    return (Elf_Addr)dst;
+}
+#endif
+
+#ifdef ELF_NEED_PLT
+#ifdef ia64_HOST_ARCH
+static void ia64_reloc_gprel22(Elf_Addr target, Elf_Addr value);
+static void ia64_reloc_pcrel21(Elf_Addr target, Elf_Addr value, ObjectCode *oc);
+
+static unsigned char plt_code[] =
+{
+ /* taken from binutils bfd/elfxx-ia64.c */
+ 0x0b, 0x78, 0x00, 0x02, 0x00, 0x24, /* [MMI] addl r15=0,r1;; */
+ 0x00, 0x41, 0x3c, 0x30, 0x28, 0xc0, /* ld8 r16=[r15],8 */
+ 0x01, 0x08, 0x00, 0x84, /* mov r14=r1;; */
+ 0x11, 0x08, 0x00, 0x1e, 0x18, 0x10, /* [MIB] ld8 r1=[r15] */
+ 0x60, 0x80, 0x04, 0x80, 0x03, 0x00, /* mov b6=r16 */
+ 0x60, 0x00, 0x80, 0x00 /* br.few b6;; */
+};
+
+/* If we can't get to the function descriptor via gp, take a local copy of it */
+#define PLT_RELOC(code, target) { \
+ Elf64_Sxword rel_value = target - gp_val; \
+ if ((rel_value > 0x1fffff) || (rel_value < -0x1fffff)) \
+ ia64_reloc_gprel22((Elf_Addr)code, copyFunctionDesc(target)); \
+ else \
+ ia64_reloc_gprel22((Elf_Addr)code, target); \
+ }
+#endif
+
+typedef struct {
+ unsigned char code[sizeof(plt_code)];
+} PLTEntry;
+
+/* Claim the next PLT slot in oc's PLT area, fill it with the stub
+   template and relocate the stub to reach 'target'; returns the
+   slot's address.  Aborts via barf() when the PLT is full. */
+static Elf_Addr
+allocatePLTEntry(Elf_Addr target, ObjectCode *oc)
+{
+    PLTEntry *table = (PLTEntry *)oc->plt;
+    PLTEntry *slot;
+
+    if (oc->pltIndex >= PLT_SIZE)
+        barf("Procedure table overflow");
+
+    slot = table + oc->pltIndex;
+    oc->pltIndex++;
+    memcpy(slot->code, plt_code, sizeof(slot->code));
+    PLT_RELOC(slot->code, target);
+    return (Elf_Addr)slot;
+}
+
+/* Total number of bytes to reserve for an object's PLT area. */
+static unsigned int
+PLTSize(void)
+{
+    return PLT_SIZE * sizeof(PLTEntry);
+}
+#endif
+
+
+#if x86_64_HOST_ARCH
+// On x86_64, 32-bit relocations are often used, which requires that
+// we can resolve a symbol to a 32-bit offset. However, shared
+// libraries are placed outside the 2Gb area, which leaves us with a
+// problem when we need to give a 32-bit offset to a symbol in a
+// shared library.
+//
+// For a function symbol, we can allocate a bounce sequence inside the
+// 2Gb area and resolve the symbol to this. The bounce sequence is
+// simply a long jump instruction to the real location of the symbol.
+//
+// For data references, we're screwed.
+//
+typedef struct {
+ unsigned char jmp[8]; /* 6 byte instruction: jmpq *0x00000002(%rip) */
+ void *addr;
+} x86_64_bounce;
+
+#define X86_64_BB_SIZE 1024
+
+static x86_64_bounce *x86_64_bounce_buffer = NULL;
+static nat x86_64_bb_next_off;
+
+/* Resolve 'lbl' (whose real address 'addr' lies outside the low 2Gb)
+   to a bounce entry allocated inside the 32-bit-reachable region, so
+   that 32-bit relocations can refer to it.  The entry is a
+   "jmpq *2(%rip)" followed by the 8-byte target address, and is also
+   recorded in symhash so later lookups resolve straight to it.
+   NOTE(review): when the current buffer fills, a fresh one is mmap'd
+   and the previous buffer is abandoned without being freed -- its
+   entries must stay live anyway, since patched code jumps into it. */
+static void*
+x86_64_high_symbol( char *lbl, void *addr )
+{
+    x86_64_bounce *bounce;
+
+    if ( x86_64_bounce_buffer == NULL ||
+         x86_64_bb_next_off >= X86_64_BB_SIZE ) {
+        /* MAP_32BIT asks the kernel to place the mapping in the low
+           2Gb, within reach of 32-bit offsets. */
+        x86_64_bounce_buffer =
+            mmap(NULL, X86_64_BB_SIZE * sizeof(x86_64_bounce),
+                 PROT_EXEC|PROT_READ|PROT_WRITE,
+                 MAP_PRIVATE|MAP_32BIT|MAP_ANONYMOUS, -1, 0);
+        if (x86_64_bounce_buffer == MAP_FAILED) {
+            barf("x86_64_high_symbol: mmap failed");
+        }
+        x86_64_bb_next_off = 0;
+    }
+    bounce = &x86_64_bounce_buffer[x86_64_bb_next_off];
+    /* ff 25 02 00 00 00 == jmpq *0x2(%rip): rip after this 6-byte
+       insn, plus displacement 2, lands on the 'addr' field at offset
+       8 of the entry (jmp[] is padded to 8 bytes). */
+    bounce->jmp[0] = 0xff;
+    bounce->jmp[1] = 0x25;
+    bounce->jmp[2] = 0x02;
+    bounce->jmp[3] = 0x00;
+    bounce->jmp[4] = 0x00;
+    bounce->jmp[5] = 0x00;
+    bounce->addr = addr;
+    x86_64_bb_next_off++;
+
+    IF_DEBUG(linker, debugBelch("x86_64: allocated bounce entry for %s->%p at %p\n",
+                                lbl, addr, bounce));
+
+    insertStrHashTable(symhash, lbl, bounce);
+    return bounce;
+}
+#endif
+
+
+/*
+ * Generic ELF functions
+ */
+
+/* Return a pointer to the contents of the first section of the given
+   sh_type, skipping the section-header string table and any string
+   table named ".stabstr" (debug info); NULL if none is found. */
+static char *
+findElfSection ( void* objImage, Elf_Word sh_type )
+{
+   char*     ehdrC = (char*)objImage;
+   Elf_Ehdr* ehdr  = (Elf_Ehdr*)ehdrC;
+   Elf_Shdr* shdr  = (Elf_Shdr*)(ehdrC + ehdr->e_shoff);
+   char*     sh_strtab = ehdrC + shdr[ehdr->e_shstrndx].sh_offset;
+   int k;
+
+   for (k = 0; k < ehdr->e_shnum; k++) {
+      if (shdr[k].sh_type != sh_type)
+         continue;
+      /* Ignore the section header's string table. */
+      if (k == ehdr->e_shstrndx)
+         continue;
+      /* Ignore string tables named .stabstr, as they contain
+         debugging info. */
+      if (0 == memcmp(".stabstr", sh_strtab + shdr[k].sh_name, 8))
+         continue;
+      return ehdrC + shdr[k].sh_offset;
+   }
+   return NULL;
+}
+
+#if defined(ia64_HOST_ARCH)
+/* Return the load address (p_vaddr) of the program segment that
+   contains 'vaddr'.
+   NOTE(review): if no segment contains vaddr, the loop falls off the
+   end and the function returns the *last* segment's p_vaddr (or 0
+   when there are no segments at all) instead of reporting an error;
+   callers appear to assume vaddr is always mapped -- confirm. */
+static Elf_Addr
+findElfSegment ( void* objImage, Elf_Addr vaddr )
+{
+ char* ehdrC = (char*)objImage;
+ Elf_Ehdr* ehdr = (Elf_Ehdr*)ehdrC;
+ Elf_Phdr* phdr = (Elf_Phdr*)(ehdrC + ehdr->e_phoff);
+ Elf_Addr segaddr = 0;
+ int i;
+
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ segaddr = phdr[i].p_vaddr;
+ if ((vaddr >= segaddr) && (vaddr < segaddr + phdr[i].p_memsz))
+ break;
+ }
+ return segaddr;
+}
+#endif
+
+/* Verify that oc->image is a relocatable ELF object this runtime can
+   link: check the magic bytes, ELF class (32/64-bit as configured),
+   endianness, e_type == ET_REL, a supported machine, exactly one
+   normal string table and at least one symbol table.  With the
+   'linker' debug flag set, also dump the section headers, string
+   tables and symbol tables.  Returns 1 on success, 0 (after
+   errorBelch) on failure. */
+static int
+ocVerifyImage_ELF ( ObjectCode* oc )
+{
+   Elf_Shdr* shdr;
+   Elf_Sym* stab;
+   int i, j, nent, nstrtab, nsymtabs;
+   char* sh_strtab;
+   char* strtab;
+
+   char* ehdrC = (char*)(oc->image);
+   Elf_Ehdr* ehdr = (Elf_Ehdr*)ehdrC;
+
+   if (ehdr->e_ident[EI_MAG0] != ELFMAG0 ||
+       ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
+       ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
+       ehdr->e_ident[EI_MAG3] != ELFMAG3) {
+      errorBelch("%s: not an ELF object", oc->fileName);
+      return 0;
+   }
+
+   /* ELFCLASS expands to ELFCLASS32 or ELFCLASS64 depending on the
+      host configuration (see the ELF_64BIT block above). */
+   if (ehdr->e_ident[EI_CLASS] != ELFCLASS) {
+      errorBelch("%s: unsupported ELF format", oc->fileName);
+      return 0;
+   }
+
+   if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) {
+       IF_DEBUG(linker,debugBelch( "Is little-endian\n" ));
+   } else
+   if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) {
+       IF_DEBUG(linker,debugBelch( "Is big-endian\n" ));
+   } else {
+       errorBelch("%s: unknown endiannness", oc->fileName);
+       return 0;
+   }
+
+   if (ehdr->e_type != ET_REL) {
+      errorBelch("%s: not a relocatable object (.o) file", oc->fileName);
+      return 0;
+   }
+   IF_DEBUG(linker, debugBelch( "Is a relocatable object (.o) file\n" ));
+
+   IF_DEBUG(linker,debugBelch( "Architecture is " ));
+   switch (ehdr->e_machine) {
+      case EM_386: IF_DEBUG(linker,debugBelch( "x86" )); break;
+      case EM_SPARC: IF_DEBUG(linker,debugBelch( "sparc" )); break;
+#ifdef EM_IA_64
+      case EM_IA_64: IF_DEBUG(linker,debugBelch( "ia64" )); break;
+#endif
+      case EM_PPC: IF_DEBUG(linker,debugBelch( "powerpc32" )); break;
+#ifdef EM_X86_64
+      case EM_X86_64: IF_DEBUG(linker,debugBelch( "x86_64" )); break;
+#endif
+      default: IF_DEBUG(linker,debugBelch( "unknown" ));
+               errorBelch("%s: unknown architecture", oc->fileName);
+               return 0;
+   }
+
+   IF_DEBUG(linker,debugBelch(
+             "\nSection header table: start %ld, n_entries %d, ent_size %d\n",
+             (long)ehdr->e_shoff, ehdr->e_shnum, ehdr->e_shentsize ));
+
+   ASSERT (ehdr->e_shentsize == sizeof(Elf_Shdr));
+
+   shdr = (Elf_Shdr*) (ehdrC + ehdr->e_shoff);
+
+   if (ehdr->e_shstrndx == SHN_UNDEF) {
+      errorBelch("%s: no section header string table", oc->fileName);
+      return 0;
+   } else {
+      IF_DEBUG(linker,debugBelch( "Section header string table is section %d\n",
+                          ehdr->e_shstrndx));
+      sh_strtab = ehdrC + shdr[ehdr->e_shstrndx].sh_offset;
+   }
+
+   /* Dump the section table (debug only). */
+   for (i = 0; i < ehdr->e_shnum; i++) {
+      IF_DEBUG(linker,debugBelch("%2d: ", i ));
+      IF_DEBUG(linker,debugBelch("type=%2d ", (int)shdr[i].sh_type ));
+      IF_DEBUG(linker,debugBelch("size=%4d ", (int)shdr[i].sh_size ));
+      IF_DEBUG(linker,debugBelch("offs=%4d ", (int)shdr[i].sh_offset ));
+      IF_DEBUG(linker,debugBelch(" (%p .. %p) ",
+               ehdrC + shdr[i].sh_offset,
+                      ehdrC + shdr[i].sh_offset + shdr[i].sh_size - 1));
+
+      if (shdr[i].sh_type == SHT_REL) {
+          IF_DEBUG(linker,debugBelch("Rel " ));
+      } else if (shdr[i].sh_type == SHT_RELA) {
+          IF_DEBUG(linker,debugBelch("RelA " ));
+      } else {
+          IF_DEBUG(linker,debugBelch(" "));
+      }
+      if (sh_strtab) {
+          IF_DEBUG(linker,debugBelch("sname=%s\n", sh_strtab + shdr[i].sh_name ));
+      }
+   }
+
+   IF_DEBUG(linker,debugBelch( "\nString tables" ));
+   strtab = NULL;
+   nstrtab = 0;
+   for (i = 0; i < ehdr->e_shnum; i++) {
+      if (shdr[i].sh_type == SHT_STRTAB
+          /* Ignore the section header's string table. */
+          && i != ehdr->e_shstrndx
+          /* Ignore string tables named .stabstr, as they contain
+             debugging info. */
+          && 0 != memcmp(".stabstr", sh_strtab + shdr[i].sh_name, 8)
+         ) {
+         IF_DEBUG(linker,debugBelch(" section %d is a normal string table", i ));
+         strtab = ehdrC + shdr[i].sh_offset;
+         nstrtab++;
+      }
+   }
+   /* Exactly one "normal" string table is required. */
+   if (nstrtab != 1) {
+      errorBelch("%s: no string tables, or too many", oc->fileName);
+      return 0;
+   }
+
+   nsymtabs = 0;
+   IF_DEBUG(linker,debugBelch( "\nSymbol tables" ));
+   for (i = 0; i < ehdr->e_shnum; i++) {
+      if (shdr[i].sh_type != SHT_SYMTAB) continue;
+      IF_DEBUG(linker,debugBelch( "section %d is a symbol table\n", i ));
+      nsymtabs++;
+      stab = (Elf_Sym*) (ehdrC + shdr[i].sh_offset);
+      nent = shdr[i].sh_size / sizeof(Elf_Sym);
+      IF_DEBUG(linker,debugBelch( " number of entries is apparently %d (%ld rem)\n",
+               nent,
+               (long)shdr[i].sh_size % sizeof(Elf_Sym)
+             ));
+      if (0 != shdr[i].sh_size % sizeof(Elf_Sym)) {
+         errorBelch("%s: non-integral number of symbol table entries", oc->fileName);
+         return 0;
+      }
+      for (j = 0; j < nent; j++) {
+         IF_DEBUG(linker,debugBelch(" %2d ", j ));
+         IF_DEBUG(linker,debugBelch(" sec=%-5d size=%-3d val=%5p ",
+                             (int)stab[j].st_shndx,
+                             (int)stab[j].st_size,
+                             (char*)stab[j].st_value ));
+
+         IF_DEBUG(linker,debugBelch("type=" ));
+         switch (ELF_ST_TYPE(stab[j].st_info)) {
+            case STT_NOTYPE: IF_DEBUG(linker,debugBelch("notype " )); break;
+            case STT_OBJECT: IF_DEBUG(linker,debugBelch("object " )); break;
+            case STT_FUNC : IF_DEBUG(linker,debugBelch("func " )); break;
+            case STT_SECTION: IF_DEBUG(linker,debugBelch("section" )); break;
+            case STT_FILE: IF_DEBUG(linker,debugBelch("file " )); break;
+            default: IF_DEBUG(linker,debugBelch("? " )); break;
+         }
+         IF_DEBUG(linker,debugBelch(" " ));
+
+         IF_DEBUG(linker,debugBelch("bind=" ));
+         switch (ELF_ST_BIND(stab[j].st_info)) {
+            case STB_LOCAL : IF_DEBUG(linker,debugBelch("local " )); break;
+            case STB_GLOBAL: IF_DEBUG(linker,debugBelch("global" )); break;
+            case STB_WEAK : IF_DEBUG(linker,debugBelch("weak " )); break;
+            default: IF_DEBUG(linker,debugBelch("? " )); break;
+         }
+         IF_DEBUG(linker,debugBelch(" " ));
+
+         IF_DEBUG(linker,debugBelch("name=%s\n", strtab + stab[j].st_name ));
+      }
+   }
+
+   if (nsymtabs == 0) {
+      errorBelch("%s: didn't find any symbol tables", oc->fileName);
+      return 0;
+   }
+
+   return 1;
+}
+
+/* Classify an ELF section header into a SectionKind, following
+   Figure 1.14 ("Special Sections") of the ELF spec: allocated
+   PROGBITS is code/rodata (executable or read-only) or rwdata
+   (writable); allocated, writable NOBITS is .bss-style, reported
+   via *is_bss.  Everything else is SECTIONKIND_OTHER. */
+static int getSectionKind_ELF( Elf_Shdr *hdr, int *is_bss )
+{
+    int alloc    = (hdr->sh_flags & SHF_ALLOC)     != 0;
+    int writable = (hdr->sh_flags & SHF_WRITE)     != 0;
+    int execable = (hdr->sh_flags & SHF_EXECINSTR) != 0;
+
+    *is_bss = FALSE;
+
+    if (hdr->sh_type == SHT_PROGBITS && alloc) {
+        if (execable)
+            return SECTIONKIND_CODE_OR_RODATA;   /* .text-style  */
+        if (writable)
+            return SECTIONKIND_RWDATA;           /* .data-style  */
+        return SECTIONKIND_CODE_OR_RODATA;       /* .rodata-style */
+    }
+
+    if (hdr->sh_type == SHT_NOBITS && alloc && writable) {
+        *is_bss = TRUE;                          /* .bss-style   */
+        return SECTIONKIND_RWDATA;
+    }
+
+    return SECTIONKIND_OTHER;
+}
+
+
+/* Harvest symbols from an ELF object.  Three jobs per section:
+     1. allocate zeroed storage for non-empty .bss sections, recording the
+        allocation by aliasing sh_offset so that ehdrC + sh_offset points
+        at the zeroed space;
+     2. register every loadable section as a Section and a proddable block;
+     3. for each SHT_SYMTAB, copy symbol names into oc->symbols and insert
+        global (non-local) definitions into the global symbol hash table.
+   Returns 1 on success, 0 on error. */
+static int
+ocGetNames_ELF ( ObjectCode* oc )
+{
+   int i, j, k, nent;
+   Elf_Sym* stab;
+
+   char* ehdrC = (char*)(oc->image);
+   Elf_Ehdr* ehdr = (Elf_Ehdr*)ehdrC;
+   char* strtab = findElfSection ( ehdrC, SHT_STRTAB );
+   Elf_Shdr* shdr = (Elf_Shdr*) (ehdrC + ehdr->e_shoff);
+
+   ASSERT(symhash != NULL);
+
+   if (!strtab) {
+      errorBelch("%s: no strtab", oc->fileName);
+      return 0;
+   }
+
+   /* NOTE(review): k is initialised here but never used below — looks
+      like a leftover; confirm and remove. */
+   k = 0;
+   for (i = 0; i < ehdr->e_shnum; i++) {
+      /* Figure out what kind of section it is.  Logic derived from
+         Figure 1.14 ("Special Sections") of the ELF document
+         ("Portable Formats Specification, Version 1.1"). */
+      int is_bss = FALSE;
+      SectionKind kind = getSectionKind_ELF(&shdr[i], &is_bss);
+
+      if (is_bss && shdr[i].sh_size > 0) {
+         /* This is a non-empty .bss section.  Allocate zeroed space for
+            it, and set its .sh_offset field such that
+            ehdrC + .sh_offset == addr_of_zeroed_space.  */
+         char* zspace = stgCallocBytes(1, shdr[i].sh_size,
+                                       "ocGetNames_ELF(BSS)");
+         shdr[i].sh_offset = ((char*)zspace) - ((char*)ehdrC);
+	 /*
+         debugBelch("BSS section at 0x%x, size %d\n",
+                         zspace, shdr[i].sh_size);
+	 */
+      }
+
+      /* fill in the section info */
+      if (kind != SECTIONKIND_OTHER && shdr[i].sh_size > 0) {
+         addProddableBlock(oc, ehdrC + shdr[i].sh_offset, shdr[i].sh_size);
+         addSection(oc, kind, ehdrC + shdr[i].sh_offset,
+                        ehdrC + shdr[i].sh_offset + shdr[i].sh_size - 1);
+      }
+
+      if (shdr[i].sh_type != SHT_SYMTAB) continue;
+
+      /* copy stuff into this module's object symbol table */
+      stab = (Elf_Sym*) (ehdrC + shdr[i].sh_offset);
+      nent = shdr[i].sh_size / sizeof(Elf_Sym);
+
+      /* NOTE(review): if an object has more than one SHT_SYMTAB these
+         fields are overwritten and the previous oc->symbols array leaks;
+         in practice objects have a single symtab — confirm. */
+      oc->n_symbols = nent;
+      oc->symbols = stgMallocBytes(oc->n_symbols * sizeof(char*),
+                                   "ocGetNames_ELF(oc->symbols)");
+
+      for (j = 0; j < nent; j++) {
+
+         char  isLocal = FALSE; /* avoids uninit-var warning */
+         char* ad      = NULL;
+         char* nm      = strtab + stab[j].st_name;
+         int   secno   = stab[j].st_shndx;
+
+	 /* Figure out if we want to add it; if so, set ad to its
+            address.  Otherwise leave ad == NULL. */
+
+         if (secno == SHN_COMMON) {
+            /* COMMON symbol: allocate zeroed storage here and now. */
+            isLocal = FALSE;
+            ad = stgCallocBytes(1, stab[j].st_size, "ocGetNames_ELF(COMMON)");
+	    /*
+            debugBelch("COMMON symbol, size %d name %s\n",
+                            stab[j].st_size, nm);
+	    */
+	    /* Pointless to do addProddableBlock() for this area,
+               since the linker should never poke around in it. */
+	 }
+         else
+         if ( ( ELF_ST_BIND(stab[j].st_info)==STB_GLOBAL
+                || ELF_ST_BIND(stab[j].st_info)==STB_LOCAL
+              )
+              /* and not an undefined symbol */
+              && stab[j].st_shndx != SHN_UNDEF
+	      /* and not in a "special section" */
+              && stab[j].st_shndx < SHN_LORESERVE
+              &&
+	      /* and it's a not a section or string table or anything silly */
+              ( ELF_ST_TYPE(stab[j].st_info)==STT_FUNC ||
+                ELF_ST_TYPE(stab[j].st_info)==STT_OBJECT ||
+                ELF_ST_TYPE(stab[j].st_info)==STT_NOTYPE
+              )
+            ) {
+	    /* Section 0 is the undefined section, hence > and not >=. */
+            ASSERT(secno > 0 && secno < ehdr->e_shnum);
+	    /*
+            if (shdr[secno].sh_type == SHT_NOBITS) {
+               debugBelch("   BSS symbol, size %d off %d name %s\n",
+                               stab[j].st_size, stab[j].st_value, nm);
+            }
+            */
+            /* sh_offset of a BSS section was redirected above, so this
+               also yields the right address for BSS-resident symbols. */
+            ad = ehdrC + shdr[ secno ].sh_offset + stab[j].st_value;
+            if (ELF_ST_BIND(stab[j].st_info)==STB_LOCAL) {
+               isLocal = TRUE;
+            } else {
+#ifdef ELF_FUNCTION_DESC
+               /* dlsym() and the initialisation table both give us function
+                * descriptors, so to be consistent we store function descriptors
+                * in the symbol table */
+               if (ELF_ST_TYPE(stab[j].st_info) == STT_FUNC)
+                   ad = (char *)allocateFunctionDesc((Elf_Addr)ad);
+#endif
+               IF_DEBUG(linker,debugBelch( "addOTabName(GLOB): %10p  %s %s",
+                                      ad, oc->fileName, nm ));
+               isLocal = FALSE;
+            }
+         }
+
+         /* And the decision is ... */
+
+         if (ad != NULL) {
+            ASSERT(nm != NULL);
+	    oc->symbols[j] = nm;
+            /* Acquire! */
+            if (isLocal) {
+               /* Ignore entirely. */
+            } else {
+               ghciInsertStrHashTable(oc->fileName, symhash, nm, ad);
+            }
+         } else {
+            /* Skip. */
+            IF_DEBUG(linker,debugBelch( "skipping `%s'\n",
+                                   strtab + stab[j].st_name ));
+            /*
+            debugBelch(
+                    "skipping   bind = %d,  type = %d,  shndx = %d   `%s'\n",
+                    (int)ELF_ST_BIND(stab[j].st_info),
+                    (int)ELF_ST_TYPE(stab[j].st_info),
+                    (int)stab[j].st_shndx,
+                    strtab + stab[j].st_name
+                   );
+            */
+            oc->symbols[j] = NULL;
+         }
+
+      }
+   }
+
+   return 1;
+}
+
+/* Do ELF relocations which lack an explicit addend. All x86-linux
+ relocations appear to be of this form. */
+/* Do ELF relocations which lack an explicit addend.  All x86-linux
+   relocations appear to be of this form.
+
+   shnum indexes the SHT_REL section itself; the section to patch and the
+   symtab to consult are taken from its sh_info/sh_link fields.
+   NOTE(review): the incoming `stab' argument is immediately overwritten
+   from sh_link, so that parameter is effectively unused — confirm and
+   consider dropping it.  Returns 1 on success, 0 on error. */
+static int
+do_Elf_Rel_relocations ( ObjectCode* oc, char* ehdrC,
+                         Elf_Shdr* shdr, int shnum,
+                         Elf_Sym*  stab, char* strtab )
+{
+   int j;
+   char *symbol;
+   Elf_Word* targ;
+   Elf_Rel*  rtab = (Elf_Rel*) (ehdrC + shdr[shnum].sh_offset);
+   int         nent = shdr[shnum].sh_size / sizeof(Elf_Rel);
+   int target_shndx = shdr[shnum].sh_info;
+   int symtab_shndx = shdr[shnum].sh_link;
+
+   stab  = (Elf_Sym*) (ehdrC + shdr[ symtab_shndx ].sh_offset);
+   targ  = (Elf_Word*)(ehdrC + shdr[ target_shndx ].sh_offset);
+   IF_DEBUG(linker,debugBelch( "relocations for section %d using symtab %d\n",
+                          target_shndx, symtab_shndx ));
+
+   /* Skip sections that we're not interested in. */
+   {
+       int is_bss;
+       SectionKind kind = getSectionKind_ELF(&shdr[target_shndx], &is_bss);
+       if (kind == SECTIONKIND_OTHER) {
+	   IF_DEBUG(linker,debugBelch( "skipping (target section not loaded)"));
+	   return 1;
+       }
+   }
+
+   for (j = 0; j < nent; j++) {
+      Elf_Addr offset = rtab[j].r_offset;
+      Elf_Addr info   = rtab[j].r_info;
+
+      /* P = place being patched; A = implicit addend read from it. */
+      Elf_Addr  P  = ((Elf_Addr)targ) + offset;
+      Elf_Word* pP = (Elf_Word*)P;
+      Elf_Addr  A  = *pP;
+      Elf_Addr  S;
+      void*     S_tmp;
+      Elf_Addr  value;
+
+      IF_DEBUG(linker,debugBelch( "Rel entry %3d is raw(%6p %6p)",
+                             j, (void*)offset, (void*)info ));
+      if (!info) {
+         IF_DEBUG(linker,debugBelch( " ZERO" ));
+         S = 0;
+      } else {
+         Elf_Sym sym = stab[ELF_R_SYM(info)];
+	 /* First see if it is a local symbol. */
+         if (ELF_ST_BIND(sym.st_info) == STB_LOCAL) {
+            /* Yes, so we can get the address directly from the ELF symbol
+               table. */
+            symbol = sym.st_name==0 ? "(noname)" : strtab+sym.st_name;
+            S = (Elf_Addr)
+                (ehdrC + shdr[ sym.st_shndx ].sh_offset
+                       + stab[ELF_R_SYM(info)].st_value);
+
+	 } else {
+            /* No, so look up the name in our global table. */
+            symbol = strtab + sym.st_name;
+            S_tmp = lookupSymbol( symbol );
+            S = (Elf_Addr)S_tmp;
+	 }
+         if (!S) {
+            errorBelch("%s: unknown symbol `%s'", oc->fileName, symbol);
+	    return 0;
+         }
+         IF_DEBUG(linker,debugBelch( "`%s' resolves to %p\n", symbol, (void*)S ));
+      }
+
+      IF_DEBUG(linker,debugBelch( "Reloc: P = %p   S = %p   A = %p\n",
+			     (void*)P, (void*)S, (void*)A ));
+      checkProddableBlock ( oc, pP );
+
+      value = S + A;
+
+      switch (ELF_R_TYPE(info)) {
+#        ifdef i386_HOST_ARCH
+         case R_386_32:   *pP = value;     break;  /* absolute 32-bit */
+         case R_386_PC32: *pP = value - P; break;  /* PC-relative 32-bit */
+#        endif
+         default:
+            errorBelch("%s: unhandled ELF relocation(Rel) type %lu\n",
+		  oc->fileName, (lnat)ELF_R_TYPE(info));
+            return 0;
+      }
+
+   }
+   return 1;
+}
+
+/* Do ELF relocations for which explicit addends are supplied.
+ sparc-solaris relocations appear to be of this form. */
+/* Do ELF relocations for which explicit addends are supplied.
+   sparc-solaris relocations appear to be of this form.
+
+   As in do_Elf_Rel_relocations, shnum indexes the SHT_RELA section; the
+   target section and symtab come from sh_info/sh_link, and the incoming
+   `stab' argument is overwritten (NOTE(review): effectively unused).
+   Returns 1 on success, 0 on error. */
+static int
+do_Elf_Rela_relocations ( ObjectCode* oc, char* ehdrC,
+                          Elf_Shdr* shdr, int shnum,
+                          Elf_Sym*  stab, char* strtab )
+{
+   int j;
+   char *symbol = NULL;
+   Elf_Addr targ;
+   Elf_Rela* rtab = (Elf_Rela*) (ehdrC + shdr[shnum].sh_offset);
+   int         nent = shdr[shnum].sh_size / sizeof(Elf_Rela);
+   int target_shndx = shdr[shnum].sh_info;
+   int symtab_shndx = shdr[shnum].sh_link;
+
+   stab  = (Elf_Sym*) (ehdrC + shdr[ symtab_shndx ].sh_offset);
+   targ  = (Elf_Addr) (ehdrC + shdr[ target_shndx ].sh_offset);
+   IF_DEBUG(linker,debugBelch( "relocations for section %d using symtab %d\n",
+                          target_shndx, symtab_shndx ));
+
+   for (j = 0; j < nent; j++) {
+#if defined(DEBUG) || defined(sparc_HOST_ARCH) || defined(ia64_HOST_ARCH) || defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH)
+      /* This #ifdef only serves to avoid unused-var warnings. */
+      Elf_Addr  offset = rtab[j].r_offset;
+      Elf_Addr  P      = targ + offset;
+#endif
+      Elf_Addr  info   = rtab[j].r_info;
+      Elf_Addr  A      = rtab[j].r_addend;
+      Elf_Addr  S;
+      void*     S_tmp;
+      Elf_Addr  value;
+#     if defined(sparc_HOST_ARCH)
+      Elf_Word* pP = (Elf_Word*)P;
+      Elf_Word  w1, w2;
+#     elif defined(ia64_HOST_ARCH)
+      Elf64_Xword *pP = (Elf64_Xword *)P;
+      Elf_Addr addr;
+#     elif defined(powerpc_HOST_ARCH)
+      Elf_Sword delta;
+#     endif
+
+      IF_DEBUG(linker,debugBelch( "Rel entry %3d is raw(%6p %6p %6p)   ",
+                             j, (void*)offset, (void*)info,
+                                (void*)A ));
+      if (!info) {
+         IF_DEBUG(linker,debugBelch( " ZERO" ));
+         S = 0;
+      } else {
+         Elf_Sym sym = stab[ELF_R_SYM(info)];
+	 /* First see if it is a local symbol. */
+         if (ELF_ST_BIND(sym.st_info) == STB_LOCAL) {
+            /* Yes, so we can get the address directly from the ELF symbol
+               table. */
+            symbol = sym.st_name==0 ? "(noname)" : strtab+sym.st_name;
+            S = (Elf_Addr)
+                (ehdrC + shdr[ sym.st_shndx ].sh_offset
+                       + stab[ELF_R_SYM(info)].st_value);
+#ifdef ELF_FUNCTION_DESC
+	    /* Make a function descriptor for this function */
+            if (S && ELF_ST_TYPE(sym.st_info) == STT_FUNC) {
+               S = allocateFunctionDesc(S + A);
+       	       A = 0;
+            }
+#endif
+	 } else {
+            /* No, so look up the name in our global table. */
+            symbol = strtab + sym.st_name;
+            S_tmp = lookupSymbol( symbol );
+            S = (Elf_Addr)S_tmp;
+
+#ifdef ELF_FUNCTION_DESC
+	    /* If a function, already a function descriptor - we would
+	       have to copy it to add an offset. */
+            if (S && (ELF_ST_TYPE(sym.st_info) == STT_FUNC) && (A != 0))
+               errorBelch("%s: function %s with addend %p", oc->fileName, symbol, (void *)A);
+#endif
+	 }
+         if (!S) {
+	   errorBelch("%s: unknown symbol `%s'", oc->fileName, symbol);
+	   return 0;
+         }
+         IF_DEBUG(linker,debugBelch( "`%s' resolves to %p", symbol, (void*)S ));
+      }
+
+      IF_DEBUG(linker,debugBelch("Reloc: P = %p   S = %p   A = %p\n",
+                                        (void*)P, (void*)S, (void*)A ));
+      /* checkProddableBlock ( oc, (void*)P ); */
+
+      value = S + A;
+
+      switch (ELF_R_TYPE(info)) {
+#        if defined(sparc_HOST_ARCH)
+         /* SPARC encodes immediates inside instruction words; each case
+            masks out the immediate field and ORs in the new value. */
+         case R_SPARC_WDISP30:
+            w1 = *pP & 0xC0000000;
+            w2 = (Elf_Word)((value - P) >> 2);
+            ASSERT((w2 & 0xC0000000) == 0);
+            w1 |= w2;
+            *pP = w1;
+            break;
+         case R_SPARC_HI22:
+            w1 = *pP & 0xFFC00000;
+            w2 = (Elf_Word)(value >> 10);
+            ASSERT((w2 & 0xFFC00000) == 0);
+            w1 |= w2;
+            *pP = w1;
+            break;
+         case R_SPARC_LO10:
+            w1 = *pP & ~0x3FF;
+            w2 = (Elf_Word)(value & 0x3FF);
+            ASSERT((w2 & ~0x3FF) == 0);
+            w1 |= w2;
+            *pP = w1;
+            break;
+         /* According to the Sun documentation:
+            R_SPARC_UA32
+            This relocation type resembles R_SPARC_32, except it refers to an
+            unaligned word. That is, the word to be relocated must be treated
+            as four separate bytes with arbitrary alignment, not as a word
+            aligned according to the architecture requirements.
+
+            (JRS: which means that freeloading on the R_SPARC_32 case
+            is probably wrong, but hey ...)
+         */
+         case R_SPARC_UA32:
+         case R_SPARC_32:
+            w2 = (Elf_Word)value;
+            *pP = w2;
+            break;
+#        elif defined(ia64_HOST_ARCH)
+	 case R_IA64_DIR64LSB:
+	 case R_IA64_FPTR64LSB:
+	    *pP = value;
+	    break;
+	 case R_IA64_PCREL64LSB:
+	    *pP = value - P;
+	    break;
+	 case R_IA64_SEGREL64LSB:
+	    addr = findElfSegment(ehdrC, value);
+	    *pP = value - addr;
+	    break;
+	 case R_IA64_GPREL22:
+	    ia64_reloc_gprel22(P, value);
+	    break;
+	 case R_IA64_LTOFF22:
+	 case R_IA64_LTOFF22X:
+	 case R_IA64_LTOFF_FPTR22:
+	    addr = allocateGOTEntry(value);
+	    ia64_reloc_gprel22(P, addr);
+	    break;
+	 case R_IA64_PCREL21B:
+	    ia64_reloc_pcrel21(P, S, oc);
+	    break;
+	 case R_IA64_LDXMOV:
+	    /* This goes with R_IA64_LTOFF22X and points to the load to
+	     * convert into a move.  We don't implement relaxation. */
+	    break;
+#        elif defined(powerpc_HOST_ARCH)
+         case R_PPC_ADDR16_LO:
+            *(Elf32_Half*) P = value;
+            break;
+
+         case R_PPC_ADDR16_HI:
+            *(Elf32_Half*) P = value >> 16;
+            break;
+ 
+         case R_PPC_ADDR16_HA:
+            /* +0x8000 rounds so that the low half's sign extension is
+               compensated, per the PowerPC ELF ABI @ha semantics. */
+            *(Elf32_Half*) P = (value + 0x8000) >> 16;
+            break;
+
+         case R_PPC_ADDR32:
+            *(Elf32_Word *) P = value;
+            break;
+
+         case R_PPC_REL32:
+            *(Elf32_Word *) P = value - P;
+            break;
+
+         case R_PPC_REL24:
+            delta = value - P;
+
+            /* delta<<6>>6 sign-extends from 26 bits: tests whether the
+               branch displacement fits in the 24-bit (word-aligned)
+               field of the b/bl instruction. */
+            if( delta << 6 >> 6 != delta )
+            {
+               value = makeJumpIsland( oc, ELF_R_SYM(info), value );
+               delta = value - P;
+
+               if( value == 0 || delta << 6 >> 6 != delta )
+               {
+                  barf( "Unable to make ppcJumpIsland for #%d",
+                        ELF_R_SYM(info) );
+                  return 0;
+               }
+            }
+
+            *(Elf_Word *) P = (*(Elf_Word *) P & 0xfc000003)
+                                          | (delta & 0x3fffffc);
+            break;
+#        endif
+
+#if x86_64_HOST_ARCH
+      case R_X86_64_64:
+	  *(Elf64_Xword *)P = value;
+	  break;
+
+      case R_X86_64_PC32:
+      {
+	  StgInt64 off = value - P;
+	  /* NOTE(review): `>= 0x7fffffffL' rejects the (representable)
+	     value 0x7fffffff — conservative; and barf's %p is paired with
+	     a StgInt64, a format/argument mismatch — confirm/fix upstream. */
+	  if (off >= 0x7fffffffL || off < -0x80000000L) {
+	      barf("R_X86_64_PC32 relocation out of range: %s = %p",
+		   symbol, off);
+	  }
+	  *(Elf64_Word *)P = (Elf64_Word)off;
+	  break;
+      }
+
+      case R_X86_64_32:
+	  if (value >= 0x7fffffffL) {
+	      barf("R_X86_64_32 relocation out of range: %s = %p\n",
+		   symbol, value);
+	  }
+	  *(Elf64_Word *)P = (Elf64_Word)value;
+	  break;
+
+      case R_X86_64_32S:
+	  if ((StgInt64)value > 0x7fffffffL || (StgInt64)value < -0x80000000L) {
+	      barf("R_X86_64_32S relocation out of range: %s = %p\n",
+		   symbol, value);
+	  }
+	  *(Elf64_Sword *)P = (Elf64_Sword)value;
+	  break;
+#endif
+
+         default:
+            errorBelch("%s: unhandled ELF relocation(RelA) type %lu\n",
+		  oc->fileName, (lnat)ELF_R_TYPE(info));
+            return 0;
+      }
+
+   }
+   return 1;
+}
+
+/* Resolve all relocations in an ELF object: locate "the" symbol table
+   and string table (first of each kind found — the per-section routines
+   re-derive the correct symtab from sh_link anyway), then process every
+   SHT_REL and SHT_RELA section.  On success the local symbol table is
+   freed and, on PowerPC, the instruction cache is flushed.  Returns 1
+   on success, 0 on error. */
+static int
+ocResolve_ELF ( ObjectCode* oc )
+{
+   char *strtab;
+   int   shnum, ok;
+   Elf_Sym*  stab  = NULL;
+   char*     ehdrC = (char*)(oc->image);
+   Elf_Ehdr* ehdr  = (Elf_Ehdr*) ehdrC;
+   Elf_Shdr* shdr  = (Elf_Shdr*) (ehdrC + ehdr->e_shoff);
+
+   /* first find "the" symbol table */
+   stab = (Elf_Sym*) findElfSection ( ehdrC, SHT_SYMTAB );
+
+   /* also go find the string table */
+   strtab = findElfSection ( ehdrC, SHT_STRTAB );
+
+   if (stab == NULL || strtab == NULL) {
+      errorBelch("%s: can't find string or symbol table", oc->fileName);
+      return 0;
+   }
+
+   /* Process the relocation sections. */
+   for (shnum = 0; shnum < ehdr->e_shnum; shnum++) {
+      if (shdr[shnum].sh_type == SHT_REL) {
+         ok = do_Elf_Rel_relocations ( oc, ehdrC, shdr,
+                                       shnum, stab, strtab );
+         if (!ok) return ok;
+      }
+      else
+      if (shdr[shnum].sh_type == SHT_RELA) {
+         ok = do_Elf_Rela_relocations ( oc, ehdrC, shdr,
+                                        shnum, stab, strtab );
+         if (!ok) return ok;
+      }
+   }
+
+   /* Free the local symbol table; we won't need it again. */
+   freeHashTable(oc->lochash, NULL);
+   oc->lochash = NULL;
+
+#if defined(powerpc_HOST_ARCH)
+   ocFlushInstructionCache( oc );
+#endif
+
+   return 1;
+}
+
+/*
+ * IA64 specifics
+ * Instructions are 41 bits long, packed into 128 bit bundles with a 5-bit template
+ * at the front. The following utility functions pack and unpack instructions, and
+ * take care of the most common relocations.
+ */
+
+#ifdef ia64_HOST_ARCH
+
+/* Extract the 41-bit instruction at `target' from its 128-bit bundle.
+   The low 2 bits of the pointer select the slot (0..2); the pointer is
+   then rounded down to the bundle.  NOTE(review): the assignment
+   `target = (Elf_Addr)target & ~3' stores an integer into a pointer
+   without a cast — relies on an implicit conversion; confirm intended. */
+static Elf64_Xword
+ia64_extract_instruction(Elf64_Xword *target)
+{
+   Elf64_Xword w1, w2;
+   int slot = (Elf_Addr)target & 3;
+   target = (Elf_Addr)target & ~3;
+
+   w1 = *target;
+   w2 = *(target+1);
+
+   switch (slot)
+   {
+      case 0:
+         /* slot 0 occupies bits 5..45 of the first word */
+         return ((w1 >> 5) & 0x1ffffffffff);
+      case 1:
+         /* slot 1 straddles the two words */
+         return (w1 >> 46) | ((w2 & 0x7fffff) << 18);
+      case 2:
+         return (w2 >> 23);
+      default:
+         barf("ia64_extract_instruction: invalid slot %p", target);
+   }
+}
+
+/* Write a 41-bit instruction value back into its slot within the 128-bit
+   bundle at `target' (slot encoded in the pointer's low 2 bits, as in
+   ia64_extract_instruction).  The bits are OR-ed in: callers are expected
+   to have constructed `value' from the extracted instruction. */
+static void
+ia64_deposit_instruction(Elf64_Xword *target, Elf64_Xword value)
+{
+   int slot = (Elf_Addr)target & 3;
+   target = (Elf_Addr)target & ~3;
+
+   switch (slot)
+   {
+      case 0:
+         *target |= value << 5;
+         break;
+      case 1:
+         /* slot 1 straddles both 64-bit words of the bundle */
+         *target |= value << 46;
+         *(target+1) |= value >> 18;
+         break;
+      case 2:
+         *(target+1) |= value << 23;
+         break;
+   }
+}
+
+/* Apply a GPREL22 relocation: patch the add-immediate instruction at
+   `target' with the 22-bit signed offset of `value' from the global
+   pointer (gp_val), splitting it into the imm7b/imm9d/imm5c/s fields. */
+static void
+ia64_reloc_gprel22(Elf_Addr target, Elf_Addr value)
+{
+   Elf64_Xword instruction;
+   Elf64_Sxword rel_value;
+
+   rel_value = value - gp_val;
+   if ((rel_value > 0x1fffff) || (rel_value < -0x1fffff))
+      barf("GP-relative data out of range (address = 0x%lx, gp = 0x%lx)", value, gp_val);
+
+   instruction = ia64_extract_instruction((Elf64_Xword *)target);
+   instruction |= (((rel_value >> 0) & 0x07f) << 13)		/* imm7b */
+		    | (((rel_value >> 7) & 0x1ff) << 27)	/* imm9d */
+		    | (((rel_value >> 16) & 0x01f) << 22)	/* imm5c */
+		    | ((Elf64_Xword)(rel_value < 0) << 36);	/* s */
+   ia64_deposit_instruction((Elf64_Xword *)target, instruction);
+}
+
+/* Apply a PCREL21B relocation: route the branch through a freshly
+   allocated PLT entry for `value', then patch the branch instruction at
+   `target' with the bundle-granular (>>4) displacement to that entry. */
+static void
+ia64_reloc_pcrel21(Elf_Addr target, Elf_Addr value, ObjectCode *oc)
+{
+   Elf64_Xword instruction;
+   Elf64_Sxword rel_value;
+   Elf_Addr entry;
+
+   entry = allocatePLTEntry(value, oc);
+
+   rel_value = (entry >> 4) - (target >> 4);
+   if ((rel_value > 0xfffff) || (rel_value < -0xfffff))
+      barf("PLT entry too far away (entry = 0x%lx, target = 0x%lx)", entry, target);
+
+   instruction = ia64_extract_instruction((Elf64_Xword *)target);
+   instruction |= ((rel_value & 0xfffff) << 13) 		/* imm20b */
+		    | ((Elf64_Xword)(rel_value < 0) << 36);	/* s */
+   ia64_deposit_instruction((Elf64_Xword *)target, instruction);
+}
+
+#endif /* ia64 */
+
+/*
+ * PowerPC ELF specifics
+ */
+
+#ifdef powerpc_HOST_ARCH
+
+/* PowerPC/ELF: reserve one jump island per symbol-table entry so that
+   R_PPC_REL24 branches that overflow their 26-bit displacement can be
+   redirected.  Finds the first SHT_SYMTAB section and sizes the island
+   pool from it.  Returns 0 on malformed input, else the result of
+   ocAllocateJumpIslands. */
+static int ocAllocateJumpIslands_ELF( ObjectCode *oc )
+{
+  Elf_Ehdr *ehdr;
+  Elf_Shdr* shdr;
+  int i;
+
+  ehdr = (Elf_Ehdr *) oc->image;
+  shdr = (Elf_Shdr *) ( ((char *)oc->image) + ehdr->e_shoff );
+
+  for( i = 0; i < ehdr->e_shnum; i++ )
+    if( shdr[i].sh_type == SHT_SYMTAB )
+      break;
+
+  if( i == ehdr->e_shnum )
+  {
+    errorBelch( "This ELF file contains no symtab" );
+    return 0;
+  }
+
+  /* NOTE(review): %d is paired with sh_entsize (an ELF word type) and
+     sizeof (size_t) — format/argument mismatch on 64-bit; confirm. */
+  if( shdr[i].sh_entsize != sizeof( Elf_Sym ) )
+  {
+    errorBelch( "The entry size (%d) of the symtab isn't %d\n",
+      shdr[i].sh_entsize, sizeof( Elf_Sym ) );
+
+    return 0;
+  }
+
+  return ocAllocateJumpIslands( oc, shdr[i].sh_size / sizeof( Elf_Sym ), 0 );
+}
+
+#endif /* powerpc */
+
+#endif /* ELF */
+
+/* --------------------------------------------------------------------------
+ * Mach-O specifics
+ * ------------------------------------------------------------------------*/
+
+#if defined(OBJFORMAT_MACHO)
+
+/*
+ Support for MachO linking on Darwin/MacOS X
+ by Wolfgang Thaller (wolfgang.thaller@gmx.net)
+
+ I hereby formally apologize for the hackish nature of this code.
+ Things that need to be done:
+ *) implement ocVerifyImage_MachO
+ *) add still more sanity checks.
+*/
+
+#ifdef powerpc_HOST_ARCH
+/* PowerPC/Mach-O: size the jump-island pool.  Scans the load commands
+   for LC_SYMTAB, then finds the lowest and highest indices of undefined
+   external symbols so only that range needs islands.  Falls back to an
+   empty pool if there is no symtab or no undefined externals. */
+static int ocAllocateJumpIslands_MachO(ObjectCode* oc)
+{
+    struct mach_header *header = (struct mach_header *) oc->image;
+    struct load_command *lc = (struct load_command *) (header + 1);
+    unsigned i;
+
+    for( i = 0; i < header->ncmds; i++ )
+    {   
+        if( lc->cmd == LC_SYMTAB )
+        {
+                // Find out the first and last undefined external
+                // symbol, so we don't have to allocate too many
+                // jump islands.
+            struct symtab_command *symLC = (struct symtab_command *) lc;
+            unsigned min = symLC->nsyms, max = 0;
+            struct nlist *nlist =
+                symLC ? (struct nlist*) ((char*) oc->image + symLC->symoff)
+                      : NULL;
+                // NOTE(review): this inner loop reuses the outer loop's
+                // `i'; benign only because we `break' out of the outer
+                // loop below — confirm if ever restructuring.
+            for(i=0;i<symLC->nsyms;i++)
+            {
+                if(nlist[i].n_type & N_STAB)
+                    ;   // debugger symbols: skip
+                else if(nlist[i].n_type & N_EXT)
+                {
+                    if((nlist[i].n_type & N_TYPE) == N_UNDF
+                        && (nlist[i].n_value == 0))
+                    {
+                        if(i < min)
+                            min = i;
+                        if(i > max)
+                            max = i;
+                    }
+                }
+            }
+            if(max >= min)
+                return ocAllocateJumpIslands(oc, max - min + 1, min);
+
+            break;
+        }
+        
+        lc = (struct load_command *) ( ((char *)lc) + lc->cmdsize );
+    }
+    return ocAllocateJumpIslands(oc,0,0);
+}
+#endif
+
+/* Sanity-check a Mach-O image before loading.  Currently a stub that
+   accepts everything (always returns 1). */
+static int ocVerifyImage_MachO(ObjectCode* oc STG_UNUSED)
+{
+    // FIXME: do some verifying here
+    return 1;
+}
+
+/* Fill in a Mach-O lazy/non-lazy symbol-pointer section (or, on i386, a
+   __jump_table stub section).  Each slot's target symbol is found via
+   the indirect-symbol table (starting at sect->reserved1); the resolved
+   address is either taken from the nlist entry (common symbols), the
+   object's local table, or the global table.  Returns 1 on success,
+   0 if any symbol is unresolved. */
+static int resolveImports(
+    ObjectCode* oc,
+    char *image,
+    struct symtab_command *symLC,
+    struct section *sect,    // ptr to lazy or non-lazy symbol pointer section
+    unsigned long *indirectSyms,
+    struct nlist *nlist)
+{
+    unsigned i;
+    size_t itemSize = 4;
+
+#if i386_HOST_ARCH
+    int isJumpTable = 0;
+    if(!strcmp(sect->sectname,"__jump_table"))
+    {
+        isJumpTable = 1;
+        itemSize = 5;   // each stub is a 5-byte jmp rel32
+        ASSERT(sect->reserved2 == itemSize);
+    }
+#endif
+
+    for(i=0; i*itemSize < sect->size;i++)
+    {
+	// according to otool, reserved1 contains the first index into the indirect symbol table
+	struct nlist *symbol = &nlist[indirectSyms[sect->reserved1+i]];
+	char *nm = image + symLC->stroff + symbol->n_un.n_strx;
+	void *addr = NULL;
+
+	if((symbol->n_type & N_TYPE) == N_UNDF
+	    && (symbol->n_type & N_EXT) && (symbol->n_value != 0))
+	    addr = (void*) (symbol->n_value);  // common symbol: value already patched in
+	else if((addr = lookupLocalSymbol(oc,nm)) != NULL)
+	    ;
+	else
+	    addr = lookupSymbol(nm);
+	if(!addr)
+	{
+	    errorBelch("\n%s: unknown symbol `%s'", oc->fileName, nm);
+	    return 0;
+	}
+	ASSERT(addr);
+
+#if i386_HOST_ARCH
+        if(isJumpTable)
+        {
+            // overwrite the stub with "jmp rel32" to the resolved address
+            checkProddableBlock(oc,image + sect->offset + i*itemSize);
+            *(image + sect->offset + i*itemSize) = 0xe9; // jmp
+            *(unsigned*)(image + sect->offset + i*itemSize + 1)
+                = (char*)addr - (image + sect->offset + i*itemSize + 5);
+        }
+        else
+#endif
+	{
+	    checkProddableBlock(oc,((void**)(image + sect->offset)) + i);
+	    ((void**)(image + sect->offset))[i] = addr;
+	}
+    }
+
+    return 1;
+}
+
+/* Translate a Mach-O file-internal address into its in-memory address:
+   find the section whose [addr, addr+size) range contains `address' and
+   rebase it onto oc->image via the section's file offset.  barfs if the
+   address falls in no section. */
+static unsigned long relocateAddress(
+    ObjectCode* oc,
+    int nSections,
+    struct section* sections,
+    unsigned long address)
+{
+    int i;
+    for(i = 0; i < nSections; i++)
+    {
+        if(sections[i].addr <= address
+            && address < sections[i].addr + sections[i].size)
+        {
+            return (unsigned long)oc->image
+                    + sections[i].offset + address - sections[i].addr;
+        }
+    }
+    barf("Invalid Mach-O file:"
+         "Address out of bounds while relocating object file");
+    return 0;
+}
+
+/* Apply all Mach-O relocations recorded for one section.  Symbol-pointer
+   sections are skipped (handled by resolveImports).  Handles both
+   scattered relocations (with their RELOC_PAIR companions, consumed by
+   bumping i) and ordinary relocation_info entries; PPC-specific types
+   are handled under #ifdef, including rerouting overflowing BR24
+   branches through jump islands.  Returns 1 on success, 0 on an
+   unresolved symbol. */
+static int relocateSection(
+    ObjectCode* oc,
+    char *image,
+    struct symtab_command *symLC, struct nlist *nlist,
+    int nSections, struct section* sections, struct section *sect)
+{
+    struct relocation_info *relocs;
+    int i,n;
+
+    if(!strcmp(sect->sectname,"__la_symbol_ptr"))
+	return 1;
+    else if(!strcmp(sect->sectname,"__nl_symbol_ptr"))
+	return 1;
+    else if(!strcmp(sect->sectname,"__la_sym_ptr2"))
+	return 1;
+    else if(!strcmp(sect->sectname,"__la_sym_ptr3"))
+	return 1;
+
+    n = sect->nreloc;
+    relocs = (struct relocation_info*) (image + sect->reloff);
+
+    for(i=0;i<n;i++)
+    {
+	if(relocs[i].r_address & R_SCATTERED)
+	{
+	    struct scattered_relocation_info *scat =
+		(struct scattered_relocation_info*) &relocs[i];
+
+	    if(!scat->r_pcrel)
+	    {
+		if(scat->r_length == 2)
+		{
+		    unsigned long word = 0;
+		    unsigned long* wordPtr = (unsigned long*) (image + sect->offset + scat->r_address);
+		    checkProddableBlock(oc,wordPtr);
+
+		    // Note on relocation types:
+		    // i386 uses the GENERIC_RELOC_* types,
+		    // while ppc uses special PPC_RELOC_* types.
+		    // *_RELOC_VANILLA and *_RELOC_PAIR have the same value
+		    // in both cases, all others are different.
+		    // Therefore, we use GENERIC_RELOC_VANILLA
+		    // and GENERIC_RELOC_PAIR instead of the PPC variants,
+		    // and use #ifdefs for the other types.
+		    
+		    // Step 1: Figure out what the relocated value should be
+		    if(scat->r_type == GENERIC_RELOC_VANILLA)
+		    {
+                        word = *wordPtr + (unsigned long) relocateAddress(
+                                                                oc,
+                                                                nSections,
+                                                                sections,
+                                                                scat->r_value)
+                                        - scat->r_value;
+		    }
+#ifdef powerpc_HOST_ARCH
+		    else if(scat->r_type == PPC_RELOC_SECTDIFF
+		        || scat->r_type == PPC_RELOC_LO16_SECTDIFF
+		        || scat->r_type == PPC_RELOC_HI16_SECTDIFF
+		        || scat->r_type == PPC_RELOC_HA16_SECTDIFF)
+#else
+		    else if(scat->r_type == GENERIC_RELOC_SECTDIFF)
+#endif
+		    {
+		        struct scattered_relocation_info *pair =
+		                (struct scattered_relocation_info*) &relocs[i+1];
+
+		        if(!pair->r_scattered || pair->r_type != GENERIC_RELOC_PAIR)
+		            barf("Invalid Mach-O file: "
+		                 "RELOC_*_SECTDIFF not followed by RELOC_PAIR");
+
+		        word = (unsigned long)
+		               (relocateAddress(oc, nSections, sections, scat->r_value)
+		              - relocateAddress(oc, nSections, sections, pair->r_value));
+		        i++;  // the PAIR entry has been consumed
+		    }
+#ifdef powerpc_HOST_ARCH
+		    else if(scat->r_type == PPC_RELOC_HI16
+                         || scat->r_type == PPC_RELOC_LO16
+                         || scat->r_type == PPC_RELOC_HA16
+                         || scat->r_type == PPC_RELOC_LO14)
+                    {   // these are generated by label+offset things
+		        struct relocation_info *pair = &relocs[i+1];
+                        if((pair->r_address & R_SCATTERED) || pair->r_type != PPC_RELOC_PAIR)
+		            barf("Invalid Mach-O file: "
+		                 "PPC_RELOC_* not followed by PPC_RELOC_PAIR");
+                        
+                        // reassemble the 32-bit value split between the
+                        // instruction's half-word and the PAIR's r_address
+                        if(scat->r_type == PPC_RELOC_LO16)
+                        {
+                            word = ((unsigned short*) wordPtr)[1];
+                            word |= ((unsigned long) relocs[i+1].r_address & 0xFFFF) << 16;
+                        }
+                        else if(scat->r_type == PPC_RELOC_LO14)
+                        {
+                            barf("Unsupported Relocation: PPC_RELOC_LO14");
+                            word = ((unsigned short*) wordPtr)[1] & 0xFFFC;
+                            word |= ((unsigned long) relocs[i+1].r_address & 0xFFFF) << 16;
+                        }
+                        else if(scat->r_type == PPC_RELOC_HI16)
+                        {
+                            word = ((unsigned short*) wordPtr)[1] << 16;
+                            word |= ((unsigned long) relocs[i+1].r_address & 0xFFFF);
+                        }
+                        else if(scat->r_type == PPC_RELOC_HA16)
+                        {
+                            word = ((unsigned short*) wordPtr)[1] << 16;
+                            word += ((short)relocs[i+1].r_address & (short)0xFFFF);
+                        }
+                       
+                        
+                        word += (unsigned long) relocateAddress(oc, nSections, sections, scat->r_value)
+                                                - scat->r_value;
+                        
+                        i++;  // the PAIR entry has been consumed
+                    }
+ #endif
+                    else
+		        continue;  // FIXME: ignore the other scattered types
+
+                    // Step 2: Write the relocated value back in the right
+                    // width/field for the relocation type.
+#ifdef powerpc_HOST_ARCH
+                    if(scat->r_type == GENERIC_RELOC_VANILLA
+                        || scat->r_type == PPC_RELOC_SECTDIFF)
+#else
+                    if(scat->r_type == GENERIC_RELOC_VANILLA
+                        || scat->r_type == GENERIC_RELOC_SECTDIFF)
+#endif
+                    {
+                        *wordPtr = word;
+                    }
+#ifdef powerpc_HOST_ARCH
+                    else if(scat->r_type == PPC_RELOC_LO16_SECTDIFF || scat->r_type == PPC_RELOC_LO16)
+                    {
+                        ((unsigned short*) wordPtr)[1] = word & 0xFFFF;
+                    }
+                    else if(scat->r_type == PPC_RELOC_HI16_SECTDIFF || scat->r_type == PPC_RELOC_HI16)
+                    {
+                        ((unsigned short*) wordPtr)[1] = (word >> 16) & 0xFFFF;
+                    }
+                    else if(scat->r_type == PPC_RELOC_HA16_SECTDIFF || scat->r_type == PPC_RELOC_HA16)
+                    {
+                        // @ha: round up if the low half will sign-extend negative
+                        ((unsigned short*) wordPtr)[1] = ((word >> 16) & 0xFFFF)
+                            + ((word & (1<<15)) ? 1 : 0);
+                    }
+#endif
+		}
+	    }
+	    
+	    continue; // FIXME: I hope it's OK to ignore all the others.
+	}
+	else
+	{
+	    struct relocation_info *reloc = &relocs[i];
+	    if(reloc->r_pcrel && !reloc->r_extern)
+		continue;
+
+	    if(reloc->r_length == 2)
+	    {
+		unsigned long word = 0;
+#ifdef powerpc_HOST_ARCH
+                unsigned long jumpIsland = 0;
+                long offsetToJumpIsland = 0xBADBAD42; // initialise to bad value
+                                                      // to avoid warning and to catch
+                                                      // bugs.
+#endif
+
+		unsigned long* wordPtr = (unsigned long*) (image + sect->offset + reloc->r_address);
+		checkProddableBlock(oc,wordPtr);
+
+		// Step 1: extract the current (addend) value from the
+		// instruction/word being patched.
+		if(reloc->r_type == GENERIC_RELOC_VANILLA)
+		{
+		    word = *wordPtr;
+		}
+#ifdef powerpc_HOST_ARCH
+		else if(reloc->r_type == PPC_RELOC_LO16)
+		{
+		    // NOTE(review): relocs[i+1] is read here without checking
+		    // it is a PPC_RELOC_PAIR (unlike the scattered path) —
+		    // confirm the toolchain always emits the pair.
+		    word = ((unsigned short*) wordPtr)[1];
+		    word |= ((unsigned long) relocs[i+1].r_address & 0xFFFF) << 16;
+		}
+		else if(reloc->r_type == PPC_RELOC_HI16)
+		{
+		    word = ((unsigned short*) wordPtr)[1] << 16;
+		    word |= ((unsigned long) relocs[i+1].r_address & 0xFFFF);
+		}
+		else if(reloc->r_type == PPC_RELOC_HA16)
+		{
+		    word = ((unsigned short*) wordPtr)[1] << 16;
+		    word += ((short)relocs[i+1].r_address & (short)0xFFFF);
+		}
+		else if(reloc->r_type == PPC_RELOC_BR24)
+		{
+		    word = *wordPtr;
+		    // mask out the 26-bit displacement and sign-extend it
+		    word = (word & 0x03FFFFFC) | ((word & 0x02000000) ? 0xFC000000 : 0);
+		}
+#endif
+
+		if(!reloc->r_extern)
+		{
+		    // section-relative: rebase by the section's load delta
+		    long delta =
+			sections[reloc->r_symbolnum-1].offset
+			- sections[reloc->r_symbolnum-1].addr
+			+ ((long) image);
+
+		    word += delta;
+		}
+		else
+		{
+		    struct nlist *symbol = &nlist[reloc->r_symbolnum];
+		    char *nm = image + symLC->stroff + symbol->n_un.n_strx;
+		    void *symbolAddress = lookupSymbol(nm);
+		    if(!symbolAddress)
+		    {
+			errorBelch("\nunknown symbol `%s'", nm);
+			return 0;
+		    }
+
+		    if(reloc->r_pcrel)
+		    {
+#ifdef powerpc_HOST_ARCH
+            // In the .o file, this should be a relative jump to NULL
+            // and we'll change it to a relative jump to the symbol
+                        ASSERT(-word == reloc->r_address);
+                        jumpIsland = makeJumpIsland(oc,reloc->r_symbolnum,(unsigned long) symbolAddress);
+                        if(jumpIsland != 0)
+                        {
+                            offsetToJumpIsland = word + jumpIsland
+                                - (((long)image) + sect->offset - sect->addr);
+                        }
+#endif
+                        word += (unsigned long) symbolAddress
+                                - (((long)image) + sect->offset - sect->addr);
+                    }
+                    else
+                    {
+                        word += (unsigned long) symbolAddress;
+                    }
+		}
+
+		// Step 2: write the patched value back; PPC pair-based
+		// types consume their PAIR entry via i++.
+		if(reloc->r_type == GENERIC_RELOC_VANILLA)
+		{
+		    *wordPtr = word;
+		    continue;
+		}
+#ifdef powerpc_HOST_ARCH
+		else if(reloc->r_type == PPC_RELOC_LO16)
+		{
+		    ((unsigned short*) wordPtr)[1] = word & 0xFFFF;
+		    i++; continue;
+		}
+		else if(reloc->r_type == PPC_RELOC_HI16)
+		{
+		    ((unsigned short*) wordPtr)[1] = (word >> 16) & 0xFFFF;
+		    i++; continue;
+		}
+		else if(reloc->r_type == PPC_RELOC_HA16)
+		{
+		    ((unsigned short*) wordPtr)[1] = ((word >> 16) & 0xFFFF)
+			+ ((word & (1<<15)) ? 1 : 0);
+		    i++; continue;
+		}
+		else if(reloc->r_type == PPC_RELOC_BR24)
+		{
+		    if((long)word > (long)0x01FFFFFF || (long)word < (long)0xFFE00000)
+		    {
+			// The branch offset is too large.
+			// Therefore, we try to use a jump island.
+			if(jumpIsland == 0)
+			{
+			    barf("unconditional relative branch out of range: "
+				 "no jump island available");
+			}
+			
+			word = offsetToJumpIsland;
+			if((long)word > (long)0x01FFFFFF || (long)word < (long)0xFFE00000)
+			    barf("unconditional relative branch out of range: "
+				 "jump island out of range");
+		    }
+		    *wordPtr = (*wordPtr & 0xFC000003) | (word & 0x03FFFFFC);
+		    continue;
+		}
+#endif
+	    }
+	    /* Reached for unhandled reloc types AND for r_length != 2. */
+	    barf("\nunknown relocation %d",reloc->r_type);
+	    return 0;
+	}
+    }
+    return 1;
+}
+
+/* Harvest symbols from a Mach-O object.  Walks the load commands to find
+   the (single) LC_SEGMENT and LC_SYMTAB; allocates zero-fill storage for
+   S_ZEROFILL sections (aliasing their `offset' field, like the ELF BSS
+   trick); registers known sections; then makes two passes over the
+   symbol table: one to insert N_SECT definitions (external → global
+   table, others → local table), and one to allocate and register common
+   (undefined-with-size) symbols from a single calloc'd arena.
+   NOTE(review): segLC is dereferenced without a NULL check — assumes
+   every object has an LC_SEGMENT; confirm. */
+static int ocGetNames_MachO(ObjectCode* oc)
+{
+    char *image = (char*) oc->image;
+    struct mach_header *header = (struct mach_header*) image;
+    struct load_command *lc = (struct load_command*) (image + sizeof(struct mach_header));
+    unsigned i,curSymbol = 0;
+    struct segment_command *segLC = NULL;
+    struct section *sections;
+    struct symtab_command *symLC = NULL;
+    struct nlist *nlist;
+    unsigned long commonSize = 0;
+    char    *commonStorage = NULL;
+    unsigned long commonCounter;
+
+    for(i=0;i<header->ncmds;i++)
+    {
+	if(lc->cmd == LC_SEGMENT)
+	    segLC = (struct segment_command*) lc;
+	else if(lc->cmd == LC_SYMTAB)
+	    symLC = (struct symtab_command*) lc;
+	lc = (struct load_command *) ( ((char*)lc) + lc->cmdsize );
+    }
+
+    sections = (struct section*) (segLC+1);
+    nlist = symLC ? (struct nlist*) (image + symLC->symoff)
+                  : NULL;
+
+    for(i=0;i<segLC->nsects;i++)
+    {
+        if(sections[i].size == 0)
+            continue;
+
+        if((sections[i].flags & SECTION_TYPE) == S_ZEROFILL)
+        {
+            char * zeroFillArea = stgCallocBytes(1,sections[i].size,
+                                      "ocGetNames_MachO(common symbols)");
+            // alias offset so image + offset points at the zeroed area
+            sections[i].offset = zeroFillArea - image;
+        }
+
+	if(!strcmp(sections[i].sectname,"__text"))
+	    addSection(oc, SECTIONKIND_CODE_OR_RODATA,
+		(void*) (image + sections[i].offset),
+		(void*) (image + sections[i].offset + sections[i].size));
+	else if(!strcmp(sections[i].sectname,"__const"))
+	    addSection(oc, SECTIONKIND_RWDATA,
+		(void*) (image + sections[i].offset),
+		(void*) (image + sections[i].offset + sections[i].size));
+	else if(!strcmp(sections[i].sectname,"__data"))
+	    addSection(oc, SECTIONKIND_RWDATA,
+		(void*) (image + sections[i].offset),
+		(void*) (image + sections[i].offset + sections[i].size));
+	else if(!strcmp(sections[i].sectname,"__bss")
+	        || !strcmp(sections[i].sectname,"__common"))
+	    addSection(oc, SECTIONKIND_RWDATA,
+		(void*) (image + sections[i].offset),
+		(void*) (image + sections[i].offset + sections[i].size));
+
+        addProddableBlock(oc, (void*) (image + sections[i].offset),
+                                        sections[i].size);
+    }
+
+	// count external symbols defined here
+    oc->n_symbols = 0;
+    if(symLC)
+    {
+        for(i=0;i<symLC->nsyms;i++)
+        {
+            if(nlist[i].n_type & N_STAB)
+                ;   // debugger symbols: skip
+            else if(nlist[i].n_type & N_EXT)
+            {
+                if((nlist[i].n_type & N_TYPE) == N_UNDF
+                    && (nlist[i].n_value != 0))
+                {
+                    // "undefined" with a size: a common symbol
+                    commonSize += nlist[i].n_value;
+                    oc->n_symbols++;
+                }
+                else if((nlist[i].n_type & N_TYPE) == N_SECT)
+                    oc->n_symbols++;
+            }
+        }
+    }
+    oc->symbols = stgMallocBytes(oc->n_symbols * sizeof(char*),
+				   "ocGetNames_MachO(oc->symbols)");
+
+    if(symLC)
+    {
+        for(i=0;i<symLC->nsyms;i++)
+        {
+            if(nlist[i].n_type & N_STAB)
+                ;
+            else if((nlist[i].n_type & N_TYPE) == N_SECT)
+            {
+                if(nlist[i].n_type & N_EXT)
+                {
+                    // external definition: global symbol table
+                    char *nm = image + symLC->stroff + nlist[i].n_un.n_strx;
+                    ghciInsertStrHashTable(oc->fileName, symhash, nm,
+                                            image
+                                            + sections[nlist[i].n_sect-1].offset
+                                            - sections[nlist[i].n_sect-1].addr
+                                            + nlist[i].n_value);
+                    oc->symbols[curSymbol++] = nm;
+                }
+                else
+                {
+                    // non-external: this object's local table only
+                    char *nm = image + symLC->stroff + nlist[i].n_un.n_strx;
+                    ghciInsertStrHashTable(oc->fileName, oc->lochash, nm,
+                                            image
+                                            + sections[nlist[i].n_sect-1].offset
+                                            - sections[nlist[i].n_sect-1].addr
+                                            + nlist[i].n_value);
+                }
+            }
+        }
+    }
+
+    // Allocate common symbols from one arena, rewriting each n_value to
+    // its allocated address so later lookups/relocations see it.
+    commonStorage = stgCallocBytes(1,commonSize,"ocGetNames_MachO(common symbols)");
+    commonCounter = (unsigned long)commonStorage;
+    if(symLC)
+    {
+        for(i=0;i<symLC->nsyms;i++)
+        {
+	    if((nlist[i].n_type & N_TYPE) == N_UNDF
+	            && (nlist[i].n_type & N_EXT) && (nlist[i].n_value != 0))
+	    {
+	        char *nm = image + symLC->stroff + nlist[i].n_un.n_strx;
+	        unsigned long sz = nlist[i].n_value;
+
+	        nlist[i].n_value = commonCounter;
+
+	        ghciInsertStrHashTable(oc->fileName, symhash, nm,
+	    			       (void*)commonCounter);
+	        oc->symbols[curSymbol++] = nm;
+
+	        commonCounter += sz;
+	    }
+        }
+    }
+    return 1;
+}
+
+/* Resolve a Mach-O object: locate the segment, symtab and dynamic-symtab
+   load commands, fill in every symbol-pointer / jump-table section via
+   resolveImports (needs the indirect symbol table from LC_DYSYMTAB),
+   then relocate every section.  Frees the local symbol table on success
+   and flushes the instruction cache on PowerPC.  Returns 1 on success,
+   0 on error. */
+static int ocResolve_MachO(ObjectCode* oc)
+{
+    char *image = (char*) oc->image;
+    struct mach_header *header = (struct mach_header*) image;
+    struct load_command *lc = (struct load_command*) (image + sizeof(struct mach_header));
+    unsigned i;
+    struct segment_command *segLC = NULL;
+    struct section *sections;
+    struct symtab_command *symLC = NULL;
+    struct dysymtab_command *dsymLC = NULL;
+    struct nlist *nlist;
+
+    for(i=0;i<header->ncmds;i++)
+    {
+	if(lc->cmd == LC_SEGMENT)
+	    segLC = (struct segment_command*) lc;
+	else if(lc->cmd == LC_SYMTAB)
+	    symLC = (struct symtab_command*) lc;
+	else if(lc->cmd == LC_DYSYMTAB)
+	    dsymLC = (struct dysymtab_command*) lc;
+	lc = (struct load_command *) ( ((char*)lc) + lc->cmdsize );
+    }
+
+    sections = (struct section*) (segLC+1);
+    nlist = symLC ? (struct nlist*) (image + symLC->symoff)
+                  : NULL;
+
+    if(dsymLC)
+    {
+        unsigned long *indirectSyms
+            = (unsigned long*) (image + dsymLC->indirectsymoff);
+
+        for(i=0;i<segLC->nsects;i++)
+        {
+            // lazy pointers
+            if(    !strcmp(sections[i].sectname,"__la_symbol_ptr")
+                || !strcmp(sections[i].sectname,"__la_sym_ptr2")
+                || !strcmp(sections[i].sectname,"__la_sym_ptr3"))
+            {
+                if(!resolveImports(oc,image,symLC,&sections[i],indirectSyms,nlist))
+                    return 0;
+            }
+            // non-lazy pointers
+            else if(!strcmp(sections[i].sectname,"__nl_symbol_ptr")
+                ||  !strcmp(sections[i].sectname,"__pointers"))
+            {
+                if(!resolveImports(oc,image,symLC,&sections[i],indirectSyms,nlist))
+                    return 0;
+            }
+            // i386 stub section
+            else if(!strcmp(sections[i].sectname,"__jump_table"))
+            {
+                if(!resolveImports(oc,image,symLC,&sections[i],indirectSyms,nlist))
+                    return 0;
+            }
+        }
+    }
+    
+    for(i=0;i<segLC->nsects;i++)
+    {
+	if(!relocateSection(oc,image,symLC,nlist,segLC->nsects,sections,&sections[i]))
+	    return 0;
+    }
+
+    /* Free the local symbol table; we won't need it again. */
+    freeHashTable(oc->lochash, NULL);
+    oc->lochash = NULL;
+
+#if defined (powerpc_HOST_ARCH)
+    ocFlushInstructionCache( oc );
+#endif
+
+    return 1;
+}
+
+#ifdef powerpc_HOST_ARCH
+/*
+ * The Mach-O object format uses leading underscores. But not everywhere.
+ * There is a small number of runtime support functions defined in
+ * libcc_dynamic.a whose name does not have a leading underscore.
+ * As a consequence, we can't get their address from C code.
+ * We have to use inline assembler just to take the address of a function.
+ * Yuck.
+ */
+
+/* Register the no-leading-underscore runtime symbols.  The inline asm
+   emits a data table `_symbolsWithoutUnderscore' containing the address
+   of each symbol in RTS_MACHO_NOUNDERLINE_SYMBOLS (first Sym expansion),
+   then a second expansion of the same macro list walks that table and
+   inserts each entry into the global symbol hash table.  The two Sym
+   definitions must stay in lock-step with the macro list's order. */
+static void machoInitSymbolsWithoutUnderscore()
+{
+    extern void* symbolsWithoutUnderscore[];
+    void **p = symbolsWithoutUnderscore;
+    __asm__ volatile(".globl _symbolsWithoutUnderscore\n.data\n_symbolsWithoutUnderscore:");
+
+#undef Sym
+#define Sym(x)  \
+    __asm__ volatile(".long " # x);
+
+    RTS_MACHO_NOUNDERLINE_SYMBOLS
+
+    __asm__ volatile(".text");
+    
+#undef Sym
+#define Sym(x)  \
+    ghciInsertStrHashTable("(GHCi built-in symbols)", symhash, #x, *p++);
+    
+    RTS_MACHO_NOUNDERLINE_SYMBOLS
+    
+#undef Sym
+}
+#endif
+
+/*
+ * Figure out by how much to shift the entire Mach-O file in memory
+ * when loading so that its single segment ends up 16-byte-aligned
+ */
+/*
+ * Figure out by how much to shift the entire Mach-O file in memory
+ * when loading so that its single segment ends up 16-byte-aligned.
+ * Reads only the mach_header and rewinds the stream; returns 0 for
+ * non-Mach-O files.  NOTE(review): the fread return value is not
+ * checked — a short read would leave `header' partly uninitialised.
+ */
+static int machoGetMisalignment( FILE * f )
+{
+    struct mach_header header;
+    int misalignment;
+    
+    fread(&header, sizeof(header), 1, f);
+    rewind(f);
+    
+    if(header.magic != MH_MAGIC)
+        return 0;
+    
+    misalignment = (header.sizeofcmds + sizeof(header))
+                    & 0xF;
+
+    return misalignment ? (16 - misalignment) : 0;
+}
+
+#endif
diff --git a/rts/LinkerInternals.h b/rts/LinkerInternals.h
new file mode 100644
index 0000000000..07d6334c7f
--- /dev/null
+++ b/rts/LinkerInternals.h
@@ -0,0 +1,110 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2000
+ *
+ * RTS Object Linker
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef LINKERINTERNALS_H
+#define LINKERINTERNALS_H
+
+/* Load state of an object module: mapped into memory, or fully
+   relocated and ready to run. */
+typedef enum { OBJECT_LOADED, OBJECT_RESOLVED } OStatus;
+
+/* Indication of section kinds for loaded objects.  Needed by
+   the GC for deciding whether or not a pointer on the stack
+   is a code pointer.
+*/
+typedef
+   enum { SECTIONKIND_CODE_OR_RODATA,
+          SECTIONKIND_RWDATA,
+          SECTIONKIND_OTHER,
+          SECTIONKIND_NOINFOAVAIL }
+   SectionKind;
+
+/* A [start,end) address range of a loaded section, tagged with its
+   kind.  Linked into ObjectCode.sections. */
+typedef
+   struct _Section {
+      void* start;
+      void* end;
+      SectionKind kind;
+      struct _Section* next;
+   }
+   Section;
+
+/* A memory region that the relocation code is permitted to write to;
+   used for sanity checking only (see ObjectCode.proddables). */
+typedef
+   struct _ProddableBlock {
+      void* start;
+      int   size;
+      struct _ProddableBlock* next;
+   }
+   ProddableBlock;
+
+/* Jump Islands are snippets of machine code required for relative
+ * address relocations on the PowerPC.
+ */
+#ifdef powerpc_HOST_ARCH
+typedef struct {
+    short lis_r12, hi_addr;     /* lis  r12,hi(addr)   */
+    short ori_r12_r12, lo_addr; /* ori  r12,r12,lo(addr) */
+    long mtctr_r12;             /* mtctr r12           */
+    long bctr;                  /* bctr                */
+} ppcJumpIsland;
+#endif
+
+/* Top-level structure for an object module.  One of these is allocated
+ * for each object file in use.
+ */
+typedef struct _ObjectCode {
+    OStatus    status;
+    char*      fileName;
+    int        fileSize;
+    char*      formatName;            /* eg "ELF32", "DLL", "COFF", etc. */
+
+    /* An array containing ptrs to all the symbol names copied from
+       this object into the global symbol hash table.  This is so that
+       we know which parts of the latter mapping to nuke when this
+       object is removed from the system. */
+    char**     symbols;
+    int        n_symbols;
+
+    /* ptr to malloc'd lump of memory holding the obj file */
+    char*      image;
+
+#ifdef darwin_HOST_OS
+    /* record by how much image has been deliberately misaligned
+       after allocation, so that we can use realloc */
+    int        misalignment;
+#endif
+
+    /* The section-kind entries for this object module.  Linked
+       list. */
+    Section* sections;
+
+    /* A private hash table for local symbols. */
+    HashTable* lochash;
+
+    /* Allow a chain of these things */
+    struct _ObjectCode * next;
+
+    /* SANITY CHECK ONLY: a list of the only memory regions which may
+       safely be prodded during relocation.  Any attempt to prod
+       outside one of these is an error in the linker. */
+    ProddableBlock* proddables;
+
+#ifdef ia64_HOST_ARCH
+    /* Procedure Linkage Table for this object */
+    void *plt;
+    unsigned int pltIndex;
+#endif
+
+#ifdef powerpc_HOST_ARCH
+    ppcJumpIsland *jump_islands;
+    unsigned long island_start_symbol; /* index of the ghciJumpIsland* symbol */
+    unsigned long n_islands;
+#endif
+
+} ObjectCode;
+
+extern ObjectCode *objects;
+
+#endif /* LINKERINTERNALS_H */
diff --git a/rts/MBlock.c b/rts/MBlock.c
new file mode 100644
index 0000000000..fa8fd49d88
--- /dev/null
+++ b/rts/MBlock.c
@@ -0,0 +1,453 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-1999
+ *
+ * MegaBlock Allocator Interface. This file contains all the dirty
+ * architecture-dependent hackery required to get a chunk of aligned
+ * memory from the operating system.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/* This is non-posix compliant. */
+/* #include "PosixSource.h" */
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "MBlock.h"
+#include "BlockAlloc.h"
+
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifndef mingw32_HOST_OS
+# ifdef HAVE_SYS_MMAN_H
+# include <sys/mman.h>
+# endif
+#endif
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+#if HAVE_WINDOWS_H
+#include <windows.h>
+#endif
+#if darwin_HOST_OS
+#include <mach/vm_map.h>
+#endif
+
+#include <errno.h>
+
+lnat mblocks_allocated = 0;
+
+/* -----------------------------------------------------------------------------
+ The MBlock Map: provides our implementation of HEAP_ALLOCED()
+ -------------------------------------------------------------------------- */
+
+#if SIZEOF_VOID_P == 4
+StgWord8 mblock_map[MBLOCK_MAP_SIZE]; // initially all zeros
+#elif SIZEOF_VOID_P == 8
+static MBlockMap dummy_mblock_map;
+MBlockMap *mblock_cache = &dummy_mblock_map;
+int mblock_map_count = 0;
+MBlockMap **mblock_maps = NULL;
+
+static MBlockMap *
+findMBlockMap(void *p)
+{
+    /* Linear scan of the per-4GB block maps for the one whose high
+       32 address bits match those of p; NULL if none does. */
+    StgWord32 hi = (StgWord32) (((StgWord)p) >> 32);
+    int idx;
+
+    for( idx = 0; idx < mblock_map_count; idx++ )
+    {
+        if(mblock_maps[idx]->addrHigh32 == hi)
+            return mblock_maps[idx];
+    }
+    return NULL;
+}
+
+StgBool
+slowIsHeapAlloced(void *p)
+{
+    /* Slow path of HEAP_ALLOCED on 64-bit machines: find the 4GB
+       block map covering p and cache it for later fast-path hits. */
+    MBlockMap *map = findMBlockMap(p);
+
+    if(map == NULL)
+        return 0;
+
+    mblock_cache = map;
+    return map->mblocks[MBLOCK_MAP_ENTRY(p)];
+}
+
+/* Record in the mblock map that the megablock at p belongs to the heap. */
+static void
+markHeapAlloced(void *p)
+{
+#if SIZEOF_VOID_P == 4
+    mblock_map[MBLOCK_MAP_ENTRY(p)] = 1;
+#elif SIZEOF_VOID_P == 8
+    MBlockMap *map = findMBlockMap(p);
+    if(map == NULL)
+    {
+        /* No map covers this 4GB span yet; grow the table and add one.
+           Use a temporary for realloc so mblock_maps isn't clobbered
+           (and leaked) on failure, and check both allocations: the GC
+           relies on this table, so we cannot limp on without it. */
+        MBlockMap **new_maps;
+        new_maps = realloc(mblock_maps,
+                           sizeof(MBlockMap*) * (mblock_map_count + 1));
+        if(new_maps == NULL)
+        {
+            barf("markHeapAlloced: out of memory");
+        }
+        mblock_maps = new_maps;
+        mblock_map_count++;
+        map = mblock_maps[mblock_map_count-1] = calloc(1,sizeof(MBlockMap));
+        if(map == NULL)
+        {
+            barf("markHeapAlloced: out of memory");
+        }
+        map->addrHigh32 = (StgWord32) (((StgWord)p) >> 32);
+    }
+    map->mblocks[MBLOCK_MAP_ENTRY(p)] = 1;
+    /* Cache the map we just touched for the HEAP_ALLOCED fast path. */
+    mblock_cache = map;
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Allocate new mblock(s)
+ -------------------------------------------------------------------------- */
+
+/* Allocate a single megablock; convenience wrapper for getMBlocks(1). */
+void *
+getMBlock(void)
+{
+  return getMBlocks(1);
+}
+
+/* -----------------------------------------------------------------------------
+ The mmap() method
+
+ On Unix-like systems, we use mmap() to allocate our memory. We
+ want memory in chunks of MBLOCK_SIZE, and aligned on an MBLOCK_SIZE
+ boundary. The mmap() interface doesn't give us this level of
+ control, so we have to use some heuristics.
+
+ In the general case, if we want a block of n megablocks, then we
+ allocate n+1 and trim off the slop from either side (using
+ munmap()) to get an aligned chunk of size n. However, the next
+ time we'll try to allocate directly after the previously allocated
+ chunk, on the grounds that this is aligned and likely to be free.
+ If it turns out that we were wrong, we have to munmap() and try
+ again using the general method.
+
+ Note on posix_memalign(): this interface is available on recent
+ systems and appears to provide exactly what we want. However, it
+ turns out not to be as good as our mmap() implementation, because
+ it wastes extra space (using double the address space, in a test on
+ x86_64/Linux). The problem seems to be that posix_memalign()
+ returns memory that can be free()'d, so the library must store
+ extra information along with the allocated block, thus messing up
+ the alignment. Hence, we don't use posix_memalign() for now.
+
+ -------------------------------------------------------------------------- */
+
+#if !defined(mingw32_HOST_OS) && !defined(cygwin32_HOST_OS)
+
+// A wrapper around mmap(), to abstract away from OS differences in
+// the mmap() interface.
+
+/* OS-specific anonymous memory mapping.  Tries to map 'size' bytes at
+   'addr' (a hint, not a guarantee); aborts the RTS on out-of-memory. */
+static void *
+my_mmap (void *addr, lnat size)
+{
+    void *ret;
+
+#if defined(solaris2_HOST_OS) || defined(irix_HOST_OS)
+    {
+	/* These systems lack MAP_ANON; map /dev/zero instead. */
+	int fd = open("/dev/zero",O_RDONLY);
+	ret = mmap(addr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	close(fd);
+    }
+#elif hpux_HOST_OS
+    ret = mmap(addr, size, PROT_READ | PROT_WRITE,
+	       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+#elif darwin_HOST_OS
+    // Without MAP_FIXED, Apple's mmap ignores addr.
+    // With MAP_FIXED, it overwrites already mapped regions, which is bad.
+    // mmap(0, ... MAP_FIXED ...) is worst of all: It unmaps the program text
+    // and replaces it with zeroes, causing instant death.
+    // This behaviour seems to be conformant with IEEE Std 1003.1-2001.
+    // Let's just use the underlying Mach Microkernel calls directly,
+    // they're much nicer.
+
+    kern_return_t err;
+    ret = addr;
+    if(addr) // try to allocate at address
+	err = vm_allocate(mach_task_self(),(vm_address_t*) &ret, size, FALSE);
+    if(!addr || err) // try to allocate anywhere
+	err = vm_allocate(mach_task_self(),(vm_address_t*) &ret, size, TRUE);
+
+    if(err) {
+	// don't know what the error codes mean exactly, assume it's
+	// not our problem though.
+	errorBelch("memory allocation failed (requested %lu bytes)", size);
+	stg_exit(EXIT_FAILURE);
+    } else {
+	vm_protect(mach_task_self(),ret,size,FALSE,VM_PROT_READ|VM_PROT_WRITE);
+    }
+#else
+    ret = mmap(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC,
+	       MAP_ANON | MAP_PRIVATE, -1, 0);
+#endif
+
+    if (ret == (void *)-1) {
+	if (errno == ENOMEM ||
+	    (errno == EINVAL && sizeof(void*)==4 && size >= 0xc0000000)) {
+	    // If we request more than 3Gig, then we get EINVAL
+	    // instead of ENOMEM (at least on Linux).
+	    errorBelch("out of memory (requested %lu bytes)", size);
+	    stg_exit(EXIT_FAILURE);
+	} else {
+	    barf("getMBlock: mmap: %s", strerror(errno));
+	}
+    }
+
+    return ret;
+}
+
+// Implements the general case: allocate a chunk of memory of 'size'
+// mblocks.
+
+/* Implements the general case: map 'size' bytes of megablocks with no
+   placement hint, over-allocating by one megablock and trimming so the
+   returned address is MBLOCK_SIZE-aligned. */
+static void *
+gen_map_mblocks (lnat size)
+{
+    int slop;
+    void *ret;
+
+    // Try to map a larger block, and take the aligned portion from
+    // it (unmap the rest).
+    size += MBLOCK_SIZE;
+    ret = my_mmap(0, size);
+
+    // unmap the slop bits around the chunk we allocated
+    slop = (W_)ret & MBLOCK_MASK;
+
+    // (when slop == 0 this releases the whole extra megablock we
+    // over-allocated at the front; otherwise just the misaligned prefix)
+    if (munmap(ret, MBLOCK_SIZE - slop) == -1) {
+      barf("gen_map_mblocks: munmap failed");
+    }
+    if (slop > 0 && munmap(ret+size-slop, slop) == -1) {
+      barf("gen_map_mblocks: munmap failed");
+    }
+
+    // ToDo: if we happened to get an aligned block, then don't
+    // unmap the excess, just use it. For this to work, you
+    // need to keep in mind the following:
+    //     * Calling my_mmap() with an 'addr' arg pointing to
+    //       already my_mmap()ed space is OK and won't fail.
+    //     * If my_mmap() can't satisfy the request at the
+    //       given 'next_request' address in getMBlocks(), that
+    //       you unmap the extra mblock mmap()ed here (or simply
+    //       satisfy yourself that the slop introduced isn't worth
+    //       salvaging.)
+    //
+
+    // next time, try after the block we just got.
+    // NB. arithmetic on void* here relies on the GCC extension that
+    // treats sizeof(void) as 1.
+    ret += MBLOCK_SIZE - slop;
+    return ret;
+}
+
+
+// The external interface: allocate 'n' mblocks, and return the
+// address.
+
+/* Allocate 'n' contiguous megablocks, aligned on MBLOCK_SIZE, and
+   record them in the mblock map.  Aborts the RTS on failure. */
+void *
+getMBlocks(nat n)
+{
+  // Hint for the next mapping; starts at HEAP_BASE (0 on most
+  // platforms, in which case the first call takes the general path).
+  static caddr_t next_request = (caddr_t)HEAP_BASE;
+  caddr_t ret;
+  lnat size = MBLOCK_SIZE * n;
+  nat i;
+ 
+  if (next_request == 0) {
+      // use gen_map_mblocks the first time.
+      ret = gen_map_mblocks(size);
+  } else {
+      ret = my_mmap(next_request, size);
+
+      if (((W_)ret & MBLOCK_MASK) != 0) {
+	  // misaligned block!
+#if 0 // defined(DEBUG)
+	  errorBelch("warning: getMBlock: misaligned block %p returned when allocating %d megablock(s) at %p", ret, n, next_request);
+#endif
+
+	  // unmap this block...
+	  if (munmap(ret, size) == -1) {
+	      barf("getMBlock: munmap failed");
+	  }
+	  // and do it the hard way
+	  ret = gen_map_mblocks(size);
+      }
+  }
+
+  // Next time, we'll try to allocate right after the block we just got.
+  // ToDo: check that we haven't already grabbed the memory at next_request
+  next_request = ret + size;
+
+  IF_DEBUG(gc,debugBelch("Allocated %d megablock(s) at %p\n",n,ret));
+
+  // fill in the table
+  for (i = 0; i < n; i++) {
+      markHeapAlloced( ret + i * MBLOCK_SIZE );
+  }
+
+  mblocks_allocated += n;
+
+  return ret;
+}
+
+void
+freeAllMBlocks(void)
+{
+  /* XXX Do something here */
+  /* NOTE(review): on POSIX systems megablocks are never munmap'd back
+     to the OS; presumably this is only reached at RTS shutdown, where
+     process exit reclaims everything — confirm before relying on it. */
+}
+
+#else /* defined(mingw32_HOST_OS) || defined(cygwin32_HOST_OS) */
+
+/*
+ On Win32 platforms we make use of the two-phased virtual memory API
+ to allocate mega blocks. We proceed as follows:
+
+ Reserve a large chunk of VM (256M at the time, or what the user asked
+ for via the -M option), but don't supply a base address that's aligned on
+ a MB boundary. Instead we round up to the nearest mblock from the chunk of
+ VM we're handed back from the OS (at the moment we just leave the 'slop' at
+ the beginning of the reserved chunk unused - ToDo: reuse it .)
+
+ Reserving memory doesn't allocate physical storage (not even in the
+ page file), this is done later on by committing pages (or mega-blocks in
+ our case).
+*/
+
+static char* base_non_committed = (char*)0;
+static char* end_non_committed = (char*)0;
+
+static void *membase;
+
+/* Default is to reserve 256M of VM to minimise the slop cost. */
+#define SIZE_RESERVED_POOL ( 256 * 1024 * 1024 )
+
+/* Number of bytes reserved */
+static unsigned long size_reserved_pool = SIZE_RESERVED_POOL;
+
+/* Win32 variant: two-phase allocation.  A large region is reserved up
+   front (see comment above); each call commits 'n' megablocks from it.
+   Returns an MBLOCK_SIZE-aligned pointer or aborts the RTS. */
+void *
+getMBlocks(nat n)
+{
+  static char* base_mblocks       = (char*)0;
+  static char* next_request       = (char*)0;
+  void* ret                       = (void*)0;
+  nat i;
+
+  lnat size = MBLOCK_SIZE * n;
+  
+  // Reserve (or re-reserve) the pool when it is absent or exhausted.
+  if ( (base_non_committed == 0) || (next_request + size > end_non_committed) ) {
+    if (base_non_committed) {
+	/* Tacky, but if no user-provided -M option is in effect,
+	 * set it to the default (==256M) in time for the heap overflow PSA.
+	 */
+	if (RtsFlags.GcFlags.maxHeapSize == 0) {
+	   RtsFlags.GcFlags.maxHeapSize = size_reserved_pool / BLOCK_SIZE;
+	}
+       heapOverflow();
+    }
+    if (RtsFlags.GcFlags.maxHeapSize != 0) {
+      size_reserved_pool = BLOCK_SIZE * RtsFlags.GcFlags.maxHeapSize;
+      if (size_reserved_pool < MBLOCK_SIZE) {
+	size_reserved_pool = 2*MBLOCK_SIZE;
+      }
+    }
+    base_non_committed = VirtualAlloc ( NULL
+                                      , size_reserved_pool
+				      , MEM_RESERVE
+				      , PAGE_READWRITE
+				      );
+    membase = base_non_committed;
+    if ( base_non_committed == 0 ) {
+         errorBelch("getMBlocks: VirtualAlloc MEM_RESERVE %lu failed with: %ld\n", size_reserved_pool, GetLastError());
+         ret=(void*)-1;
+    } else {
+      end_non_committed = (char*)base_non_committed + (unsigned long)size_reserved_pool;
+      /* The returned pointer is not aligned on a mega-block boundary. Make it. */
+      base_mblocks = (char*)((unsigned long)base_non_committed & (unsigned long)~MBLOCK_MASK) + MBLOCK_SIZE;
+#      if 0
+       debugBelch("getMBlocks: Dropping %d bytes off of 256M chunk\n", 
+		  (unsigned)base_mblocks - (unsigned)base_non_committed);
+#      endif
+
+       if ( ((char*)base_mblocks + size) > end_non_committed ) {
+          debugBelch("getMBlocks: oops, committed too small a region to start with.");
+	  ret=(void*)-1;
+       } else {
+          next_request = base_mblocks;
+       }
+    }
+  }
+  /* Commit the mega block(s) to phys mem */
+  if ( ret != (void*)-1 ) {
+     ret = VirtualAlloc(next_request, size, MEM_COMMIT, PAGE_READWRITE);
+     if (ret == NULL) {
+        debugBelch("getMBlocks: VirtualAlloc MEM_COMMIT %lu failed with: %ld\n", size, GetLastError());
+	ret=(void*)-1;
+     }
+  }
+
+  if (((W_)ret & MBLOCK_MASK) != 0) {
+    barf("getMBlocks: misaligned block returned");
+  }
+
+  if (ret == (void*)-1) {
+     barf("getMBlocks: unknown memory allocation failure on Win32.");
+  }
+
+  IF_DEBUG(gc,debugBelch("Allocated %d megablock(s) at 0x%x\n",n,(nat)ret));
+  next_request = (char*)next_request + size;
+
+  mblocks_allocated += n;
+  
+  // fill in the table
+  for (i = 0; i < n; i++) {
+      markHeapAlloced( ret + i * MBLOCK_SIZE );
+  }
+
+  return ret;
+}
+
+void
+freeAllMBlocks(void)
+{
+  /* Release the whole reserved VM region back to the OS in one go
+     (size 0 + MEM_RELEASE frees the entire reservation at membase). */
+  if (VirtualFree(membase, 0, MEM_RELEASE) == FALSE) {
+     debugBelch("freeAllMBlocks: VirtualFree failed with: %ld\n", GetLastError());
+  }
+}
+
+/* Hand back the physical memory that is allocated to a mega-block.
+ ToDo: chain the released mega block onto some list so that
+ getMBlocks() can get at it.
+
+ Currently unused.
+*/
+#if 0
+/* Decommit (but keep reserved) the pages of one mega-block group. */
+void
+freeMBlock(void* p, nat n)
+{
+  BOOL rc;
+
+  rc = VirtualFree(p, n * MBLOCK_SIZE , MEM_DECOMMIT );
+  
+  if (rc == FALSE) {
+#    ifdef DEBUG
+     /* GetLastError() returns a DWORD; use %ld like the other call
+        sites in this file, not %d. */
+     debugBelch("freeMBlocks: VirtualFree failed with: %ld\n", GetLastError());
+#    endif
+  }
+  
+}
+#endif
+
+#endif
diff --git a/rts/MBlock.h b/rts/MBlock.h
new file mode 100644
index 0000000000..1cc0dc5a1f
--- /dev/null
+++ b/rts/MBlock.h
@@ -0,0 +1,90 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * MegaBlock Allocator interface.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef MBLOCK_H
+#define MBLOCK_H
+
+extern lnat RTS_VAR(mblocks_allocated);
+
+extern void * getMBlock(void);
+extern void * getMBlocks(nat n);
+extern void freeAllMBlocks(void);
+
+#if osf3_HOST_OS
+/* ToDo: Perhaps by adjusting this value we can make linking without
+ * -static work (i.e., not generate a core-dumping executable)? */
+#if SIZEOF_VOID_P == 8
+#define HEAP_BASE 0x180000000L
+#else
+#error I have no idea where to begin the heap on a non-64-bit osf3 machine.
+#endif
+
+#else
+
+// we're using the generic method
+#define HEAP_BASE 0
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ The HEAP_ALLOCED() test.
+
+ HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
+ It needs to be FAST.
+
+ Implementation of HEAP_ALLOCED
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
+ can just use a table to record which megablocks in the address
+ space belong to the heap. On a 32-bit machine, with 1Mb
+ megablocks, using 8 bits for each entry in the table, the table
+ requires 4k. Lookups during GC will be fast, because the table
+ will be quickly cached (indeed, performance measurements showed no
+ measurable difference between doing the table lookup and using a
+ constant comparison).
+
+ On 64-bit machines, we cache one 12-bit block map that describes
+ 4096 megablocks or 4GB of memory. If HEAP_ALLOCED is called for
+ an address that is not in the cache, it calls slowIsHeapAlloced
+ (see MBlock.c) which will find the block map for the 4GB block in
+ question.
+ -------------------------------------------------------------------------- */
+
+#if SIZEOF_VOID_P == 4
+extern StgWord8 mblock_map[];
+
+/* On a 32-bit machine a 4KB table is always sufficient */
+# define MBLOCK_MAP_SIZE	4096
+# define MBLOCK_MAP_ENTRY(p)	((StgWord)(p) >> MBLOCK_SHIFT)
+# define HEAP_ALLOCED(p)	mblock_map[MBLOCK_MAP_ENTRY(p)]
+
+#elif SIZEOF_VOID_P == 8
+
+# define MBLOCK_MAP_SIZE	4096
+# define MBLOCK_MAP_ENTRY(p)	(((StgWord)(p) & 0xffffffff) >> MBLOCK_SHIFT)
+
+/* One MBlockMap covers a 4GB span of the address space; addrHigh32
+   records which span (the top 32 bits of the addresses it covers). */
+typedef struct {
+    StgWord32	addrHigh32;
+    StgWord8	mblocks[MBLOCK_MAP_SIZE];
+} MBlockMap;
+
+extern MBlockMap *mblock_cache;
+
+StgBool slowIsHeapAlloced(void *p);
+
+/* NB. this macro evaluates p more than once: don't pass an expression
+   with side effects. */
+# define HEAP_ALLOCED(p)					\
+	( ((((StgWord)(p)) >> 32) == mblock_cache->addrHigh32)	\
+	? mblock_cache->mblocks[MBLOCK_MAP_ENTRY(p)]		\
+	: slowIsHeapAlloced(p) )
+
+#else
+# error HEAP_ALLOCED not defined
+#endif
+
+#endif /* MBLOCK_H */
diff --git a/rts/Main.c b/rts/Main.c
new file mode 100644
index 0000000000..6aef280e25
--- /dev/null
+++ b/rts/Main.c
@@ -0,0 +1,138 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2000
+ *
+ * Main function for a standalone Haskell program.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#define COMPILING_RTS_MAIN
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsAPI.h"
+#include "SchedAPI.h"
+#include "Schedule.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Prelude.h"
+#include "Task.h"
+#include <stdlib.h>
+
+#ifdef DEBUG
+# include "Printer.h" /* for printing */
+#endif
+
+#ifdef PAR
+# include "Parallel.h"
+# include "ParallelRts.h"
+# include "LLC.h"
+#endif
+
+#if defined(GRAN) || defined(PAR)
+# include "GranSimRts.h"
+#endif
+
+#ifdef HAVE_WINDOWS_H
+# include <windows.h>
+#endif
+
+extern void __stginit_ZCMain(void);
+
+/* Hack: we assume that we're building a batch-mode system unless
+ * INTERPRETER is set
+ */
+#ifndef INTERPRETER /* Hack */
+/* Entry point of a standalone Haskell program: start the RTS, run the
+   main IO action, and map the scheduler's verdict to an exit status. */
+int main(int argc, char *argv[])
+{
+    int exit_status;
+    SchedulerStatus status;
+    /* all GranSim/GUM init is done in startupHaskell; sets IAmMainThread! */
+
+    startupHaskell(argc,argv,__stginit_ZCMain);
+
+    /* kick off the computation by creating the main thread with a pointer
+       to mainIO_closure representing the computation of the overall program;
+       then enter the scheduler with this thread and off we go;
+      
+       the same for GranSim (we have only one instance of this code)
+
+       in a parallel setup, where we have many instances of this code
+       running on different PEs, we should do this only for the main PE
+       (IAmMainThread is set in startupHaskell) 
+    */
+
+#  if defined(PAR)
+
+#   if defined(DEBUG)
+    { /* a wait loop to allow attachment of gdb to UNIX threads */
+      nat i, j, s;
+
+      for (i=0, s=0; i<(nat)RtsFlags.ParFlags.wait; i++)
+	for (j=0; j<1000000; j++) 
+	  s += j % 65536;
+    }
+    IF_PAR_DEBUG(verbose,
+		 belch("Passed wait loop"));
+#   endif
+
+    if (IAmMainThread == rtsTrue) {
+      IF_PAR_DEBUG(verbose,
+		   debugBelch("==== [%x] Main Thread Started ...\n", mytid));
+
+      /* ToDo: Dump event for the main thread */
+      status = rts_mainLazyIO((HaskellObj)mainIO_closure, NULL);
+    } else {
+      /* Just to show we're alive */
+      IF_PAR_DEBUG(verbose,
+		   debugBelch("== [%x] Non-Main PE enters scheduler via taskStart() without work ...\n",
+			   mytid));
+     
+      /* all non-main threads enter the scheduler without work */
+      taskStart();       
+      status = Success;  // declare victory (see shutdownParallelSystem)
+    }
+
+#  elif defined(GRAN)
+
+    /* ToDo: Dump event for the main thread */
+    status = rts_mainLazyIO(mainIO_closure, NULL);
+
+#  else /* !PAR && !GRAN */
+
+    /* ToDo: want to start with a larger stack size */
+    { 
+	void *cap = rts_lock();
+	cap = rts_evalLazyIO(cap,(HaskellObj)(void *)mainIO_closure, NULL);
+	status = rts_getSchedStatus(cap);
+	rts_unlock(cap);
+    }
+
+#  endif /* !PAR && !GRAN */
+
+    /* check the status of the entire Haskell computation */
+    switch (status) {
+    case Killed:
+      errorBelch("main thread exited (uncaught exception)");
+      exit_status = EXIT_KILLED;
+      break;
+    case Interrupted:
+      errorBelch("interrupted");
+      exit_status = EXIT_INTERRUPTED;
+      break;
+    case Success:
+      exit_status = EXIT_SUCCESS;
+      break;
+#if defined(PAR)
+    case NoStatus:
+      errorBelch("main thread PE killed; probably due to failure of another PE; check /tmp/pvml...");
+      exit_status = EXIT_KILLED;
+      break;
+#endif 
+    default:
+      barf("main thread completed with invalid status");
+    }
+    shutdownHaskellAndExit(exit_status);
+    return 0; /* never reached, keep gcc -Wall happy */
+}
+# endif /* !INTERPRETER */
diff --git a/rts/Makefile b/rts/Makefile
new file mode 100644
index 0000000000..2319788d65
--- /dev/null
+++ b/rts/Makefile
@@ -0,0 +1,370 @@
+#-----------------------------------------------------------------------------
+#
+# This is the Makefile for the runtime-system stuff.
+# This stuff is written in C (and cannot be written in Haskell).
+#
+# .c files are vanilla C,
+# .hc files are "Haskellized-C", compiled using the C compiler and
+# (possibly) the assembly-mangler. The GHC driver script
+# knows how to compile this stuff.
+#
+# Other sorta independent, compile-once subdirs are:
+# gmp -- GNU multi-precision library (for Integer)
+
+#-----------------------------------------------------------------------------
+# Preamble
+
+TOP=..
+
+# Set UseGhcForCc: this causes the fptools build system to use a different
+# set of suffix rules for compiling C code, using $(HC) rather than $(CC)
+# and prepending "-optc" to $(CC_OPTS). NB. must be done before including
+# boilerplate.mk below.
+UseGhcForCc = YES
+
+include $(TOP)/mk/boilerplate.mk
+
+PACKAGE = rts
+
+HC=$(GHC_INPLACE)
+
+# -----------------------------------------------------------------------------
+# RTS ways
+
+WAYS=$(GhcLibWays) $(GhcRTSWays)
+
+ifneq "$(findstring debug, $(way))" ""
+GhcRtsHcOpts=
+GhcRtsCcOpts=-g
+endif
+
+# -----------------------------------------------------------------------------
+
+# Tells the build system not to add various Haskellish options to $(SRC_HC_OPTS)
+NON_HS_PACKAGE = YES
+
+# grab sources from these subdirectories
+ALL_DIRS = hooks parallel
+
+ifeq "$(HOSTPLATFORM)" "i386-unknown-mingw32"
+ALL_DIRS += win32
+else
+ALL_DIRS += posix
+endif
+
+ifneq "$(DLLized)" "YES"
+EXCLUDED_SRCS += RtsDllMain.c
+else
+EXCLUDED_SRCS += Main.c
+endif
+
+# This file ends up being empty unless we're building for a powerpc
+# or darwin system, and it is reported that Solaris ld chokes on it when
+# building HSrts.o.
+ifeq "$(findstring $(TargetArch_CPP), powerpc powerpc64)" ""
+ifeq "$(findstring $(TargetOS_CPP), darwin)" ""
+EXCLUDED_SRCS += AdjustorAsm.S
+endif
+endif
+
+EXCLUDED_SRCS += parallel/SysMan.c
+
+# The build system doesn't give us these
+CMM_SRCS = $(filter-out AutoApply%.cmm, $(wildcard *.cmm)) $(EXTRA_CMM_SRCS)
+CMM_OBJS = $(patsubst %.cmm,%.$(way_)o, $(CMM_SRCS))
+
+CLEAN_FILES += $(CMM_OBJS)
+
+# Override the default $(LIBOBJS) (defaults to $(HS_OBJS))
+LIBOBJS = $(C_OBJS) $(CMM_OBJS)
+
+SplitObjs=NO
+
+H_FILES = $(wildcard ../includes/*.h) $(wildcard *.h)
+
+#-----------------------------------------------------------------------------
+# Flags for compiling RTS .c and .hc files
+
+# gcc provides lots of useful warnings if you ask it.
+# This is a pretty good list to start with - use a # to comment out
+# any you don't like.
+WARNING_OPTS += -Wall
+WARNING_OPTS += -W
+WARNING_OPTS += -Wstrict-prototypes
+WARNING_OPTS += -Wmissing-prototypes
+WARNING_OPTS += -Wmissing-declarations
+WARNING_OPTS += -Winline
+WARNING_OPTS += -Waggregate-return
+#WARNING_OPTS += -Wpointer-arith
+WARNING_OPTS += -Wbad-function-cast
+#WARNING_OPTS += -Wcast-align
+#WARNING_OPTS += -Wnested-externs
+#WARNING_OPTS += -Wshadow
+#WARNING_OPTS += -Wcast-qual
+#WARNING_OPTS += -Wno-unused
+#WARNING_OPTS += -Wredundant-decls
+#WARNING_OPTS += -Wconversion
+
+STANDARD_OPTS += -I../includes -I. -Iparallel
+# COMPILING_RTS is only used when building Win32 DLL support.
+STANDARD_OPTS += -DCOMPILING_RTS
+
+# HC_OPTS is included in both .c and .cmm compilations, whereas CC_OPTS is
+# only included in .c compilations. HC_OPTS included the WAY_* opts, which
+# must be included in both types of compilations.
+
+SRC_CC_OPTS += $(WARNING_OPTS)
+SRC_CC_OPTS += $(STANDARD_OPTS)
+
+SRC_CC_OPTS += $(GhcRtsCcOpts)
+SRC_HC_OPTS += $(GhcRtsHcOpts)
+
+ifneq "$(DLLized)" "YES"
+SRC_HC_OPTS += -static
+endif
+# SRC_HC_OPTS += -fPIC
+
+RtsMessages_CC_OPTS += -DProjectVersion=\"$(ProjectVersion)\"
+
+ifeq "$(way)" "mp"
+SRC_HC_OPTS += -I$$PVM_ROOT/include
+endif
+
+# If -DDEBUG is in effect, adjust package conf accordingly..
+ifneq "$(strip $(filter -optc-DDEBUG,$(GhcRtsHcOpts)))" ""
+PACKAGE_CPP_OPTS += -DDEBUG
+endif
+
+ifeq "$(HaveLibMingwEx)" "YES"
+PACKAGE_CPP_OPTS += -DHAVE_LIBMINGWEX
+endif
+
+ifeq "$(DotnetSupport)" "YES"
+
+#
+# Would like to just use SUBDIRS here, but need to
+# descend into dotnet/ earlier than that.
+#
+all ::
+ $(MAKE) -C dotnet all
+
+# But use SUBDIRS for other recursive targets.
+SUBDIRS += dotnet
+
+LIBOBJS += dotnet/Invoke.o
+endif
+
+# Suppress uninitialized variable warnings for GC.c
+GC_CC_OPTS += -Wno-uninitialized
+
+#-----------------------------------------------------------------------------
+# Include the Front panel code?
+
+# we need GTK+ for the front panel
+ifneq "$(GTK_CONFIG)" ""
+ifeq "$(GhcRtsWithFrontPanel)" "YES"
+SRC_HC_OPTS += `$(GTK_CONFIG) --cflags` -optc-DRTS_GTK_FRONTPANEL
+VisCallbacks_CC_OPTS += -Wno-unused
+SRC_MKDEPENDC_OPTS += `$(GTK_CONFIG) --cflags`
+else # GhcRtsWithFrontPanel
+EXCLUDED_SRCS += $(wildcard Vis*.c)
+endif
+else # GTK_CONFIG
+EXCLUDED_SRCS += $(wildcard Vis*.c)
+endif
+
+#-----------------------------------------------------------------------------
+# make depend setup
+
+SRC_MKDEPENDC_OPTS += -I. -I../includes
+
+# Hack: we define every way-related option here, so that we get (hopefully)
+# a superset of the dependencies. To do this properly, we should generate
+# a different set of dependencies for each way. Further hack: PROFILING and
+# TICKY_TICKY can't be used together, so we omit TICKY_TICKY for now.
+SRC_MKDEPENDC_OPTS += -DPROFILING -DTHREADED_RTS -DDEBUG
+
+# -----------------------------------------------------------------------------
+# The auto-generated apply code
+
+# We want a slightly different version for the unregisterised way, so we make
+# AutoApply on a per-way basis (eg. AutoApply_p.cmm).
+
+AUTO_APPLY_CMM = AutoApply$(_way).cmm
+
+ifneq "$(BootingFromHc)" "YES"
+$(AUTO_APPLY_CMM): $(GHC_GENAPPLY)
+ @$(RM) $@
+ $(GENAPPLY) $(if $(filter $(way), u debug_u), -u) >$@
+endif
+
+EXTRA_CMM_SRCS += $(AUTO_APPLY_CMM)
+
+CLEAN_FILES += $(AUTO_APPLY_CMM)
+
+# -----------------------------------------------------------------------------
+#
+# Building DLLs is only supported on mingw32 at the moment.
+#
+ifeq "$(DLLized)" "YES"
+SRC_BLD_DLL_OPTS += -lHS_imp_stub -lgmp_imp
+
+# It's not included in the DLL, but we need to compile it up separately.
+all :: Main.dll_o
+
+# Need an import library containing the symbols the RTS uses from the Prelude.
+# So, to avoid bootstrapping trouble, we build one containing just the syms
+# we need. Weirdly named to avoid clashing later on when compiling the contents
+# of ghc/lib/..
+#
+# Note: if you do change the name of the Prelude DLL, the "--dllname <nm>.dll"
+# below will need to be updated as well.
+
+$(DLL_PEN)/HSrts$(_way).dll :: libHS_imp_stub.a
+
+libHS_imp_stub.a :
+ dlltool --output-lib libHS_imp_stub.a --def HSprel.def --dllname HSstd.dll
+
+endif
+
+# -----------------------------------------------------------------------------
+# Compile GMP only if we don't have it already
+#
+# We use GMP's own configuration stuff, because it's all rather hairy
+# and not worth re-implementing in our Makefile framework.
+
+ifneq "$(HaveLibGmp)" "YES"
+ifneq "$(HaveFrameworkGMP)" "YES"
+boot ::
+ if [ -f gmp/config.status ]; then \
+ cd gmp && CC=$(WhatGccIsCalled) ./config.status; \
+ else \
+ cd gmp && CC=$(WhatGccIsCalled) ./configure --enable-shared=no \
+ --host=`echo $(HOSTPLATFORM) | sed 's/i[567]86/i486/g'`; \
+ fi
+
+# Slight cheatage here to pass host as target, but x-compilation isn't supported by ghc.
+
+ifeq "$(way)" ""
+all :: gmp/libgmp.a
+
+ifeq "$(DLLized)" "YES"
+all :: $(DLL_PEN)/gmp.dll
+
+$(DLL_PEN)/gmp.dll:
+ $(MAKE) -C gmp gmp.dll
+ $(MV) gmp/gmp.dll $(DLL_PEN)
+endif
+endif
+
+install :: gmp/libgmp.a
+
+ifeq "$(way)" ""
+clean distclean maintainer-clean ::
+ -$(MAKE) -C gmp MAKEFLAGS= $@
+
+INSTALL_LIBS += gmp/libgmp.a
+endif
+endif
+
+gmp/libgmp.a ::
+ $(MAKE) -C gmp MAKEFLAGS=
+ @$(CP) gmp/.libs/libgmp.a gmp
+ @$(RANLIB) gmp/libgmp.a
+endif
+
+CLEAN_FILES += gmp/libgmp.a
+
+#-----------------------------------------------------------------------------
+#
+# Building the GUM SysMan
+#
+
+ifeq "$(way)" "mp"
+all :: parallel/SysMan
+
+ifdef solaris2_TARGET_OS
+__socket_libs = -lsocket -lnsl
+else
+__socket_libs =
+endif
+
+parallel/SysMan : parallel/SysMan.mp_o parallel/LLComms.mp_o RtsUtils.mp_o RtsFlags.mp_o
+ $(RM) $@
+ gcc -o $@ parallel/SysMan.mp_o parallel/LLComms.mp_o -L$$PVM_ROOT/lib/$$PVM_ARCH -lgpvm3 -lpvm3 $(__socket_libs)
+
+CLEAN_FILES += parallel/SysMan.mp_o parallel/SysMan
+INSTALL_LIBEXECS += parallel/SysMan
+endif
+
+#-----------------------------------------------------------------------------
+# Compiling the cmm files
+
+# ToDo: should we really include Rts.h here? Required for GNU_ATTRIBUTE().
+SRC_HC_OPTS += \
+ -I. \
+ -\#include Prelude.h \
+ -\#include Rts.h \
+ -\#include RtsFlags.h \
+ -\#include RtsUtils.h \
+ -\#include StgRun.h \
+ -\#include Schedule.h \
+ -\#include Printer.h \
+ -\#include Sanity.h \
+ -\#include STM.h \
+ -\#include Storage.h \
+ -\#include SchedAPI.h \
+ -\#include Timer.h \
+ -\#include ProfHeap.h \
+ -\#include LdvProfile.h \
+ -\#include Profiling.h \
+ -\#include OSThreads.h \
+ -\#include Apply.h \
+ -\#include SMP.h
+
+ifeq "$(Windows)" "YES"
+PrimOps_HC_OPTS += -\#include '<windows.h>' -\#include win32/AsyncIO.h
+else
+PrimOps_HC_OPTS += -\#include posix/Itimer.h
+endif
+
+# -O3 helps unroll some loops (especially in copy() with a constant argument).
+# -fno-strict-aliasing is a hack because we often mix StgPtr and StgClosure pointers
+# to the same object, and gcc will assume these don't alias. eg. it happens in
+# copy() with gcc 3.4.3, the upd_evacee() assigments get moved before the object copy.
+GC_HC_OPTS += -optc-O3 -optc-fno-strict-aliasing
+
+# Cmm must be compiled via-C for now, because the NCG can't handle loops
+SRC_HC_OPTS += -fvia-C
+
+# We *want* type-checking of hand-written cmm.
+SRC_HC_OPTS += -dcmm-lint
+
+ifneq "$(BootingFromHc)" "YES"
+# .cmm files depend on all the .h files, to a first approximation.
+%.$(way_)o : %.cmm $(H_FILES)
+ $(HC_PRE_OPTS)
+ $(HC) $(HC_OPTS) -c $< -o $@
+ $(HC_POST_OPTS)
+
+%.$(way_)hc : %.cmm $(H_FILES)
+ $(HC) $(HC_OPTS) -C $< -o $@
+
+%.$(way_)s : %.cmm $(H_FILES)
+ $(HC) $(HC_OPTS) -S $< -o $@
+endif
+
+#-----------------------------------------------------------------------------
+#
+# Files to install
+#
+# Just libHSrts is installed uniformly across ways
+#
+INSTALL_LIBS += $(LIBRARY)
+ifeq "$(DLLized)" "YES"
+INSTALL_PROGS += $(DLL_NAME) gmp/gmp.dll
+# NB. variable was previously misspelled $(LIBARY), which expanded to
+# nothing, so the import library was never installed.
+INSTALL_LIBS += $(patsubst %.a,%_imp.a,$(LIBRARY))
+INSTALL_LIBS += gmp/libgmp_imp.a Main.dll_o
+endif
+
+include $(TOP)/mk/target.mk
diff --git a/rts/PosixSource.h b/rts/PosixSource.h
new file mode 100644
index 0000000000..a938f9bc0f
--- /dev/null
+++ b/rts/PosixSource.h
@@ -0,0 +1,18 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Include this file into sources which should not need any non-Posix services.
+ * That includes most RTS C sources.
+ * ---------------------------------------------------------------------------*/
+
+#ifndef POSIXSOURCE_H
+#define POSIXSOURCE_H
+
+#define _POSIX_SOURCE 1
+#define _POSIX_C_SOURCE 199506L
+#define _ISOC9X_SOURCE
+
+/* Let's be ISO C9X too... */
+
+#endif /* POSIXSOURCE_H */
diff --git a/rts/Prelude.h b/rts/Prelude.h
new file mode 100644
index 0000000000..c209b2b800
--- /dev/null
+++ b/rts/Prelude.h
@@ -0,0 +1,129 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Prelude identifiers that we sometimes need to refer to in the RTS.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PRELUDE_H
+#define PRELUDE_H
+
+/* These definitions are required by the RTS .cmm files too, so we
+ * need declarations that we can #include into the generated .hc files.
+ */
+#if IN_STG_CODE
+#define PRELUDE_INFO(i) extern W_(i)[]
+#define PRELUDE_CLOSURE(i) extern W_(i)[]
+#else
+#define PRELUDE_INFO(i) extern DLL_IMPORT const StgInfoTable i
+#define PRELUDE_CLOSURE(i) extern DLL_IMPORT StgClosure i
+#endif
+
+/* Define canonical names so we can abstract away from the actual
+ * modules these names are defined in.
+ */
+
+PRELUDE_CLOSURE(GHCziBase_True_closure);
+PRELUDE_CLOSURE(GHCziBase_False_closure);
+PRELUDE_CLOSURE(GHCziPack_unpackCString_closure);
+PRELUDE_CLOSURE(GHCziWeak_runFinalizzerBatch_closure);
+
+#ifdef IN_STG_CODE
+extern W_ ZCMain_main_closure[];
+#else
+extern StgClosure ZCMain_main_closure;
+#endif
+
+PRELUDE_CLOSURE(GHCziIOBase_stackOverflow_closure);
+PRELUDE_CLOSURE(GHCziIOBase_heapOverflow_closure);
+PRELUDE_CLOSURE(GHCziIOBase_BlockedOnDeadMVar_closure);
+PRELUDE_CLOSURE(GHCziIOBase_BlockedIndefinitely_closure);
+PRELUDE_CLOSURE(GHCziIOBase_NonTermination_closure);
+PRELUDE_CLOSURE(GHCziIOBase_NestedAtomically_closure);
+
+PRELUDE_INFO(GHCziBase_Czh_static_info);
+PRELUDE_INFO(GHCziBase_Izh_static_info);
+PRELUDE_INFO(GHCziFloat_Fzh_static_info);
+PRELUDE_INFO(GHCziFloat_Dzh_static_info);
+PRELUDE_INFO(Addr_Azh_static_info);
+PRELUDE_INFO(GHCziPtr_Ptr_static_info);
+PRELUDE_INFO(GHCziPtr_FunPtr_static_info);
+PRELUDE_INFO(GHCziInt_I8zh_static_info);
+PRELUDE_INFO(GHCziInt_I16zh_static_info);
+PRELUDE_INFO(GHCziInt_I32zh_static_info);
+PRELUDE_INFO(GHCziInt_I64zh_static_info);
+PRELUDE_INFO(GHCziWord_Wzh_static_info);
+PRELUDE_INFO(GHCziWord_W8zh_static_info);
+PRELUDE_INFO(GHCziWord_W16zh_static_info);
+PRELUDE_INFO(GHCziWord_W32zh_static_info);
+PRELUDE_INFO(GHCziWord_W64zh_static_info);
+PRELUDE_INFO(GHCziBase_Czh_con_info);
+PRELUDE_INFO(GHCziBase_Izh_con_info);
+PRELUDE_INFO(GHCziFloat_Fzh_con_info);
+PRELUDE_INFO(GHCziFloat_Dzh_con_info);
+PRELUDE_INFO(GHCziPtr_Ptr_con_info);
+PRELUDE_INFO(GHCziPtr_FunPtr_con_info);
+PRELUDE_INFO(Addr_Azh_con_info);
+PRELUDE_INFO(GHCziWord_Wzh_con_info);
+PRELUDE_INFO(GHCziInt_I8zh_con_info);
+PRELUDE_INFO(GHCziInt_I16zh_con_info);
+PRELUDE_INFO(GHCziInt_I32zh_con_info);
+PRELUDE_INFO(GHCziInt_I64zh_con_info);
+PRELUDE_INFO(GHCziWord_W8zh_con_info);
+PRELUDE_INFO(GHCziWord_W16zh_con_info);
+PRELUDE_INFO(GHCziWord_W32zh_con_info);
+PRELUDE_INFO(GHCziWord_W64zh_con_info);
+PRELUDE_INFO(GHCziStable_StablePtr_static_info);
+PRELUDE_INFO(GHCziStable_StablePtr_con_info);
+
+#define True_closure (&GHCziBase_True_closure)
+#define False_closure (&GHCziBase_False_closure)
+#define unpackCString_closure (&GHCziPack_unpackCString_closure)
+#define runFinalizerBatch_closure (&GHCziWeak_runFinalizzerBatch_closure)
+#define mainIO_closure (&ZCMain_main_closure)
+
+#define stackOverflow_closure (&GHCziIOBase_stackOverflow_closure)
+#define heapOverflow_closure (&GHCziIOBase_heapOverflow_closure)
+#define BlockedOnDeadMVar_closure (&GHCziIOBase_BlockedOnDeadMVar_closure)
+#define BlockedIndefinitely_closure (&GHCziIOBase_BlockedIndefinitely_closure)
+#define NonTermination_closure (&GHCziIOBase_NonTermination_closure)
+#define NestedAtomically_closure (&GHCziIOBase_NestedAtomically_closure)
+
+#define Czh_static_info (&GHCziBase_Czh_static_info)
+#define Fzh_static_info (&GHCziFloat_Fzh_static_info)
+#define Dzh_static_info (&GHCziFloat_Dzh_static_info)
+#define Azh_static_info (&Addr_Azh_static_info)
+#define Izh_static_info (&GHCziBase_Izh_static_info)
+#define I8zh_static_info (&GHCziInt_I8zh_static_info)
+#define I16zh_static_info (&GHCziInt_I16zh_static_info)
+#define I32zh_static_info (&GHCziInt_I32zh_static_info)
+#define I64zh_static_info (&GHCziInt_I64zh_static_info)
+#define Wzh_static_info (&GHCziWord_Wzh_static_info)
+#define W8zh_static_info (&GHCziWord_W8zh_static_info)
+#define W16zh_static_info (&GHCziWord_W16zh_static_info)
+#define W32zh_static_info (&GHCziWord_W32zh_static_info)
+#define W64zh_static_info (&GHCziWord_W64zh_static_info)
+#define Ptr_static_info (&GHCziPtr_Ptr_static_info)
+#define FunPtr_static_info (&GHCziPtr_FunPtr_static_info)
+#define Czh_con_info (&GHCziBase_Czh_con_info)
+#define Izh_con_info (&GHCziBase_Izh_con_info)
+#define Fzh_con_info (&GHCziFloat_Fzh_con_info)
+#define Dzh_con_info (&GHCziFloat_Dzh_con_info)
+#define Azh_con_info (&Addr_Azh_con_info)
+#define Wzh_con_info (&GHCziWord_Wzh_con_info)
+#define W8zh_con_info (&GHCziWord_W8zh_con_info)
+#define W16zh_con_info (&GHCziWord_W16zh_con_info)
+#define W32zh_con_info (&GHCziWord_W32zh_con_info)
+#define W64zh_con_info (&GHCziWord_W64zh_con_info)
+#define I8zh_con_info (&GHCziInt_I8zh_con_info)
+#define I16zh_con_info (&GHCziInt_I16zh_con_info)
+#define I32zh_con_info (&GHCziInt_I32zh_con_info)
+#define I64zh_con_info (&GHCziInt_I64zh_con_info)
+
+#define Ptr_con_info (&GHCziPtr_Ptr_con_info)
+#define FunPtr_con_info (&GHCziPtr_FunPtr_con_info)
+#define StablePtr_static_info (&GHCziStable_StablePtr_static_info)
+#define StablePtr_con_info (&GHCziStable_StablePtr_con_info)
+
+#endif /* PRELUDE_H */
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
new file mode 100644
index 0000000000..f1c214e304
--- /dev/null
+++ b/rts/PrimOps.cmm
@@ -0,0 +1,2106 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Out-of-line primitive operations
+ *
+ * This file contains the implementations of all the primitive
+ * operations ("primops") which are not expanded inline. See
+ * ghc/compiler/prelude/primops.txt.pp for a list of all the primops;
+ * this file contains code for most of those with the attribute
+ * out_of_line=True.
+ *
+ * Entry convention: the entry convention for a primop is that all the
+ * args are in Stg registers (R1, R2, etc.). This is to make writing
+ * the primops easier. (see compiler/codeGen/CgCallConv.hs).
+ *
+ * Return convention: results from a primop are generally returned
+ * using the ordinary unboxed tuple return convention. The C-- parser
+ * implements the RET_xxxx() macros to perform unboxed-tuple returns
+ * based on the prevailing return convention.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/*-----------------------------------------------------------------------------
+ Array Primitives
+
+ Basically just new*Array - the others are all inline macros.
+
+ The size arg is always passed in R1, and the result returned in R1.
+
+ The slow entry point is for returning from a heap check, the saved
+ size argument must be re-loaded from the stack.
+ -------------------------------------------------------------------------- */
+
+/* for objects that are *less* than the size of a word, make sure we
+ * round up to the nearest word for the size of the array.
+ */
+
+newByteArrayzh_fast
+{
+ W_ words, payload_words, n, p;
+ MAYBE_GC(NO_PTRS,newByteArrayzh_fast);
+ n = R1;
+ payload_words = ROUNDUP_BYTES_TO_WDS(n);
+ words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
+ "ptr" p = foreign "C" allocateLocal(MyCapability() "ptr",words) [];
+ TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = payload_words;
+ RET_P(p);
+}
+
+newPinnedByteArrayzh_fast
+{
+ W_ words, payload_words, n, p;
+
+ MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast);
+ n = R1;
+ payload_words = ROUNDUP_BYTES_TO_WDS(n);
+
+ // We want an 8-byte aligned array. allocatePinned() gives us
+ // 8-byte aligned memory by default, but we want to align the
+ // *goods* inside the ArrWords object, so we have to check the
+ // size of the ArrWords header and adjust our size accordingly.
+ words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
+ if ((SIZEOF_StgArrWords & 7) != 0) {
+ words = words + 1;
+ }
+
+ "ptr" p = foreign "C" allocatePinned(words) [];
+ TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
+
+ // Again, if the ArrWords header isn't a multiple of 8 bytes, we
+ // have to push the object forward one word so that the goods
+ // fall on an 8-byte boundary.
+ if ((SIZEOF_StgArrWords & 7) != 0) {
+ p = p + WDS(1);
+ }
+
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = payload_words;
+ RET_P(p);
+}
+
+newArrayzh_fast
+{
+ W_ words, n, init, arr, p;
+ /* Args: R1 = words, R2 = initialisation value */
+
+ n = R1;
+ MAYBE_GC(R2_PTR,newArrayzh_fast);
+
+ words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n;
+ "ptr" arr = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2];
+ TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
+
+ SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
+ StgMutArrPtrs_ptrs(arr) = n;
+
+ // Initialise all elements of the array with the value in R2
+ init = R2;
+ p = arr + SIZEOF_StgMutArrPtrs;
+ for:
+ if (p < arr + WDS(words)) {
+ W_[p] = init;
+ p = p + WDS(1);
+ goto for;
+ }
+
+ RET_P(arr);
+}
+
+unsafeThawArrayzh_fast
+{
+ // SUBTLETY TO DO WITH THE OLD GEN MUTABLE LIST
+ //
+ // A MUT_ARR_PTRS lives on the mutable list, but a MUT_ARR_PTRS_FROZEN
+ // normally doesn't. However, when we freeze a MUT_ARR_PTRS, we leave
+ // it on the mutable list for the GC to remove (removing something from
+ // the mutable list is not easy, because the mut_list is only singly-linked).
+ //
+ // So that we can tell whether a MUT_ARR_PTRS_FROZEN is on the mutable list,
+ // when we freeze it we set the info ptr to be MUT_ARR_PTRS_FROZEN0
+ // to indicate that it is still on the mutable list.
+ //
+ // So, when we thaw a MUT_ARR_PTRS_FROZEN, we must cope with two cases:
+ // either it is on a mut_list, or it isn't. We adopt the convention that
+ // the closure type is MUT_ARR_PTRS_FROZEN0 if it is on the mutable list,
+ // and MUT_ARR_PTRS_FROZEN otherwise. In fact it wouldn't matter if
+ // we put it on the mutable list more than once, but it would get scavenged
+ // multiple times during GC, which would be unnecessarily slow.
+ //
+ if (StgHeader_info(R1) != stg_MUT_ARR_PTRS_FROZEN0_info) {
+ SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
+ foreign "C" recordMutableLock(R1 "ptr") [R1];
+ // must be done after SET_INFO, because it ASSERTs closure_MUTABLE()
+ RET_P(R1);
+ } else {
+ SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
+ RET_P(R1);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ MutVar primitives
+ -------------------------------------------------------------------------- */
+
+newMutVarzh_fast
+{
+ W_ mv;
+ /* Args: R1 = initialisation value */
+
+ ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, newMutVarzh_fast);
+
+ mv = Hp - SIZEOF_StgMutVar + WDS(1);
+ SET_HDR(mv,stg_MUT_VAR_DIRTY_info,W_[CCCS]);
+ StgMutVar_var(mv) = R1;
+
+ RET_P(mv);
+}
+
+atomicModifyMutVarzh_fast
+{
+ W_ mv, z, x, y, r;
+ /* Args: R1 :: MutVar#, R2 :: a -> (a,b) */
+
+ /* If x is the current contents of the MutVar#, then
+ We want to make the new contents point to
+
+ (sel_0 (f x))
+
+ and the return value is
+
+ (sel_1 (f x))
+
+ obviously we can share (f x).
+
+ z = [stg_ap_2 f x] (max (HS + 2) MIN_UPD_SIZE)
+ y = [stg_sel_0 z] (max (HS + 1) MIN_UPD_SIZE)
+ r = [stg_sel_1 z] (max (HS + 1) MIN_UPD_SIZE)
+ */
+
+#if MIN_UPD_SIZE > 1
+#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
+#define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),WDS(MIN_UPD_SIZE-1))
+#else
+#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(1))
+#define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),0)
+#endif
+
+#if MIN_UPD_SIZE > 2
+#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
+#define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),WDS(MIN_UPD_SIZE-2))
+#else
+#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(2))
+#define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),0)
+#endif
+
+#define SIZE (THUNK_2_SIZE + THUNK_1_SIZE + THUNK_1_SIZE)
+
+ HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, atomicModifyMutVarzh_fast);
+
+#if defined(THREADED_RTS)
+ foreign "C" ACQUIRE_LOCK(atomic_modify_mutvar_mutex "ptr") [R1,R2];
+#endif
+
+ x = StgMutVar_var(R1);
+
+ TICK_ALLOC_THUNK_2();
+ CCCS_ALLOC(THUNK_2_SIZE);
+ z = Hp - THUNK_2_SIZE + WDS(1);
+ SET_HDR(z, stg_ap_2_upd_info, W_[CCCS]);
+ LDV_RECORD_CREATE(z);
+ StgThunk_payload(z,0) = R2;
+ StgThunk_payload(z,1) = x;
+
+ TICK_ALLOC_THUNK_1();
+ CCCS_ALLOC(THUNK_1_SIZE);
+ y = z - THUNK_1_SIZE;
+ SET_HDR(y, stg_sel_0_upd_info, W_[CCCS]);
+ LDV_RECORD_CREATE(y);
+ StgThunk_payload(y,0) = z;
+
+ StgMutVar_var(R1) = y;
+ foreign "C" dirty_MUT_VAR(BaseReg "ptr", R1 "ptr") [R1];
+
+ TICK_ALLOC_THUNK_1();
+ CCCS_ALLOC(THUNK_1_SIZE);
+ r = y - THUNK_1_SIZE;
+ SET_HDR(r, stg_sel_1_upd_info, W_[CCCS]);
+ LDV_RECORD_CREATE(r);
+ StgThunk_payload(r,0) = z;
+
+#if defined(THREADED_RTS)
+ foreign "C" RELEASE_LOCK(atomic_modify_mutvar_mutex "ptr") [];
+#endif
+
+ RET_P(r);
+}
+
+/* -----------------------------------------------------------------------------
+ Weak Pointer Primitives
+ -------------------------------------------------------------------------- */
+
+STRING(stg_weak_msg,"New weak pointer at %p\n")
+
+mkWeakzh_fast
+{
+ /* R1 = key
+ R2 = value
+ R3 = finalizer (or NULL)
+ */
+ W_ w;
+
+ if (R3 == NULL) {
+ R3 = stg_NO_FINALIZER_closure;
+ }
+
+ ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, mkWeakzh_fast );
+
+ w = Hp - SIZEOF_StgWeak + WDS(1);
+ SET_HDR(w, stg_WEAK_info, W_[CCCS]);
+
+ StgWeak_key(w) = R1;
+ StgWeak_value(w) = R2;
+ StgWeak_finalizer(w) = R3;
+
+ StgWeak_link(w) = W_[weak_ptr_list];
+ W_[weak_ptr_list] = w;
+
+ IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
+
+ RET_P(w);
+}
+
+
+finalizzeWeakzh_fast
+{
+ /* R1 = weak ptr
+ */
+ W_ w, f;
+
+ w = R1;
+
+ // already dead?
+ if (GET_INFO(w) == stg_DEAD_WEAK_info) {
+ RET_NP(0,stg_NO_FINALIZER_closure);
+ }
+
+ // kill it
+#ifdef PROFILING
+ // @LDV profiling
+ // A weak pointer is inherently used, so we do not need to call
+ // LDV_recordDead_FILL_SLOP_DYNAMIC():
+ // LDV_recordDead_FILL_SLOP_DYNAMIC((StgClosure *)w);
+ // or, LDV_recordDead():
+ // LDV_recordDead((StgClosure *)w, sizeofW(StgWeak) - sizeofW(StgProfHeader));
+ // Furthermore, when PROFILING is turned on, dead weak pointers are exactly as
+ // large as weak pointers, so there is no need to fill the slop, either.
+ // See stg_DEAD_WEAK_info in StgMiscClosures.hc.
+#endif
+
+ //
+ // Todo: maybe use SET_HDR() and remove LDV_recordCreate()?
+ //
+ SET_INFO(w,stg_DEAD_WEAK_info);
+ LDV_RECORD_CREATE(w);
+
+ f = StgWeak_finalizer(w);
+ StgDeadWeak_link(w) = StgWeak_link(w);
+
+ /* return the finalizer */
+ if (f == stg_NO_FINALIZER_closure) {
+ RET_NP(0,stg_NO_FINALIZER_closure);
+ } else {
+ RET_NP(1,f);
+ }
+}
+
+deRefWeakzh_fast
+{
+ /* R1 = weak ptr */
+ W_ w, code, val;
+
+ w = R1;
+ if (GET_INFO(w) == stg_WEAK_info) {
+ code = 1;
+ val = StgWeak_value(w);
+ } else {
+ code = 0;
+ val = w;
+ }
+ RET_NP(code,val);
+}
+
+/* -----------------------------------------------------------------------------
+ Arbitrary-precision Integer operations.
+
+ There are some assumptions in this code that mp_limb_t == W_. This is
+ the case for all the platforms that GHC supports, currently.
+ -------------------------------------------------------------------------- */
+
+int2Integerzh_fast
+{
+ /* arguments: R1 = Int# */
+
+ W_ val, s, p; /* to avoid aliasing */
+
+ val = R1;
+ ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, int2Integerzh_fast );
+
+ p = Hp - SIZEOF_StgArrWords;
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = 1;
+
+ /* mpz_set_si is inlined here, makes things simpler */
+ if (%lt(val,0)) {
+ s = -1;
+ Hp(0) = -val;
+ } else {
+ if (%gt(val,0)) {
+ s = 1;
+ Hp(0) = val;
+ } else {
+ s = 0;
+ }
+ }
+
+ /* returns (# size :: Int#,
+ data :: ByteArray#
+ #)
+ */
+ RET_NP(s,p);
+}
+
+word2Integerzh_fast
+{
+ /* arguments: R1 = Word# */
+
+ W_ val, s, p; /* to avoid aliasing */
+
+ val = R1;
+
+ ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, word2Integerzh_fast);
+
+ p = Hp - SIZEOF_StgArrWords;
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = 1;
+
+ if (val != 0) {
+ s = 1;
+ W_[Hp] = val;
+ } else {
+ s = 0;
+ }
+
+ /* returns (# size :: Int#,
+ data :: ByteArray# #)
+ */
+ RET_NP(s,p);
+}
+
+
+/*
+ * 'long long' primops for converting to/from Integers.
+ */
+
+#ifdef SUPPORT_LONG_LONGS
+
+int64ToIntegerzh_fast
+{
+ /* arguments: L1 = Int64# */
+
+ L_ val;
+ W_ hi, s, neg, words_needed, p;
+
+ val = L1;
+ neg = 0;
+
+ if ( %ge(val,0x100000000::L_) || %le(val,-0x100000000::L_) ) {
+ words_needed = 2;
+ } else {
+ // minimum is one word
+ words_needed = 1;
+ }
+
+ ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
+ NO_PTRS, int64ToIntegerzh_fast );
+
+ p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = words_needed;
+
+ if ( %lt(val,0::L_) ) {
+ neg = 1;
+ val = -val;
+ }
+
+ hi = TO_W_(val >> 32);
+
+ if ( words_needed == 2 ) {
+ s = 2;
+ Hp(-1) = TO_W_(val);
+ Hp(0) = hi;
+ } else {
+ if ( val != 0::L_ ) {
+ s = 1;
+ Hp(0) = TO_W_(val);
+ } else /* val==0 */ {
+ s = 0;
+ }
+ }
+ if ( neg != 0 ) {
+ s = -s;
+ }
+
+ /* returns (# size :: Int#,
+ data :: ByteArray# #)
+ */
+ RET_NP(s,p);
+}
+
+word64ToIntegerzh_fast
+{
+ /* arguments: L1 = Word64# */
+
+ L_ val;
+ W_ hi, s, words_needed, p;
+
+ val = L1;
+ if ( val >= 0x100000000::L_ ) {
+ words_needed = 2;
+ } else {
+ words_needed = 1;
+ }
+
+ ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
+ NO_PTRS, word64ToIntegerzh_fast );
+
+ p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = words_needed;
+
+ hi = TO_W_(val >> 32);
+ if ( val >= 0x100000000::L_ ) {
+ s = 2;
+ Hp(-1) = TO_W_(val);
+ Hp(0) = hi;
+ } else {
+ if ( val != 0::L_ ) {
+ s = 1;
+ Hp(0) = TO_W_(val);
+ } else /* val==0 */ {
+ s = 0;
+ }
+ }
+
+ /* returns (# size :: Int#,
+ data :: ByteArray# #)
+ */
+ RET_NP(s,p);
+}
+
+
+#endif /* SUPPORT_LONG_LONGS */
+
+/* ToDo: this is shockingly inefficient */
+
+#ifndef THREADED_RTS
+section "bss" {
+ mp_tmp1:
+ bits8 [SIZEOF_MP_INT];
+}
+
+section "bss" {
+ mp_tmp2:
+ bits8 [SIZEOF_MP_INT];
+}
+
+section "bss" {
+ mp_result1:
+ bits8 [SIZEOF_MP_INT];
+}
+
+section "bss" {
+ mp_result2:
+ bits8 [SIZEOF_MP_INT];
+}
+#endif
+
+#ifdef THREADED_RTS
+#define FETCH_MP_TEMP(X) \
+W_ X; \
+X = BaseReg + (OFFSET_StgRegTable_r ## X);
+#else
+#define FETCH_MP_TEMP(X) /* Nothing */
+#endif
+
+#define GMP_TAKE2_RET1(name,mp_fun) \
+name \
+{ \
+ CInt s1, s2; \
+ W_ d1, d2; \
+ FETCH_MP_TEMP(mp_tmp1); \
+ FETCH_MP_TEMP(mp_tmp2); \
+ FETCH_MP_TEMP(mp_result1) \
+ FETCH_MP_TEMP(mp_result2); \
+ \
+ /* call doYouWantToGC() */ \
+ MAYBE_GC(R2_PTR & R4_PTR, name); \
+ \
+ s1 = W_TO_INT(R1); \
+ d1 = R2; \
+ s2 = W_TO_INT(R3); \
+ d2 = R4; \
+ \
+ MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1)); \
+ MP_INT__mp_size(mp_tmp1) = (s1); \
+ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(d1); \
+ MP_INT__mp_alloc(mp_tmp2) = W_TO_INT(StgArrWords_words(d2)); \
+ MP_INT__mp_size(mp_tmp2) = (s2); \
+ MP_INT__mp_d(mp_tmp2) = BYTE_ARR_CTS(d2); \
+ \
+ foreign "C" mpz_init(mp_result1 "ptr") []; \
+ \
+ /* Perform the operation */ \
+ foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1 "ptr",mp_tmp2 "ptr") []; \
+ \
+ RET_NP(TO_W_(MP_INT__mp_size(mp_result1)), \
+ MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords); \
+}
+
+#define GMP_TAKE1_RET1(name,mp_fun) \
+name \
+{ \
+ CInt s1; \
+ W_ d1; \
+ FETCH_MP_TEMP(mp_tmp1); \
+ FETCH_MP_TEMP(mp_result1) \
+ \
+ /* call doYouWantToGC() */ \
+ MAYBE_GC(R2_PTR, name); \
+ \
+ d1 = R2; \
+ s1 = W_TO_INT(R1); \
+ \
+ MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1)); \
+ MP_INT__mp_size(mp_tmp1) = (s1); \
+ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(d1); \
+ \
+ foreign "C" mpz_init(mp_result1 "ptr") []; \
+ \
+ /* Perform the operation */ \
+ foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1 "ptr") []; \
+ \
+ RET_NP(TO_W_(MP_INT__mp_size(mp_result1)), \
+ MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords); \
+}
+
+#define GMP_TAKE2_RET2(name,mp_fun) \
+name \
+{ \
+ CInt s1, s2; \
+ W_ d1, d2; \
+ FETCH_MP_TEMP(mp_tmp1); \
+ FETCH_MP_TEMP(mp_tmp2); \
+ FETCH_MP_TEMP(mp_result1) \
+ FETCH_MP_TEMP(mp_result2) \
+ \
+ /* call doYouWantToGC() */ \
+ MAYBE_GC(R2_PTR & R4_PTR, name); \
+ \
+ s1 = W_TO_INT(R1); \
+ d1 = R2; \
+ s2 = W_TO_INT(R3); \
+ d2 = R4; \
+ \
+ MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1)); \
+ MP_INT__mp_size(mp_tmp1) = (s1); \
+ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(d1); \
+ MP_INT__mp_alloc(mp_tmp2) = W_TO_INT(StgArrWords_words(d2)); \
+ MP_INT__mp_size(mp_tmp2) = (s2); \
+ MP_INT__mp_d(mp_tmp2) = BYTE_ARR_CTS(d2); \
+ \
+ foreign "C" mpz_init(mp_result1 "ptr") []; \
+ foreign "C" mpz_init(mp_result2 "ptr") []; \
+ \
+ /* Perform the operation */ \
+ foreign "C" mp_fun(mp_result1 "ptr",mp_result2 "ptr",mp_tmp1 "ptr",mp_tmp2 "ptr") []; \
+ \
+ RET_NPNP(TO_W_(MP_INT__mp_size(mp_result1)), \
+ MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords, \
+ TO_W_(MP_INT__mp_size(mp_result2)), \
+ MP_INT__mp_d(mp_result2) - SIZEOF_StgArrWords); \
+}
+
+GMP_TAKE2_RET1(plusIntegerzh_fast, mpz_add)
+GMP_TAKE2_RET1(minusIntegerzh_fast, mpz_sub)
+GMP_TAKE2_RET1(timesIntegerzh_fast, mpz_mul)
+GMP_TAKE2_RET1(gcdIntegerzh_fast, mpz_gcd)
+GMP_TAKE2_RET1(quotIntegerzh_fast, mpz_tdiv_q)
+GMP_TAKE2_RET1(remIntegerzh_fast, mpz_tdiv_r)
+GMP_TAKE2_RET1(divExactIntegerzh_fast, mpz_divexact)
+GMP_TAKE2_RET1(andIntegerzh_fast, mpz_and)
+GMP_TAKE2_RET1(orIntegerzh_fast, mpz_ior)
+GMP_TAKE2_RET1(xorIntegerzh_fast, mpz_xor)
+GMP_TAKE1_RET1(complementIntegerzh_fast, mpz_com)
+
+GMP_TAKE2_RET2(quotRemIntegerzh_fast, mpz_tdiv_qr)
+GMP_TAKE2_RET2(divModIntegerzh_fast, mpz_fdiv_qr)
+
+#ifndef THREADED_RTS
+section "bss" {
+ mp_tmp_w: W_; // NB. mp_tmp_w is really an mp_limb_t here
+}
+#endif
+
+gcdIntzh_fast
+{
+ /* R1 = the first Int#; R2 = the second Int# */
+ W_ r;
+ FETCH_MP_TEMP(mp_tmp_w);
+
+ W_[mp_tmp_w] = R1;
+ r = foreign "C" mpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];
+
+ R1 = r;
+ /* Result parked in R1, return via info-pointer at TOS */
+ jump %ENTRY_CODE(Sp(0));
+}
+
+
+gcdIntegerIntzh_fast
+{
+ /* R1 = s1; R2 = d1; R3 = the int */
+ R1 = foreign "C" mpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3) [];
+
+ /* Result parked in R1, return via info-pointer at TOS */
+ jump %ENTRY_CODE(Sp(0));
+}
+
+
+cmpIntegerIntzh_fast
+{
+ /* R1 = s1; R2 = d1; R3 = the int */
+ W_ usize, vsize, v_digit, u_digit;
+
+ usize = R1;
+ vsize = 0;
+ v_digit = R3;
+
+ // paraphrased from mpz_cmp_si() in the GMP sources
+ if (%gt(v_digit,0)) {
+ vsize = 1;
+ } else {
+ if (%lt(v_digit,0)) {
+ vsize = -1;
+ v_digit = -v_digit;
+ }
+ }
+
+ if (usize != vsize) {
+ R1 = usize - vsize;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ if (usize == 0) {
+ R1 = 0;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ u_digit = W_[BYTE_ARR_CTS(R2)];
+
+ if (u_digit == v_digit) {
+ R1 = 0;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ if (%gtu(u_digit,v_digit)) { // NB. unsigned: these are mp_limb_t's
+ R1 = usize;
+ } else {
+ R1 = -usize;
+ }
+
+ jump %ENTRY_CODE(Sp(0));
+}
+
+cmpIntegerzh_fast
+{
+ /* R1 = s1; R2 = d1; R3 = s2; R4 = d2 */
+ W_ usize, vsize, size, up, vp;
+ CInt cmp;
+
+ // paraphrased from mpz_cmp() in the GMP sources
+ usize = R1;
+ vsize = R3;
+
+ if (usize != vsize) {
+ R1 = usize - vsize;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ if (usize == 0) {
+ R1 = 0;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ if (%lt(usize,0)) { // NB. not <, which is unsigned
+ size = -usize;
+ } else {
+ size = usize;
+ }
+
+ up = BYTE_ARR_CTS(R2);
+ vp = BYTE_ARR_CTS(R4);
+
+ cmp = foreign "C" mpn_cmp(up "ptr", vp "ptr", size) [];
+
+ if (cmp == 0 :: CInt) {
+ R1 = 0;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ if (%lt(cmp,0 :: CInt) == %lt(usize,0)) {
+ R1 = 1;
+ } else {
+ R1 = (-1);
+ }
+ /* Result parked in R1, return via info-pointer at TOS */
+ jump %ENTRY_CODE(Sp(0));
+}
+
+integer2Intzh_fast
+{
+ /* R1 = s; R2 = d */
+ W_ r, s;
+
+ s = R1;
+ if (s == 0) {
+ r = 0;
+ } else {
+ r = W_[R2 + SIZEOF_StgArrWords];
+ if (%lt(s,0)) {
+ r = -r;
+ }
+ }
+ /* Result parked in R1, return via info-pointer at TOS */
+ R1 = r;
+ jump %ENTRY_CODE(Sp(0));
+}
+
+integer2Wordzh_fast
+{
+ /* R1 = s; R2 = d */
+ W_ r, s;
+
+ s = R1;
+ if (s == 0) {
+ r = 0;
+ } else {
+ r = W_[R2 + SIZEOF_StgArrWords];
+ if (%lt(s,0)) {
+ r = -r;
+ }
+ }
+ /* Result parked in R1, return via info-pointer at TOS */
+ R1 = r;
+ jump %ENTRY_CODE(Sp(0));
+}
+
+decodeFloatzh_fast
+{
+ W_ p;
+ F_ arg;
+ FETCH_MP_TEMP(mp_tmp1);
+ FETCH_MP_TEMP(mp_tmp_w);
+
+ /* arguments: F1 = Float# */
+ arg = F1;
+
+ ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, decodeFloatzh_fast );
+
+ /* Be prepared to tell Lennart-coded __decodeFloat
+ where mantissa._mp_d can be put (it does not care about the rest) */
+ p = Hp - SIZEOF_StgArrWords;
+ SET_HDR(p,stg_ARR_WORDS_info,W_[CCCS]);
+ StgArrWords_words(p) = 1;
+ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
+
+ /* Perform the operation */
+ foreign "C" __decodeFloat(mp_tmp1 "ptr",mp_tmp_w "ptr" ,arg) [];
+
+ /* returns: (Int# (expn), Int#, ByteArray#) */
+ RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
+}
+
+#define DOUBLE_MANTISSA_SIZE SIZEOF_DOUBLE
+#define ARR_SIZE (SIZEOF_StgArrWords + DOUBLE_MANTISSA_SIZE)
+
+decodeDoublezh_fast
+{
+ D_ arg;
+ W_ p;
+ FETCH_MP_TEMP(mp_tmp1);
+ FETCH_MP_TEMP(mp_tmp_w);
+
+ /* arguments: D1 = Double# */
+ arg = D1;
+
+ ALLOC_PRIM( ARR_SIZE, NO_PTRS, decodeDoublezh_fast );
+
+ /* Be prepared to tell Lennart-coded __decodeDouble
+ where mantissa.d can be put (it does not care about the rest) */
+ p = Hp - ARR_SIZE + WDS(1);
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = BYTES_TO_WDS(DOUBLE_MANTISSA_SIZE);
+ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
+
+ /* Perform the operation */
+ foreign "C" __decodeDouble(mp_tmp1 "ptr", mp_tmp_w "ptr",arg) [];
+
+ /* returns: (Int# (expn), Int#, ByteArray#) */
+ RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
+}
+
+/* -----------------------------------------------------------------------------
+ * Concurrency primitives
+ * -------------------------------------------------------------------------- */
+
+forkzh_fast
+{
+ /* args: R1 = closure to spark */
+
+ MAYBE_GC(R1_PTR, forkzh_fast);
+
+ W_ closure;
+ W_ threadid;
+ closure = R1;
+
+ "ptr" threadid = foreign "C" createIOThread( MyCapability() "ptr",
+ RtsFlags_GcFlags_initialStkSize(RtsFlags),
+ closure "ptr") [];
+ foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];
+
+ // switch at the earliest opportunity
+ CInt[context_switch] = 1 :: CInt;
+
+ RET_P(threadid);
+}
+
+forkOnzh_fast
+{
+ /* args: R1 = cpu, R2 = closure to spark */
+
+ MAYBE_GC(R2_PTR, forkOnzh_fast);
+
+ W_ cpu;
+ W_ closure;
+ W_ threadid;
+ cpu = R1;
+ closure = R2;
+
+ "ptr" threadid = foreign "C" createIOThread( MyCapability() "ptr",
+ RtsFlags_GcFlags_initialStkSize(RtsFlags),
+ closure "ptr") [];
+ foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];
+
+ // switch at the earliest opportunity
+ CInt[context_switch] = 1 :: CInt;
+
+ RET_P(threadid);
+}
+
+yieldzh_fast
+{
+ jump stg_yield_noregs;
+}
+
+myThreadIdzh_fast
+{
+ /* no args. */
+ RET_P(CurrentTSO);
+}
+
+labelThreadzh_fast
+{
+ /* args:
+ R1 = ThreadId#
+ R2 = Addr# */
+#ifdef DEBUG
+ foreign "C" labelThread(R1 "ptr", R2 "ptr") [];
+#endif
+ jump %ENTRY_CODE(Sp(0));
+}
+
+isCurrentThreadBoundzh_fast
+{
+ /* no args */
+ W_ r;
+ r = foreign "C" isThreadBound(CurrentTSO) [];
+ RET_N(r);
+}
+
+
+/* -----------------------------------------------------------------------------
+ * TVar primitives
+ * -------------------------------------------------------------------------- */
+
+#ifdef REG_R1
+#define SP_OFF 0
+#define IF_NOT_REG_R1(x)
+#else
+#define SP_OFF 1
+#define IF_NOT_REG_R1(x) x
+#endif
+
+// Catch retry frame ------------------------------------------------------------
+
+#define CATCH_RETRY_FRAME_ERROR(label) \
+ label { foreign "C" barf("catch_retry_frame incorrectly entered!"); }
+
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_0_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_1_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_2_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_3_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_4_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_5_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_6_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_7_ret)
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_catch_retry_frame too.
+#endif
+
+#if defined(PROFILING)
+#define CATCH_RETRY_FRAME_BITMAP 7
+#define CATCH_RETRY_FRAME_WORDS 6
+#else
+#define CATCH_RETRY_FRAME_BITMAP 1
+#define CATCH_RETRY_FRAME_WORDS 4
+#endif
+
+INFO_TABLE_RET(stg_catch_retry_frame,
+ CATCH_RETRY_FRAME_WORDS, CATCH_RETRY_FRAME_BITMAP,
+ CATCH_RETRY_FRAME,
+ stg_catch_retry_frame_0_ret,
+ stg_catch_retry_frame_1_ret,
+ stg_catch_retry_frame_2_ret,
+ stg_catch_retry_frame_3_ret,
+ stg_catch_retry_frame_4_ret,
+ stg_catch_retry_frame_5_ret,
+ stg_catch_retry_frame_6_ret,
+ stg_catch_retry_frame_7_ret)
+{
+ W_ r, frame, trec, outer;
+ IF_NOT_REG_R1(W_ rval; rval = Sp(0); Sp_adj(1); )
+
+ frame = Sp;
+ trec = StgTSO_trec(CurrentTSO);
+ "ptr" outer = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+ r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
+ if (r) {
+ /* Succeeded (either first branch or second branch) */
+ StgTSO_trec(CurrentTSO) = outer;
+ Sp = Sp + SIZEOF_StgCatchRetryFrame;
+ IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
+ jump %ENTRY_CODE(Sp(SP_OFF));
+ } else {
+ /* Did not commit: retry */
+ W_ new_trec;
+ "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+ StgTSO_trec(CurrentTSO) = new_trec;
+ if (StgCatchRetryFrame_running_alt_code(frame)) {
+ R1 = StgCatchRetryFrame_alt_code(frame);
+ } else {
+ R1 = StgCatchRetryFrame_first_code(frame);
+ StgCatchRetryFrame_first_code_trec(frame) = new_trec;
+ }
+ jump stg_ap_v_fast;
+ }
+}
+
+
+// Atomically frame -------------------------------------------------------------
+
+
+#define ATOMICALLY_FRAME_ERROR(label) \
+ label { foreign "C" barf("atomically_frame incorrectly entered!"); }
+
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_0_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_1_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_2_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_3_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_4_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_5_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_6_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_7_ret)
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_atomically_frame too.
+#endif
+
+#if defined(PROFILING)
+#define ATOMICALLY_FRAME_BITMAP 3
+#define ATOMICALLY_FRAME_WORDS 3
+#else
+#define ATOMICALLY_FRAME_BITMAP 0
+#define ATOMICALLY_FRAME_WORDS 1
+#endif
+
+
+/* Return frame pushed by atomicallyzh_fast: entered when the STM
+ * computation finishes normally.  Tries to commit the current
+ * transaction; on success the frame is popped and the result is
+ * returned, otherwise a fresh top-level transaction is started and
+ * the stored code is re-run from scratch.  This entry is used when
+ * the TSO is NOT blocked waiting on TVars (see the _waiting_ variant
+ * below for that case). */
+INFO_TABLE_RET(stg_atomically_frame,
+               ATOMICALLY_FRAME_WORDS, ATOMICALLY_FRAME_BITMAP,
+               ATOMICALLY_FRAME,
+               stg_atomically_frame_0_ret,
+               stg_atomically_frame_1_ret,
+               stg_atomically_frame_2_ret,
+               stg_atomically_frame_3_ret,
+               stg_atomically_frame_4_ret,
+               stg_atomically_frame_5_ret,
+               stg_atomically_frame_6_ret,
+               stg_atomically_frame_7_ret)
+{
+  W_ frame, trec, valid;
+  /* In the unregisterised build the return value lives on the stack,
+   * not in R1: pop it here and push it back before returning. */
+  IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
+
+  frame = Sp;
+  trec = StgTSO_trec(CurrentTSO);
+
+  /* The TSO is not currently waiting: try to commit the transaction */
+  valid = foreign "C" stmCommitTransaction(MyCapability() "ptr", trec "ptr") [];
+  if (valid) {
+    /* Transaction was valid: commit succeeded */
+    StgTSO_trec(CurrentTSO) = NO_TREC;
+    Sp = Sp + SIZEOF_StgAtomicallyFrame;
+    IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
+    jump %ENTRY_CODE(Sp(SP_OFF));
+  } else {
+    /* Transaction was not valid: try again */
+    "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
+    StgTSO_trec(CurrentTSO) = trec;
+    R1 = StgAtomicallyFrame_code(frame);
+    jump stg_ap_v_fast;
+  }
+}
+
+/* Variant of the atomically frame used while the TSO is blocked
+ * waiting for one of the transaction's TVars to be updated (after a
+ * retry#).  On wake-up, stmReWait decides whether the previous
+ * attempt is still valid: if so we simply block again; if not, the
+ * frame's info pointer is switched back to stg_atomically_frame_info
+ * and the transaction is re-run. */
+INFO_TABLE_RET(stg_atomically_waiting_frame,
+               ATOMICALLY_FRAME_WORDS, ATOMICALLY_FRAME_BITMAP,
+               ATOMICALLY_FRAME,
+               stg_atomically_frame_0_ret,
+               stg_atomically_frame_1_ret,
+               stg_atomically_frame_2_ret,
+               stg_atomically_frame_3_ret,
+               stg_atomically_frame_4_ret,
+               stg_atomically_frame_5_ret,
+               stg_atomically_frame_6_ret,
+               stg_atomically_frame_7_ret)
+{
+  W_ frame, trec, valid;
+  IF_NOT_REG_R1(W_ rval;  rval = Sp(0);  Sp_adj(1); )
+
+  frame = Sp;
+
+  /* The TSO is currently waiting: should we stop waiting? */
+  valid = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
+  if (valid) {
+    /* Previous attempt is still valid: no point trying again yet */
+    /* Unregisterised return convention needs a dummy frame on the stack */
+	  IF_NOT_REG_R1(Sp_adj(-2);
+		        Sp(1) = stg_NO_FINALIZER_closure;
+		        Sp(0) = stg_ut_1_0_unreg_info;)
+    jump stg_block_noregs;
+  } else {
+    /* Previous attempt is no longer valid: try again */
+    "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
+    StgTSO_trec(CurrentTSO) = trec;
+    /* Revert to the non-waiting frame before re-running the code */
+    StgHeader_info(frame) = stg_atomically_frame_info;
+    R1 = StgAtomicallyFrame_code(frame);
+    jump stg_ap_v_fast;
+  }
+}
+
+// STM catch frame --------------------------------------------------------------
+
+#define CATCH_STM_FRAME_ENTRY_TEMPLATE(label,ret) \
+ label \
+ { \
+ IF_NOT_REG_R1(W_ rval; rval = Sp(0); Sp_adj(1); ) \
+ Sp = Sp + SIZEOF_StgCatchSTMFrame; \
+ IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;) \
+ jump ret; \
+ }
+
+#ifdef REG_R1
+#define SP_OFF 0
+#else
+#define SP_OFF 1
+#endif
+
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_0_ret,%RET_VEC(Sp(SP_OFF),0))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_1_ret,%RET_VEC(Sp(SP_OFF),1))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_2_ret,%RET_VEC(Sp(SP_OFF),2))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_3_ret,%RET_VEC(Sp(SP_OFF),3))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_4_ret,%RET_VEC(Sp(SP_OFF),4))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_5_ret,%RET_VEC(Sp(SP_OFF),5))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_6_ret,%RET_VEC(Sp(SP_OFF),6))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_7_ret,%RET_VEC(Sp(SP_OFF),7))
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_catch_stm_frame too.
+#endif
+
+#if defined(PROFILING)
+#define CATCH_STM_FRAME_BITMAP 3
+#define CATCH_STM_FRAME_WORDS 3
+#else
+#define CATCH_STM_FRAME_BITMAP 0
+#define CATCH_STM_FRAME_WORDS 1
+#endif
+
+/* Catch frames are very similar to update frames, but when entering
+ * one we just pop the frame off the stack and perform the correct
+ * kind of return to the activation record underneath us on the stack.
+ */
+
+/* Return frame pushed by catchSTMzh_fast.  On a normal return the
+ * frame is simply popped (the entry template above); the handler
+ * stored in the frame is only consulted by the exception-raising
+ * machinery elsewhere in the RTS. */
+INFO_TABLE_RET(stg_catch_stm_frame,
+               CATCH_STM_FRAME_WORDS, CATCH_STM_FRAME_BITMAP,
+               CATCH_STM_FRAME,
+	       stg_catch_stm_frame_0_ret,
+	       stg_catch_stm_frame_1_ret,
+	       stg_catch_stm_frame_2_ret,
+	       stg_catch_stm_frame_3_ret,
+	       stg_catch_stm_frame_4_ret,
+	       stg_catch_stm_frame_5_ret,
+	       stg_catch_stm_frame_6_ret,
+	       stg_catch_stm_frame_7_ret)
+CATCH_STM_FRAME_ENTRY_TEMPLATE(,%ENTRY_CODE(Sp(SP_OFF)))
+
+
+// Primop definition ------------------------------------------------------------
+
+/* atomically# primop.  Args: R1 = m :: STM a.
+ * Pushes an ATOMICALLY_FRAME, starts a top-level transaction and
+ * applies the STM action to the RealWorld token.  Nesting of
+ * atomically blocks is rejected with a NestedAtomically exception. */
+atomicallyzh_fast
+{
+  W_ frame;
+  W_ old_trec;
+  W_ new_trec;
+
+  // stmStartTransaction may allocate
+  MAYBE_GC (R1_PTR, atomicallyzh_fast);
+
+  /* Args: R1 = m :: STM a */
+  STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, atomicallyzh_fast);
+
+  old_trec = StgTSO_trec(CurrentTSO);
+
+  /* Nested transactions are not allowed; raise an exception */
+  if (old_trec != NO_TREC) {
+     R1 = GHCziIOBase_NestedAtomically_closure;
+     jump raisezh_fast;
+  }
+
+  /* Set up the atomically frame */
+  Sp = Sp - SIZEOF_StgAtomicallyFrame;
+  frame = Sp;
+
+  SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]);
+  StgAtomicallyFrame_code(frame) = R1;
+
+  /* Start the memory transaction */
+  "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", old_trec "ptr") [R1];
+  StgTSO_trec(CurrentTSO) = new_trec;
+
+  /* Apply R1 to the realworld token */
+  jump stg_ap_v_fast;
+}
+
+
+/* catchSTM# primop.  Args: R1 = action :: STM a,
+ * R2 = handler :: Exception -> STM a.  Pushes a CATCH_STM_FRAME
+ * recording the handler, then runs the action. */
+catchSTMzh_fast
+{
+  W_ frame;
+
+  /* Args: R1 :: STM a */
+  /* Args: R2 :: Exception -> STM a */
+  STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, catchSTMzh_fast);
+
+  /* Set up the catch frame */
+  Sp = Sp - SIZEOF_StgCatchSTMFrame;
+  frame = Sp;
+
+  SET_HDR(frame, stg_catch_stm_frame_info, W_[CCCS]);
+  StgCatchSTMFrame_handler(frame) = R2;
+
+  /* Apply R1 to the realworld token */
+  jump stg_ap_v_fast;
+}
+
+
+/* catchRetry# primop (`orElse`).  Args: R1 = first :: STM a,
+ * R2 = alternative :: STM a.  Starts a nested transaction for the
+ * first branch and pushes a CATCH_RETRY_FRAME recording both
+ * branches; retryzh_fast uses the frame to switch to the
+ * alternative if the first branch retries. */
+catchRetryzh_fast
+{
+  W_ frame;
+  W_ new_trec;
+  W_ trec;
+
+  // stmStartTransaction may allocate
+  MAYBE_GC (R1_PTR & R2_PTR, catchRetryzh_fast);
+
+  /* Args: R1 :: STM a */
+  /* Args: R2 :: STM a */
+  STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, catchRetryzh_fast);
+
+  /* Start a nested transaction within which to run the first code */
+  trec = StgTSO_trec(CurrentTSO);
+  "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [R1,R2];
+  StgTSO_trec(CurrentTSO) = new_trec;
+
+  /* Set up the catch-retry frame */
+  Sp = Sp - SIZEOF_StgCatchRetryFrame;
+  frame = Sp;
+
+  SET_HDR(frame, stg_catch_retry_frame_info, W_[CCCS]);
+  StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
+  StgCatchRetryFrame_first_code(frame) = R1;
+  StgCatchRetryFrame_alt_code(frame) = R2;
+  StgCatchRetryFrame_first_code_trec(frame) = new_trec;
+
+  /* Apply R1 to the realworld token */
+  jump stg_ap_v_fast;
+}
+
+
+/* retry# primop.  Walks up the stack (findRetryFrameHelper) to the
+ * nearest CATCH_RETRY_FRAME or ATOMICALLY_FRAME.  At a catch-retry
+ * frame it either switches to the alternative branch or propagates
+ * the retry outwards; at the atomically frame it tries to block the
+ * TSO on the transaction's TVars via stmWait. */
+retryzh_fast
+{
+  W_ frame_type;
+  W_ frame;
+  W_ trec;
+  W_ outer;
+  W_ r;
+
+  MAYBE_GC (NO_PTRS, retryzh_fast); // STM operations may allocate
+
+  // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME
+retry_pop_stack:
+  trec = StgTSO_trec(CurrentTSO);
+  "ptr" outer = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+  /* findRetryFrameHelper works on the saved stack pointer */
+  StgTSO_sp(CurrentTSO) = Sp;
+  frame_type = foreign "C" findRetryFrameHelper(CurrentTSO "ptr") [];
+  Sp = StgTSO_sp(CurrentTSO);
+  frame = Sp;
+
+  if (frame_type == CATCH_RETRY_FRAME) {
+    // The retry reaches a CATCH_RETRY_FRAME before the atomic frame
+    ASSERT(outer != NO_TREC);
+    if (!StgCatchRetryFrame_running_alt_code(frame)) {
+      // Retry in the first code: try the alternative
+      "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+      StgTSO_trec(CurrentTSO) = trec;
+      StgCatchRetryFrame_running_alt_code(frame) = 1 :: CInt; // true;
+      R1 = StgCatchRetryFrame_alt_code(frame);
+      jump stg_ap_v_fast;
+    } else {
+      // Retry in the alternative code: propagate
+      W_ other_trec;
+      other_trec = StgCatchRetryFrame_first_code_trec(frame);
+      /* Merge both branches' TVar accesses back into the enclosing
+       * transaction so the wait set covers everything we read. */
+      r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", other_trec "ptr") [];
+      if (r) {
+        r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
+      } else {
+        foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
+      }
+      if (r) {
+        // Merge between siblings succeeded: commit it back to enclosing transaction
+        // and then propagate the retry
+        StgTSO_trec(CurrentTSO) = outer;
+        Sp = Sp + SIZEOF_StgCatchRetryFrame;
+        goto retry_pop_stack;
+      } else {
+        // Merge failed: we mustn't propagate the retry.  Try both paths again.
+        "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+        StgCatchRetryFrame_first_code_trec(frame) = trec;
+        StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
+        StgTSO_trec(CurrentTSO) = trec;
+        R1 = StgCatchRetryFrame_first_code(frame);
+        jump stg_ap_v_fast;
+      }
+    }
+  }
+
+  // We've reached the ATOMICALLY_FRAME: attempt to wait
+  ASSERT(frame_type == ATOMICALLY_FRAME);
+  ASSERT(outer == NO_TREC);
+  r = foreign "C" stmWait(MyCapability() "ptr", CurrentTSO "ptr", trec "ptr") [];
+  if (r) {
+    // Transaction was valid: stmWait put us on the TVars' queues, we now block
+    StgHeader_info(frame) = stg_atomically_waiting_frame_info;
+    Sp = frame;
+    // Fix up the stack in the unregisterised case: the return convention is different.
+    IF_NOT_REG_R1(Sp_adj(-2);
+	          Sp(1) = stg_NO_FINALIZER_closure;
+	          Sp(0) = stg_ut_1_0_unreg_info;)
+    R3 = trec; // passing to stmWaitUnblock()
+    jump stg_block_stmwait;
+  } else {
+    // Transaction was not valid: retry immediately
+    "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+    StgTSO_trec(CurrentTSO) = trec;
+    R1 = StgAtomicallyFrame_code(frame);
+    Sp = frame;
+    jump stg_ap_v_fast;
+  }
+}
+
+
+/* newTVar# primop.  Args: R1 = initial value.
+ * Allocation is done on the C side by stmNewTVar; returns the new
+ * TVar closure. */
+newTVarzh_fast
+{
+  W_ tv;
+  W_ new_value;
+
+  /* Args: R1 = initialisation value */
+
+  MAYBE_GC (R1_PTR, newTVarzh_fast);
+  new_value = R1;
+  "ptr" tv = foreign "C" stmNewTVar(MyCapability() "ptr", new_value "ptr") [];
+  RET_P(tv);
+}
+
+
+/* readTVar# primop.  Args: R1 = TVar closure.
+ * Records the read in the current transaction record and returns the
+ * value seen by this transaction. */
+readTVarzh_fast
+{
+  W_ trec;
+  W_ tvar;
+  W_ result;
+
+  /* Args: R1 = TVar closure */
+
+  MAYBE_GC (R1_PTR, readTVarzh_fast); // Call to stmReadTVar may allocate
+  trec = StgTSO_trec(CurrentTSO);
+  tvar = R1;
+  "ptr" result = foreign "C" stmReadTVar(MyCapability() "ptr", trec "ptr", tvar "ptr") [];
+
+  RET_P(result);
+}
+
+
+/* writeTVar# primop.  Args: R1 = TVar closure, R2 = new value.
+ * Buffers the write in the current transaction record; the TVar
+ * itself is only updated at commit time. */
+writeTVarzh_fast
+{
+  W_ trec;
+  W_ tvar;
+  W_ new_value;
+
+  /* Args: R1 = TVar closure */
+  /*       R2 = New value    */
+
+  MAYBE_GC (R1_PTR & R2_PTR, writeTVarzh_fast); // Call to stmWriteTVar may allocate
+  trec = StgTSO_trec(CurrentTSO);
+  tvar = R1;
+  new_value = R2;
+  foreign "C" stmWriteTVar(MyCapability() "ptr", trec "ptr", tvar "ptr", new_value "ptr") [];
+
+  jump %ENTRY_CODE(Sp(0));
+}
+
+
+/* -----------------------------------------------------------------------------
+ * MVar primitives
+ *
+ * take & putMVar work as follows. Firstly, an important invariant:
+ *
+ * If the MVar is full, then the blocking queue contains only
+ * threads blocked on putMVar, and if the MVar is empty then the
+ * blocking queue contains only threads blocked on takeMVar.
+ *
+ * takeMVar:
+ * MVar empty : then add ourselves to the blocking queue
+ * MVar full : remove the value from the MVar, and
+ * blocking queue empty : return
+ * blocking queue non-empty : perform the first blocked putMVar
+ * from the queue, and wake up the
+ * thread (MVar is now full again)
+ *
+ * putMVar is just the dual of the above algorithm.
+ *
+ * How do we "perform a putMVar"? Well, we have to fiddle around with
+ * the stack of the thread waiting to do the putMVar. See
+ * stg_block_putmvar and stg_block_takemvar in HeapStackCheck.c for
+ * the stack layout, and the PerformPut and PerformTake macros below.
+ *
+ * It is important that a blocked take or put is woken up with the
+ * take/put already performed, because otherwise there would be a
+ * small window of vulnerability where the thread could receive an
+ * exception and never perform its take or put, and we'd end up with a
+ * deadlock.
+ *
+ * -------------------------------------------------------------------------- */
+
+/* isEmptyMVar# primop.  Args: R1 = MVar closure.
+ * Returns 1 if the MVar is empty, 0 otherwise, by inspecting the
+ * closure's info pointer.  No locking: the answer is a snapshot. */
+isEmptyMVarzh_fast
+{
+    /* args: R1 = MVar closure */
+
+    if (GET_INFO(R1) == stg_EMPTY_MVAR_info) {
+	RET_N(1);
+    } else {
+	RET_N(0);
+    }
+}
+
+/* newMVar# primop.  No arguments.  Allocates an empty MVar with an
+ * empty blocked-TSO queue and returns it. */
+newMVarzh_fast
+{
+    /* args: none */
+    W_ mvar;
+
+    ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, newMVarzh_fast );
+
+    mvar = Hp - SIZEOF_StgMVar + WDS(1);
+    SET_HDR(mvar,stg_EMPTY_MVAR_info,W_[CCCS]);
+    StgMVar_head(mvar)  = stg_END_TSO_QUEUE_closure;
+    StgMVar_tail(mvar)  = stg_END_TSO_QUEUE_closure;
+    StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+    RET_P(mvar);
+}
+
+
+/* If R1 isn't available, pass it on the stack */
+#ifdef REG_R1
+#define PerformTake(tso, value) \
+ W_[StgTSO_sp(tso) + WDS(1)] = value; \
+ W_[StgTSO_sp(tso) + WDS(0)] = stg_gc_unpt_r1_info;
+#else
+#define PerformTake(tso, value) \
+ W_[StgTSO_sp(tso) + WDS(1)] = value; \
+ W_[StgTSO_sp(tso) + WDS(0)] = stg_ut_1_0_unreg_info;
+#endif
+
+#define PerformPut(tso,lval) \
+ StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3); \
+ lval = W_[StgTSO_sp(tso) - WDS(1)];
+
+/* takeMVar# primop.  Args: R1 = MVar closure.
+ * Empty MVar: enqueue the current TSO on the MVar's blocking queue
+ * and block.  Full MVar: take the value; if threads are blocked on
+ * putMVar, perform the first queued put on the waiter's stack
+ * (PerformPut) and wake it, so the MVar becomes full again.
+ * In the THREADED_RTS the MVar closure is locked for the duration. */
+takeMVarzh_fast
+{
+    W_ mvar, val, info, tso;
+
+    /* args: R1 = MVar closure */
+    mvar = R1;
+
+#if defined(THREADED_RTS)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr") [];
+#else
+    info = GET_INFO(mvar);
+#endif
+
+    /* If the MVar is empty, put ourselves on its blocking queue,
+     * and wait until we're woken up.
+     */
+    if (info == stg_EMPTY_MVAR_info) {
+	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+	    StgMVar_head(mvar) = CurrentTSO;
+	} else {
+	    StgTSO_link(StgMVar_tail(mvar)) = CurrentTSO;
+	}
+	StgTSO_link(CurrentTSO) = stg_END_TSO_QUEUE_closure;
+	StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
+	StgTSO_block_info(CurrentTSO) = mvar;
+	StgMVar_tail(mvar) = CurrentTSO;
+
+	jump stg_block_takemvar;
+  }
+
+  /* we got the value... */
+  val = StgMVar_value(mvar);
+
+    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure)
+    {
+      /* There are putMVar(s) waiting...
+       * wake up the first thread on the queue
+       */
+      ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
+
+      /* actually perform the putMVar for the thread that we just woke up */
+      tso = StgMVar_head(mvar);
+      PerformPut(tso,StgMVar_value(mvar));
+      foreign "C" dirtyTSO(tso "ptr") [];
+
+#if defined(GRAN) || defined(PAR)
+      /* ToDo: check 2nd arg (mvar) is right */
+      "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar),mvar) [];
+      StgMVar_head(mvar) = tso;
+#else
+      "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr",
+                                         StgMVar_head(mvar) "ptr") [];
+      StgMVar_head(mvar) = tso;
+#endif
+
+      if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+          StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+      }
+
+#if defined(THREADED_RTS)
+      foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#endif
+      RET_P(val);
+  }
+  else
+  {
+      /* No further putMVars, MVar is now empty */
+      StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+
+#if defined(THREADED_RTS)
+      foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#else
+      SET_INFO(mvar,stg_EMPTY_MVAR_info);
+#endif
+
+      RET_P(val);
+  }
+}
+
+
+/* tryTakeMVar# primop.  Args: R1 = MVar closure.
+ * Non-blocking takeMVar: returns (0, NO_FINALIZER_closure) if the
+ * MVar is empty, or (1, value) after taking it; any blocked putMVar
+ * is performed and its thread woken, as in takeMVarzh_fast. */
+tryTakeMVarzh_fast
+{
+    W_ mvar, val, info, tso;
+
+    /* args: R1 = MVar closure */
+
+    mvar = R1;
+
+#if defined(THREADED_RTS)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr") [];
+#else
+    info = GET_INFO(mvar);
+#endif
+
+    if (info == stg_EMPTY_MVAR_info) {
+#if defined(THREADED_RTS)
+        foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#endif
+	/* HACK: we need a pointer to pass back,
+	 * so we abuse NO_FINALIZER_closure
+	 */
+	RET_NP(0, stg_NO_FINALIZER_closure);
+    }
+
+    /* we got the value... */
+    val = StgMVar_value(mvar);
+
+    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
+	/* There are putMVar(s) waiting...
+	 * wake up the first thread on the queue
+	 */
+	ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
+
+	/* actually perform the putMVar for the thread that we just woke up */
+	tso = StgMVar_head(mvar);
+	PerformPut(tso,StgMVar_value(mvar));
+        foreign "C" dirtyTSO(tso "ptr") [];
+
+#if defined(GRAN) || defined(PAR)
+	/* ToDo: check 2nd arg (mvar) is right */
+	"ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr", mvar "ptr") [];
+	StgMVar_head(mvar) = tso;
+#else
+	"ptr" tso = foreign "C" unblockOne(MyCapability() "ptr",
+                                           StgMVar_head(mvar) "ptr") [];
+	StgMVar_head(mvar) = tso;
+#endif
+
+	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+	    StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+	}
+#if defined(THREADED_RTS)
+        foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#endif
+    }
+    else
+    {
+	/* No further putMVars, MVar is now empty */
+	StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+#if defined(THREADED_RTS)
+	foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#else
+	SET_INFO(mvar,stg_EMPTY_MVAR_info);
+#endif
+    }
+
+    RET_NP(1, val);
+}
+
+
+/* putMVar# primop.  Args: R1 = MVar closure, R2 = value.
+ * Full MVar: enqueue the current TSO and block.  Empty MVar: if
+ * threads are blocked on takeMVar, hand the value straight to the
+ * first waiter (PerformTake) and wake it; otherwise store the value
+ * and mark the MVar full.  Locked in the THREADED_RTS. */
+putMVarzh_fast
+{
+    W_ mvar, info, tso;
+
+    /* args: R1 = MVar, R2 = value */
+    mvar = R1;
+
+#if defined(THREADED_RTS)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr") [R2];
+#else
+    info = GET_INFO(mvar);
+#endif
+
+    if (info == stg_FULL_MVAR_info) {
+	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+	    StgMVar_head(mvar) = CurrentTSO;
+	} else {
+	    StgTSO_link(StgMVar_tail(mvar)) = CurrentTSO;
+	}
+	StgTSO_link(CurrentTSO) = stg_END_TSO_QUEUE_closure;
+	StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
+	StgTSO_block_info(CurrentTSO) = mvar;
+	StgMVar_tail(mvar) = CurrentTSO;
+
+	jump stg_block_putmvar;
+    }
+
+    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
+	/* There are takeMVar(s) waiting: wake up the first one
+	 */
+	ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
+
+	/* actually perform the takeMVar */
+	tso = StgMVar_head(mvar);
+	PerformTake(tso, R2);
+        foreign "C" dirtyTSO(tso "ptr") [];
+
+#if defined(GRAN) || defined(PAR)
+	/* ToDo: check 2nd arg (mvar) is right */
+	"ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
+	StgMVar_head(mvar) = tso;
+#else
+	"ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
+	StgMVar_head(mvar) = tso;
+#endif
+
+	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+	    StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+	}
+
+#if defined(THREADED_RTS)
+	foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#endif
+	jump %ENTRY_CODE(Sp(0));
+    }
+    else
+    {
+	/* No further takes, the MVar is now full. */
+	StgMVar_value(mvar) = R2;
+
+#if defined(THREADED_RTS)
+	foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#else
+	SET_INFO(mvar,stg_FULL_MVAR_info);
+#endif
+	jump %ENTRY_CODE(Sp(0));
+    }
+
+    /* ToDo: yield afterward for better communication performance? */
+}
+
+
+/* tryPutMVar# primop.  Args: R1 = MVar closure, R2 = value.
+ * Non-blocking putMVar: returns 0 if the MVar is already full,
+ * otherwise behaves like putMVarzh_fast and returns 1. */
+tryPutMVarzh_fast
+{
+    W_ mvar, info, tso;
+
+    /* args: R1 = MVar, R2 = value */
+    mvar = R1;
+
+#if defined(THREADED_RTS)
+    "ptr" info = foreign "C" lockClosure(mvar "ptr") [R2];
+#else
+    info = GET_INFO(mvar);
+#endif
+
+    if (info == stg_FULL_MVAR_info) {
+#if defined(THREADED_RTS)
+	foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#endif
+	RET_N(0);
+    }
+
+    if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
+	/* There are takeMVar(s) waiting: wake up the first one
+	 */
+	ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
+
+	/* actually perform the takeMVar */
+	tso = StgMVar_head(mvar);
+	PerformTake(tso, R2);
+        foreign "C" dirtyTSO(tso "ptr") [];
+
+#if defined(GRAN) || defined(PAR)
+	/* ToDo: check 2nd arg (mvar) is right */
+	"ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
+	StgMVar_head(mvar) = tso;
+#else
+	"ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
+	StgMVar_head(mvar) = tso;
+#endif
+
+	if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+	    StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+	}
+
+#if defined(THREADED_RTS)
+	foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#endif
+    }
+    else
+    {
+	/* No further takes, the MVar is now full. */
+	StgMVar_value(mvar) = R2;
+
+#if defined(THREADED_RTS)
+	foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#else
+	SET_INFO(mvar,stg_FULL_MVAR_info);
+#endif
+    }
+
+    RET_N(1);
+    /* ToDo: yield afterward for better communication performance? */
+}
+
+
+/* -----------------------------------------------------------------------------
+ Stable pointer primitives
+ ------------------------------------------------------------------------- */
+
+/* makeStableName# primop.  Args: R1 = heap object.
+ * Looks up (or creates) the object's slot in the stable-pointer
+ * table; reuses an existing StableName closure if one was already
+ * made for this slot, otherwise allocates a fresh one. */
+makeStableNamezh_fast
+{
+    W_ index, sn_obj;
+
+    ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, makeStableNamezh_fast );
+
+    index = foreign "C" lookupStableName(R1 "ptr") [];
+
+    /* Is there already a StableName for this heap object?
+     *  stable_ptr_table is a pointer to an array of snEntry structs.
+     */
+    if ( snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) == NULL ) {
+	sn_obj = Hp - SIZEOF_StgStableName + WDS(1);
+	SET_HDR(sn_obj, stg_STABLE_NAME_info, W_[CCCS]);
+	StgStableName_sn(sn_obj) = index;
+	snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) = sn_obj;
+    } else {
+	sn_obj = snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry);
+    }
+
+    RET_P(sn_obj);
+}
+
+
+/* makeStablePtr# primop.  Args: R1 = a.
+ * Registers the object in the stable-pointer table and returns the
+ * stable pointer (an index, hence RET_N). */
+makeStablePtrzh_fast
+{
+    /* Args: R1 = a */
+    W_ sp;
+    MAYBE_GC(R1_PTR, makeStablePtrzh_fast);
+    "ptr" sp = foreign "C" getStablePtr(R1 "ptr") [];
+    RET_N(sp);
+}
+
+/* deRefStablePtr# primop.  Args: R1 = stable pointer (table index).
+ * Returns the heap object stored in the corresponding snEntry. */
+deRefStablePtrzh_fast
+{
+    /* Args: R1 = the stable ptr */
+    W_ r, sp;
+    sp = R1;
+    r = snEntry_addr(W_[stable_ptr_table] + sp*SIZEOF_snEntry);
+    RET_P(r);
+}
+
+/* -----------------------------------------------------------------------------
+ Bytecode object primitives
+ ------------------------------------------------------------------------- */
+
+/* newBCO# primop: allocate a byte-code object.
+ * Args: R1 = instrs, R2 = literals, R3 = ptrs, R4 = itbls,
+ *       R5 = arity, R6 = bitmap array.
+ * The BCO is sized to hold a copy of the bitmap array inline. */
+newBCOzh_fast
+{
+    /* R1 = instrs
+       R2 = literals
+       R3 = ptrs
+       R4 = itbls
+       R5 = arity
+       R6 = bitmap array
+    */
+    W_ bco, bitmap_arr, bytes, words;
+
+    bitmap_arr = R6;
+    words = BYTES_TO_WDS(SIZEOF_StgBCO) + StgArrWords_words(bitmap_arr);
+    bytes = WDS(words);
+
+    ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R4_PTR&R6_PTR, newBCOzh_fast );
+
+    bco = Hp - bytes + WDS(1);
+    SET_HDR(bco, stg_BCO_info, W_[CCCS]);
+
+    StgBCO_instrs(bco)   = R1;
+    StgBCO_literals(bco) = R2;
+    StgBCO_ptrs(bco)     = R3;
+    StgBCO_itbls(bco)    = R4;
+    StgBCO_arity(bco)    = HALF_W_(R5);
+    StgBCO_size(bco)     = HALF_W_(words);
+
+    // Copy the arity/bitmap info into the BCO
+    W_ i;
+    i = 0;
+for:
+    if (i < StgArrWords_words(bitmap_arr)) {
+	StgBCO_bitmap(bco,i) = StgArrWords_payload(bitmap_arr,i);
+	i = i + 1;
+	goto for;
+    }
+
+    RET_P(bco);
+}
+
+
+/* mkApUpd0# primop.  Args: R1 = a zero-arity BCO.
+ * Wraps the BCO in an updatable AP thunk (see the inline comment for
+ * why this is only valid for zero-arity BCOs). */
+mkApUpd0zh_fast
+{
+    // R1 = the BCO# for the AP
+    //
+    W_ ap;
+
+    // This function is *only* used to wrap zero-arity BCOs in an
+    // updatable wrapper (see ByteCodeLink.lhs).  An AP thunk is always
+    // saturated and always points directly to a FUN or BCO.
+    ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == HALF_W_(BCO) &&
+	   StgBCO_arity(R1) == HALF_W_(0));
+
+    HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, mkApUpd0zh_fast);
+    TICK_ALLOC_UP_THK(0, 0);
+    CCCS_ALLOC(SIZEOF_StgAP);
+
+    ap = Hp - SIZEOF_StgAP + WDS(1);
+    SET_HDR(ap, stg_AP_info, W_[CCCS]);
+
+    StgAP_n_args(ap) = HALF_W_(0);
+    StgAP_fun(ap) = R1;
+
+    RET_P(ap);
+}
+
+/* -----------------------------------------------------------------------------
+ Thread I/O blocking primitives
+ -------------------------------------------------------------------------- */
+
+/* Add a thread to the end of the blocked queue. (C-- version of the C
+ * macro in Schedule.h).
+ */
+#define APPEND_TO_BLOCKED_QUEUE(tso) \
+ ASSERT(StgTSO_link(tso) == END_TSO_QUEUE); \
+ if (W_[blocked_queue_hd] == END_TSO_QUEUE) { \
+ W_[blocked_queue_hd] = tso; \
+ } else { \
+ StgTSO_link(W_[blocked_queue_tl]) = tso; \
+ } \
+ W_[blocked_queue_tl] = tso;
+
+/* waitRead# primop.  Args: R1 = file descriptor.
+ * Non-threaded RTS only: block the current TSO on fd readability by
+ * putting it on the blocked_queue.  In the threaded RTS this
+ * interface is never used and deliberately barfs. */
+waitReadzh_fast
+{
+    /* args: R1 */
+#ifdef THREADED_RTS
+    foreign "C" barf("waitRead# on threaded RTS");
+#else
+
+    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+    StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
+    StgTSO_block_info(CurrentTSO) = R1;
+    // No locking - we're not going to use this interface in the
+    // threaded RTS anyway.
+    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    jump stg_block_noregs;
+#endif
+}
+
+/* waitWrite# primop.  Args: R1 = file descriptor.
+ * Dual of waitReadzh_fast: block until the fd is writable.
+ * Non-threaded RTS only. */
+waitWritezh_fast
+{
+    /* args: R1 */
+#ifdef THREADED_RTS
+    foreign "C" barf("waitWrite# on threaded RTS");
+#else
+
+    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+    StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
+    StgTSO_block_info(CurrentTSO) = R1;
+    // No locking - we're not going to use this interface in the
+    // threaded RTS anyway.
+    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    jump stg_block_noregs;
+#endif
+}
+
+
+STRING(stg_delayzh_malloc_str, "delayzh_fast")
+/* delay# primop.  Args: R1 = delay in microseconds.
+ * Non-threaded RTS only.  On mingw32 the delay is issued as an async
+ * I/O request; elsewhere the TSO is inserted, ordered by wake-up
+ * time, into the sleeping_queue. */
+delayzh_fast
+{
+#ifdef mingw32_HOST_OS
+    W_ ares;
+    CInt reqID;
+#else
+    W_ t, prev, target;
+#endif
+
+#ifdef THREADED_RTS
+    foreign "C" barf("delay# on threaded RTS");
+#else
+
+    /* args: R1 (microsecond delay amount) */
+    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+    StgTSO_why_blocked(CurrentTSO) = BlockedOnDelay::I16;
+
+#ifdef mingw32_HOST_OS
+
+    /* could probably allocate this on the heap instead */
+    "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+					    stg_delayzh_malloc_str);
+    reqID = foreign "C" addDelayRequest(R1);
+    StgAsyncIOResult_reqID(ares)   = reqID;
+    StgAsyncIOResult_len(ares)     = 0;
+    StgAsyncIOResult_errCode(ares) = 0;
+    StgTSO_block_info(CurrentTSO)  = ares;
+
+    /* Having all async-blocked threads reside on the blocked_queue
+     * simplifies matters, so change the status to OnDoProc and put the
+     * delayed thread on the blocked_queue.
+     */
+    StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
+    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    jump stg_block_async_void;
+
+#else
+
+    W_ time;
+    time = foreign "C" getourtimeofday();
+    /* wake-up target in RTS ticks */
+    target = (R1 / (TICK_MILLISECS*1000)) + time;
+    StgTSO_block_info(CurrentTSO) = target;
+
+    /* Insert the new thread in the sleeping queue. */
+    prev = NULL;
+    t = W_[sleeping_queue];
+while:
+    if (t != END_TSO_QUEUE && StgTSO_block_info(t) < target) {
+	prev = t;
+	t = StgTSO_link(t);
+	goto while;
+    }
+
+    StgTSO_link(CurrentTSO) = t;
+    if (prev == NULL) {
+	W_[sleeping_queue] = CurrentTSO;
+    } else {
+	StgTSO_link(prev) = CurrentTSO;
+    }
+    jump stg_block_noregs;
+#endif
+#endif /* !THREADED_RTS */
+}
+
+
+#ifdef mingw32_HOST_OS
+STRING(stg_asyncReadzh_malloc_str, "asyncReadzh_fast")
+/* asyncRead# primop (mingw32 only).  Args: R1 = fd, R2 = isSock,
+ * R3 = len, R4 = buf.  Issues an async read request to the Windows
+ * I/O manager and blocks the TSO on the blocked_queue until the
+ * request completes.  Non-threaded RTS only. */
+asyncReadzh_fast
+{
+    W_ ares;
+    CInt reqID;
+
+#ifdef THREADED_RTS
+    foreign "C" barf("asyncRead# on threaded RTS");
+#else
+
+    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
+    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+    StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
+
+    /* could probably allocate this on the heap instead */
+    "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+					    stg_asyncReadzh_malloc_str)
+			[R1,R2,R3,R4];
+    reqID = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
+    StgAsyncIOResult_reqID(ares)   = reqID;
+    StgAsyncIOResult_len(ares)     = 0;
+    StgAsyncIOResult_errCode(ares) = 0;
+    StgTSO_block_info(CurrentTSO)  = ares;
+    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    jump stg_block_async;
+#endif
+}
+
+STRING(stg_asyncWritezh_malloc_str, "asyncWritezh_fast")
+/* asyncWrite# primop (mingw32 only).  Args: R1 = fd, R2 = isSock,
+ * R3 = len, R4 = buf.  Write-side dual of asyncReadzh_fast.
+ * Non-threaded RTS only. */
+asyncWritezh_fast
+{
+    W_ ares;
+    CInt reqID;
+
+#ifdef THREADED_RTS
+    foreign "C" barf("asyncWrite# on threaded RTS");
+#else
+
+    /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
+    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+    StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
+
+    "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+					    stg_asyncWritezh_malloc_str)
+			[R1,R2,R3,R4];
+    reqID = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];
+
+    StgAsyncIOResult_reqID(ares)   = reqID;
+    StgAsyncIOResult_len(ares)     = 0;
+    StgAsyncIOResult_errCode(ares) = 0;
+    StgTSO_block_info(CurrentTSO)  = ares;
+    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    jump stg_block_async;
+#endif
+}
+
+STRING(stg_asyncDoProczh_malloc_str, "asyncDoProczh_fast")
+/* asyncDoProc# primop (mingw32 only).  Args: R1 = proc, R2 = param.
+ * Runs an arbitrary procedure on a worker thread via the Windows I/O
+ * manager and blocks the TSO until it finishes.
+ * Non-threaded RTS only. */
+asyncDoProczh_fast
+{
+    W_ ares;
+    CInt reqID;
+
+#ifdef THREADED_RTS
+    foreign "C" barf("asyncDoProc# on threaded RTS");
+#else
+
+    /* args: R1 = proc, R2 = param */
+    ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+    StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
+
+    /* could probably allocate this on the heap instead */
+    "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+					    stg_asyncDoProczh_malloc_str)
+				[R1,R2];
+    reqID = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
+    StgAsyncIOResult_reqID(ares)   = reqID;
+    StgAsyncIOResult_len(ares)     = 0;
+    StgAsyncIOResult_errCode(ares) = 0;
+    StgTSO_block_info(CurrentTSO) = ares;
+    APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+    jump stg_block_async;
+#endif
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ ** temporary **
+
+ classes CCallable and CReturnable don't really exist, but the
+ compiler insists on generating dictionaries containing references
+ to GHC_ZcCCallable_static_info etc., so we provide dummy symbols
+ for these. Some C compilers can't cope with zero-length static arrays,
+ so we have to make these one element long.
+ --------------------------------------------------------------------------- */
+
+section "rodata" {
+ GHC_ZCCCallable_static_info: W_ 0;
+}
+
+section "rodata" {
+ GHC_ZCCReturnable_static_info: W_ 0;
+}
diff --git a/rts/Printer.c b/rts/Printer.c
new file mode 100644
index 0000000000..8290d220a0
--- /dev/null
+++ b/rts/Printer.c
@@ -0,0 +1,1127 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1994-2000.
+ *
+ * Heap printer
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "Printer.h"
+#include "RtsUtils.h"
+
+#ifdef DEBUG
+
+#include "RtsFlags.h"
+#include "MBlock.h"
+#include "Storage.h"
+#include "Bytecodes.h" /* for InstrPtr */
+#include "Disassembler.h"
+#include "Apply.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#if defined(GRAN) || defined(PAR)
+// HWL: explicit fixed header size to make debugging easier
+int fixed_hs = sizeof(StgHeader), itbl_sz = sizeofW(StgInfoTable),
+ uf_sz=sizeofW(StgUpdateFrame);
+#endif
+
+/* --------------------------------------------------------------------------
+ * local function decls
+ * ------------------------------------------------------------------------*/
+
+static void printStdObjPayload( StgClosure *obj );
+#ifdef USING_LIBBFD
+static void reset_table ( int size );
+static void prepare_table ( void );
+static void insert ( unsigned value, const char *name );
+#endif
+#if 0 /* unused but might be useful sometime */
+static rtsBool lookup_name ( char *name, unsigned *result );
+static void enZcode ( char *in, char *out );
+#endif
+static char unZcode ( char ch );
+const char * lookupGHCName ( void *addr );
+static void printZcoded ( const char *raw );
+
+/* --------------------------------------------------------------------------
+ * Printer
+ * ------------------------------------------------------------------------*/
+
+void printPtr( StgPtr p )
+{
+ const char *raw;
+ raw = lookupGHCName(p);
+ if (raw != NULL) {
+ printZcoded(raw);
+ } else {
+ debugBelch("%p", p);
+ }
+}
+
+void printObj( StgClosure *obj )
+{
+ debugBelch("Object "); printPtr((StgPtr)obj); debugBelch(" = ");
+ printClosure(obj);
+}
+
+STATIC_INLINE void
+printStdObjHdr( StgClosure *obj, char* tag )
+{
+ debugBelch("%s(",tag);
+ printPtr((StgPtr)obj->header.info);
+#ifdef PROFILING
+ debugBelch(", %s", obj->header.prof.ccs->cc->label);
+#endif
+}
+
+static void
+printStdObjPayload( StgClosure *obj )   /* prints ", ptr, ..., word#, ..." then ")\n" */
+{
+    StgWord i, j;
+    const StgInfoTable* info;
+
+    info = get_itbl(obj);
+    for (i = 0; i < info->layout.payload.ptrs; ++i) {
+        debugBelch(", ");
+        printPtr((StgPtr)obj->payload[i]);
+    }
+    for (j = 0; j < info->layout.payload.nptrs; ++j) {
+        debugBelch(", %p#",obj->payload[i+j]);   /* was "%pd#": stray 'd' after %p; CONSTR case in printClosure uses "%p#" */
+    }
+    debugBelch(")\n");
+}
+
+static void
+printThunkPayload( StgThunk *obj )   /* thunk twin of printStdObjPayload (StgThunk payload) */
+{
+    StgWord i, j;
+    const StgInfoTable* info;
+
+    info = get_itbl(obj);
+    for (i = 0; i < info->layout.payload.ptrs; ++i) {
+        debugBelch(", ");
+        printPtr((StgPtr)obj->payload[i]);
+    }
+    for (j = 0; j < info->layout.payload.nptrs; ++j) {
+        debugBelch(", %p#",obj->payload[i+j]);   /* was "%pd#": stray 'd' after %p; keep in sync with printStdObjPayload */
+    }
+    debugBelch(")\n");
+}
+
+static void
+printThunkObject( StgThunk *obj, char* tag )
+{
+ printStdObjHdr( (StgClosure *)obj, tag );
+ printThunkPayload( obj );
+}
+
+void
+printClosure( StgClosure *obj )
+{
+ StgInfoTable *info;
+
+ info = get_itbl(obj);
+
+ switch ( info->type ) {
+ case INVALID_OBJECT:
+ barf("Invalid object");
+
+ case CONSTR:
+ case CONSTR_1_0: case CONSTR_0_1:
+ case CONSTR_1_1: case CONSTR_0_2: case CONSTR_2_0:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:
+ {
+ StgWord i, j;
+#ifdef PROFILING
+ debugBelch("%s(", info->prof.closure_desc);
+ debugBelch("%s", obj->header.prof.ccs->cc->label);
+#else
+ debugBelch("CONSTR(");
+ printPtr((StgPtr)obj->header.info);
+ debugBelch("(tag=%d)",info->srt_bitmap);
+#endif
+ for (i = 0; i < info->layout.payload.ptrs; ++i) {
+ debugBelch(", ");
+ printPtr((StgPtr)obj->payload[i]);
+ }
+ for (j = 0; j < info->layout.payload.nptrs; ++j) {
+ debugBelch(", %p#", obj->payload[i+j]);
+ }
+ debugBelch(")\n");
+ break;
+ }
+
+ case FUN:
+ case FUN_1_0: case FUN_0_1:
+ case FUN_1_1: case FUN_0_2: case FUN_2_0:
+ case FUN_STATIC:
+ debugBelch("FUN/%d(",itbl_to_fun_itbl(info)->f.arity);
+ printPtr((StgPtr)obj->header.info);
+#ifdef PROFILING
+ debugBelch(", %s", obj->header.prof.ccs->cc->label);
+#endif
+ printStdObjPayload(obj);
+ break;
+
+ case THUNK:
+ case THUNK_1_0: case THUNK_0_1:
+ case THUNK_1_1: case THUNK_0_2: case THUNK_2_0:
+ case THUNK_STATIC:
+ /* ToDo: will this work for THUNK_STATIC too? */
+#ifdef PROFILING
+ printThunkObject((StgThunk *)obj,info->prof.closure_desc);
+#else
+ printThunkObject((StgThunk *)obj,"THUNK");
+#endif
+ break;
+
+ case THUNK_SELECTOR:
+ printStdObjHdr(obj, "THUNK_SELECTOR");
+ debugBelch(", %p)\n", ((StgSelector *)obj)->selectee);
+ break;
+
+ case BCO:
+ disassemble( (StgBCO*)obj );
+ break;
+
+ case AP:
+ {
+ StgAP* ap = stgCast(StgAP*,obj);
+ StgWord i;
+ debugBelch("AP("); printPtr((StgPtr)ap->fun);
+ for (i = 0; i < ap->n_args; ++i) {
+ debugBelch(", ");
+ printPtr((P_)ap->payload[i]);
+ }
+ debugBelch(")\n");
+ break;
+ }
+
+ case PAP:
+ {
+ StgPAP* pap = stgCast(StgPAP*,obj);
+ StgWord i;
+ debugBelch("PAP/%d(",pap->arity);
+ printPtr((StgPtr)pap->fun);
+ for (i = 0; i < pap->n_args; ++i) {
+ debugBelch(", ");
+ printPtr((StgPtr)pap->payload[i]);
+ }
+ debugBelch(")\n");
+ break;
+ }
+
+ case AP_STACK:
+ {
+ StgAP_STACK* ap = stgCast(StgAP_STACK*,obj);
+ StgWord i;
+ debugBelch("AP_STACK("); printPtr((StgPtr)ap->fun);
+ for (i = 0; i < ap->size; ++i) {
+ debugBelch(", ");
+ printPtr((P_)ap->payload[i]);
+ }
+ debugBelch(")\n");
+ break;
+ }
+
+ case IND:
+ debugBelch("IND(");
+ printPtr((StgPtr)stgCast(StgInd*,obj)->indirectee);
+ debugBelch(")\n");
+ break;
+
+ case IND_OLDGEN:
+ debugBelch("IND_OLDGEN(");
+ printPtr((StgPtr)stgCast(StgInd*,obj)->indirectee);
+ debugBelch(")\n");
+ break;
+
+ case IND_PERM:
+ debugBelch("IND(");
+ printPtr((StgPtr)stgCast(StgInd*,obj)->indirectee);
+ debugBelch(")\n");
+ break;
+
+ case IND_OLDGEN_PERM:
+ debugBelch("IND_OLDGEN_PERM(");
+ printPtr((StgPtr)stgCast(StgInd*,obj)->indirectee);
+ debugBelch(")\n");
+ break;
+
+ case IND_STATIC:
+ debugBelch("IND_STATIC(");
+ printPtr((StgPtr)stgCast(StgInd*,obj)->indirectee);
+ debugBelch(")\n");
+ break;
+
+ /* Cannot happen -- use default case.
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ case RET_DYN:
+ case RET_FUN:
+ */
+
+ case UPDATE_FRAME:
+ {
+ StgUpdateFrame* u = stgCast(StgUpdateFrame*,obj);
+ debugBelch("UPDATE_FRAME(");
+ printPtr((StgPtr)GET_INFO(u));
+ debugBelch(",");
+ printPtr((StgPtr)u->updatee);
+ debugBelch(")\n");
+ break;
+ }
+
+ case CATCH_FRAME:
+ {
+ StgCatchFrame* u = stgCast(StgCatchFrame*,obj);
+ debugBelch("CATCH_FRAME(");
+ printPtr((StgPtr)GET_INFO(u));
+ debugBelch(",");
+ printPtr((StgPtr)u->handler);
+ debugBelch(")\n");
+ break;
+ }
+
+ case STOP_FRAME:
+ {
+ StgStopFrame* u = stgCast(StgStopFrame*,obj);
+ debugBelch("STOP_FRAME(");
+ printPtr((StgPtr)GET_INFO(u));
+ debugBelch(")\n");
+ break;
+ }
+
+ case CAF_BLACKHOLE:
+ debugBelch("CAF_BH");
+ break;
+
+ case BLACKHOLE:
+ debugBelch("BH\n");
+ break;
+
+ case SE_BLACKHOLE:
+ debugBelch("SE_BH\n");
+ break;
+
+ case SE_CAF_BLACKHOLE:
+ debugBelch("SE_CAF_BH\n");
+ break;
+
+ case ARR_WORDS:
+ {
+ StgWord i;
+ debugBelch("ARR_WORDS(\"");
+ /* ToDo: we can't safely assume that this is a string!
+ for (i = 0; arrWordsGetChar(obj,i); ++i) {
+ putchar(arrWordsGetChar(obj,i));
+ } */
+ for (i=0; i<((StgArrWords *)obj)->words; i++)
+ debugBelch("%lu", (lnat)((StgArrWords *)obj)->payload[i]);
+ debugBelch("\")\n");
+ break;
+ }
+
+ case MUT_ARR_PTRS_CLEAN:
+ debugBelch("MUT_ARR_PTRS_CLEAN(size=%lu)\n", (lnat)((StgMutArrPtrs *)obj)->ptrs);
+ break;
+
+ case MUT_ARR_PTRS_DIRTY:
+ debugBelch("MUT_ARR_PTRS_DIRTY(size=%lu)\n", (lnat)((StgMutArrPtrs *)obj)->ptrs);
+ break;
+
+ case MUT_ARR_PTRS_FROZEN:
+ debugBelch("MUT_ARR_PTRS_FROZEN(size=%lu)\n", (lnat)((StgMutArrPtrs *)obj)->ptrs);
+ break;
+
+ case MVAR:
+ {
+ StgMVar* mv = (StgMVar*)obj;
+ debugBelch("MVAR(head=%p, tail=%p, value=%p)\n", mv->head, mv->tail, mv->value);
+ break;
+ }
+
+ case MUT_VAR_CLEAN:
+ {
+ StgMutVar* mv = (StgMutVar*)obj;
+ debugBelch("MUT_VAR_CLEAN(var=%p)\n", mv->var);
+ break;
+ }
+
+ case MUT_VAR_DIRTY:
+ {
+ StgMutVar* mv = (StgMutVar*)obj;
+ debugBelch("MUT_VAR_DIRTY(var=%p)\n", mv->var);
+ break;
+ }
+
+ case WEAK:
+ debugBelch("WEAK(");
+ debugBelch(" key=%p value=%p finalizer=%p",
+ (StgPtr)(((StgWeak*)obj)->key),
+ (StgPtr)(((StgWeak*)obj)->value),
+ (StgPtr)(((StgWeak*)obj)->finalizer));
+ debugBelch(")\n");
+ /* ToDo: chase 'link' ? */
+ break;
+
+ case STABLE_NAME:
+ debugBelch("STABLE_NAME(%lu)\n", (lnat)((StgStableName*)obj)->sn);
+ break;
+
+ case TSO:
+ debugBelch("TSO(");
+ debugBelch("%d (%p)",((StgTSO*)obj)->id, (StgTSO*)obj);
+ debugBelch(")\n");
+ break;
+
+#if defined(PAR)
+ case BLOCKED_FETCH:
+ debugBelch("BLOCKED_FETCH(");
+ printGA(&(stgCast(StgBlockedFetch*,obj)->ga));
+ printPtr((StgPtr)(stgCast(StgBlockedFetch*,obj)->node));
+ debugBelch(")\n");
+ break;
+
+ case FETCH_ME:
+ debugBelch("FETCH_ME(");
+ printGA((globalAddr *)stgCast(StgFetchMe*,obj)->ga);
+ debugBelch(")\n");
+ break;
+
+ case FETCH_ME_BQ:
+ debugBelch("FETCH_ME_BQ(");
+ // printGA((globalAddr *)stgCast(StgFetchMe*,obj)->ga);
+ printPtr((StgPtr)stgCast(StgFetchMeBlockingQueue*,obj)->blocking_queue);
+ debugBelch(")\n");
+ break;
+#endif
+
+#if defined(GRAN) || defined(PAR)
+ case RBH:
+ debugBelch("RBH(");
+ printPtr((StgPtr)stgCast(StgRBH*,obj)->blocking_queue);
+ debugBelch(")\n");
+ break;
+
+#endif
+
+#if 0
+ /* Symptomatic of a problem elsewhere, have it fall-through & fail */
+ case EVACUATED:
+ debugBelch("EVACUATED(");
+ printClosure((StgEvacuated*)obj->evacuee);
+ debugBelch(")\n");
+ break;
+#endif
+
+#if defined(PAR) && defined(DIST)
+ case REMOTE_REF:
+ debugBelch("REMOTE_REF(");
+ printGA((globalAddr *)stgCast(StgFetchMe*,obj)->ga);
+ debugBelch(")\n");
+ break;
+#endif
+
+ default:
+ //barf("printClosure %d",get_itbl(obj)->type);
+ debugBelch("*** printClosure: unknown type %d ****\n",
+ get_itbl(obj)->type );
+ barf("printClosure %d",get_itbl(obj)->type);
+ return;
+ }
+}
+
+/*
+void printGraph( StgClosure *obj )
+{
+ printClosure(obj);
+}
+*/
+
+StgPtr
+printStackObj( StgPtr sp )
+{
+ /*debugBelch("Stack[%d] = ", &stgStack[STACK_SIZE] - sp); */
+
+ StgClosure* c = (StgClosure*)(*sp);
+ printPtr((StgPtr)*sp);
+ if (c == (StgClosure*)&stg_ctoi_R1p_info) {
+ debugBelch("\t\t\tstg_ctoi_ret_R1p_info\n" );
+ } else
+ if (c == (StgClosure*)&stg_ctoi_R1n_info) {
+ debugBelch("\t\t\tstg_ctoi_ret_R1n_info\n" );
+ } else
+ if (c == (StgClosure*)&stg_ctoi_F1_info) {
+ debugBelch("\t\t\tstg_ctoi_ret_F1_info\n" );
+ } else
+ if (c == (StgClosure*)&stg_ctoi_D1_info) {
+ debugBelch("\t\t\tstg_ctoi_ret_D1_info\n" );
+ } else
+ if (c == (StgClosure*)&stg_ctoi_V_info) {
+ debugBelch("\t\t\tstg_ctoi_ret_V_info\n" );
+ } else
+ if (get_itbl(c)->type == BCO) {
+ debugBelch("\t\t\t");
+ debugBelch("BCO(...)\n");
+ }
+ else {
+ debugBelch("\t\t\t");
+ printClosure ( (StgClosure*)(*sp));
+ }
+ sp += 1;
+
+ return sp;
+
+}
+
+static void
+printSmallBitmap( StgPtr spBottom, StgPtr payload, StgWord bitmap, nat size )
+{
+ StgPtr p;
+ nat i;
+
+ p = payload;
+ for(i = 0; i < size; i++, bitmap >>= 1 ) {
+ debugBelch(" stk[%ld] (%p) = ", (long)(spBottom-(payload+i)), payload+i);
+ if ((bitmap & 1) == 0) {
+ printPtr((P_)payload[i]);
+ debugBelch("\n");
+ } else {
+ debugBelch("Word# %lu\n", (lnat)payload[i]);
+ }
+ }
+}
+
+static void
+printLargeBitmap( StgPtr spBottom, StgPtr payload, StgLargeBitmap* large_bitmap, nat size )
+{
+ StgWord bmp;
+ nat i, j;
+
+ i = 0;
+ for (bmp=0; i < size; bmp++) {
+ StgWord bitmap = large_bitmap->bitmap[bmp];
+ j = 0;
+ for(; i < size && j < BITS_IN(W_); j++, i++, bitmap >>= 1 ) {
+ debugBelch(" stk[%lu] (%p) = ", (lnat)(spBottom-(payload+i)), payload+i);
+ if ((bitmap & 1) == 0) {
+ printPtr((P_)payload[i]);
+ debugBelch("\n");
+ } else {
+ debugBelch("Word# %lu\n", (lnat)payload[i]);
+ }
+ }
+ }
+}
+
+void
+printStackChunk( StgPtr sp, StgPtr spBottom )
+{
+ StgWord bitmap;
+ const StgInfoTable *info;
+
+ ASSERT(sp <= spBottom);
+ for (; sp < spBottom; sp += stack_frame_sizeW((StgClosure *)sp)) {
+
+ info = get_itbl((StgClosure *)sp);
+
+ switch (info->type) {
+
+ case UPDATE_FRAME:
+ case CATCH_FRAME:
+ printObj((StgClosure*)sp);
+ continue;
+
+ case STOP_FRAME:
+ printObj((StgClosure*)sp);
+ return;
+
+ case RET_DYN:
+ {
+ StgRetDyn* r;
+ StgPtr p;
+ StgWord dyn;
+ nat size;
+
+ r = (StgRetDyn *)sp;
+ dyn = r->liveness;
+ debugBelch("RET_DYN (%p)\n", r);
+
+ p = (P_)(r->payload);
+ printSmallBitmap(spBottom, sp,
+ RET_DYN_LIVENESS(r->liveness),
+ RET_DYN_BITMAP_SIZE);
+ p += RET_DYN_BITMAP_SIZE + RET_DYN_NONPTR_REGS_SIZE;
+
+ for (size = RET_DYN_NONPTRS(dyn); size > 0; size--) {
+ debugBelch(" stk[%ld] (%p) = ", (long)(spBottom-p), p);
+ debugBelch("Word# %ld\n", (long)*p);
+ p++;
+ }
+
+ for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
+ debugBelch(" stk[%ld] (%p) = ", (long)(spBottom-p), p);
+ printPtr(p);
+ p++;
+ }
+ continue;
+ }
+
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ debugBelch("RET_SMALL (%p)\n", info);
+ bitmap = info->layout.bitmap;
+ printSmallBitmap(spBottom, sp+1,
+ BITMAP_BITS(bitmap), BITMAP_SIZE(bitmap));
+ continue;
+
+ case RET_BCO: {
+ StgBCO *bco;
+
+ bco = ((StgBCO *)sp[1]);
+
+ debugBelch("RET_BCO (%p)\n", sp);
+ printLargeBitmap(spBottom, sp+2,
+ BCO_BITMAP(bco), BCO_BITMAP_SIZE(bco));
+ continue;
+ }
+
+ case RET_BIG:
+ case RET_VEC_BIG:
+ barf("todo");
+
+ case RET_FUN:
+ {
+ StgFunInfoTable *fun_info;
+ StgRetFun *ret_fun;
+ nat size;
+
+ ret_fun = (StgRetFun *)sp;
+ fun_info = get_fun_itbl(ret_fun->fun);
+ size = ret_fun->size;
+ debugBelch("RET_FUN (%p) (type=%d)\n", ret_fun->fun, fun_info->f.fun_type);
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ printSmallBitmap(spBottom, sp+2,
+ BITMAP_BITS(fun_info->f.b.bitmap),
+ BITMAP_SIZE(fun_info->f.b.bitmap));
+ break;
+ case ARG_GEN_BIG:
+ printLargeBitmap(spBottom, sp+2,
+ GET_FUN_LARGE_BITMAP(fun_info),
+ GET_FUN_LARGE_BITMAP(fun_info)->size);
+ break;
+ default:
+ printSmallBitmap(spBottom, sp+2,
+ BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]),
+ BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]));
+ break;
+ }
+ continue;
+ }
+
+ default:
+ debugBelch("unknown object %d\n", info->type);
+ barf("printStackChunk");
+ }
+ }
+}
+
+void printTSO( StgTSO *tso )
+{
+ printStackChunk( tso->sp, tso->stack+tso->stack_size);
+}
+
+/* -----------------------------------------------------------------------------
+ Closure types
+
+ NOTE: must be kept in sync with the closure types in includes/ClosureTypes.h
+ -------------------------------------------------------------------------- */
+
+static char *closure_type_names[] = {
+  "INVALID_OBJECT",
+  "CONSTR",
+  "CONSTR_1_0",          /* were truncated ("CONSTR_1" etc.); must match ClosureTypes.h */
+  "CONSTR_0_1",
+  "CONSTR_2_0",
+  "CONSTR_1_1",
+  "CONSTR_0_2",
+  "CONSTR_INTLIKE",
+  "CONSTR_CHARLIKE",
+  "CONSTR_STATIC",
+  "CONSTR_NOCAF_STATIC",
+  "FUN",
+  "FUN_1_0",
+  "FUN_0_1",
+  "FUN_2_0",
+  "FUN_1_1",
+  "FUN_0_2",             /* was truncated "FUN_0" */
+  "FUN_STATIC",
+  "THUNK",
+  "THUNK_1_0",
+  "THUNK_0_1",
+  "THUNK_2_0",
+  "THUNK_1_1",
+  "THUNK_0_2",           /* was truncated "THUNK_0" */
+  "THUNK_STATIC",
+  "THUNK_SELECTOR",
+  "BCO",
+  "AP",                  /* was stale "AP_UPD"; printClosure switches on AP */
+  "PAP",
+  "AP_STACK",
+  "IND",
+  "IND_OLDGEN",
+  "IND_PERM",
+  "IND_OLDGEN_PERM",
+  "IND_STATIC",
+  "RET_BCO",
+  "RET_SMALL",
+  "RET_VEC_SMALL",
+  "RET_BIG",
+  "RET_VEC_BIG",
+  "RET_DYN",
+  "RET_FUN",
+  "UPDATE_FRAME",
+  "CATCH_FRAME",
+  "STOP_FRAME",
+  "CAF_BLACKHOLE",
+  "BLACKHOLE",
+  "BLACKHOLE_BQ",
+  "SE_BLACKHOLE",
+  "SE_CAF_BLACKHOLE",
+  "MVAR",
+  "ARR_WORDS",
+  "MUT_ARR_PTRS_CLEAN",
+  "MUT_ARR_PTRS_DIRTY",
+  "MUT_ARR_PTRS_FROZEN",
+  "MUT_VAR_CLEAN",
+  "MUT_VAR_DIRTY",
+  "MUT_CONS",
+  "WEAK",
+  "FOREIGN",
+  "STABLE_NAME",
+  "TSO",
+  "BLOCKED_FETCH",
+  "FETCH_ME",
+  "FETCH_ME_BQ",
+  "RBH",
+  "EVACUATED",
+  "REMOTE_REF",
+  "TVAR_WAIT_QUEUE",
+  "TVAR",
+  "TREC_CHUNK",
+  "TREC_HEADER",
+  "ATOMICALLY_FRAME",
+  "CATCH_RETRY_FRAME"
+};
+
+
+char *
+info_type(StgClosure *closure){
+ return closure_type_names[get_itbl(closure)->type];
+}
+
+char *
+info_type_by_ip(StgInfoTable *ip){
+ return closure_type_names[ip->type];
+}
+
+void
+info_hdr_type(StgClosure *closure, char *res){
+ strcpy(res,closure_type_names[get_itbl(closure)->type]);
+}
+
+/* --------------------------------------------------------------------------
+ * Address printing code
+ *
+ * Uses symbol table in (unstripped executable)
+ * ------------------------------------------------------------------------*/
+
+/* --------------------------------------------------------------------------
+ * Simple lookup table
+ *
+ * Current implementation is pretty dumb!
+ * ------------------------------------------------------------------------*/
+
+struct entry {
+ nat value;
+ const char *name;
+};
+
+static nat table_size;
+static struct entry* table;
+
+#ifdef USING_LIBBFD
+static nat max_table_size;
+
+static void reset_table( int size )
+{
+ max_table_size = size;
+ table_size = 0;
+ table = (struct entry *)stgMallocBytes(size * sizeof(struct entry), "Printer.c:reset_table()");
+}
+
+static void prepare_table( void )
+{
+ /* Could sort it... */
+}
+
+static void insert( unsigned value, const char *name )
+{
+ if ( table_size >= max_table_size ) {
+ barf( "Symbol table overflow\n" );
+ }
+ table[table_size].value = value;
+ table[table_size].name = name;
+ table_size = table_size + 1;
+}
+#endif
+
+#if 0
+static rtsBool lookup_name( char *name, unsigned *result )
+{
+ int i;
+ for( i = 0; i < table_size && strcmp(name,table[i].name) != 0; ++i ) {
+ }
+ if (i < table_size) {
+ *result = table[i].value;
+ return rtsTrue;
+ } else {
+ return rtsFalse;
+ }
+}
+#endif
+
+/* Code from somewhere inside GHC (circa 1994)
+ * * Z-escapes:
+ * "std"++xs -> "Zstd"++xs
+ * char_to_c 'Z' = "ZZ"
+ * char_to_c '&' = "Za"
+ * char_to_c '|' = "Zb"
+ * char_to_c ':' = "Zc"
+ * char_to_c '/' = "Zd"
+ * char_to_c '=' = "Ze"
+ * char_to_c '>' = "Zg"
+ * char_to_c '#' = "Zh"
+ * char_to_c '<' = "Zl"
+ * char_to_c '-' = "Zm"
+ * char_to_c '!' = "Zn"
+ * char_to_c '.' = "Zo"
+ * char_to_c '+' = "Zp"
+ * char_to_c '\'' = "Zq"
+ * char_to_c '*' = "Zt"
+ * char_to_c '_' = "Zu"
+ * char_to_c c = "Z" ++ show (ord c)
+ */
+static char unZcode( char ch )
+{
+ switch (ch) {
+ case 'a' : return ('&');
+ case 'b' : return ('|');
+ case 'c' : return (':');
+ case 'd' : return ('/');
+ case 'e' : return ('=');
+ case 'g' : return ('>');
+ case 'h' : return ('#');
+ case 'l' : return ('<');
+ case 'm' : return ('-');
+ case 'n' : return ('!');
+ case 'o' : return ('.');
+ case 'p' : return ('+');
+ case 'q' : return ('\'');
+ case 't' : return ('*');
+ case 'u' : return ('_');
+ case 'Z' :
+ case '\0' : return ('Z');
+ default : return (ch);
+ }
+}
+
+#if 0
+/* Precondition: out big enough to handle output (about twice length of in) */
+static void enZcode( char *in, char *out )
+{
+ int i, j;
+
+ j = 0;
+ out[ j++ ] = '_';
+ for( i = 0; in[i] != '\0'; ++i ) {
+ switch (in[i]) {
+ case 'Z' :
+ out[j++] = 'Z';
+ out[j++] = 'Z';
+ break;
+ case '&' :
+ out[j++] = 'Z';
+ out[j++] = 'a';
+ break;
+ case '|' :
+ out[j++] = 'Z';
+ out[j++] = 'b';
+ break;
+ case ':' :
+ out[j++] = 'Z';
+ out[j++] = 'c';
+ break;
+ case '/' :
+ out[j++] = 'Z';
+ out[j++] = 'd';
+ break;
+ case '=' :
+ out[j++] = 'Z';
+ out[j++] = 'e';
+ break;
+ case '>' :
+ out[j++] = 'Z';
+ out[j++] = 'g';
+ break;
+ case '#' :
+ out[j++] = 'Z';
+ out[j++] = 'h';
+ break;
+ case '<' :
+ out[j++] = 'Z';
+ out[j++] = 'l';
+ break;
+ case '-' :
+ out[j++] = 'Z';
+ out[j++] = 'm';
+ break;
+ case '!' :
+ out[j++] = 'Z';
+ out[j++] = 'n';
+ break;
+ case '.' :
+ out[j++] = 'Z';
+ out[j++] = 'o';
+ break;
+ case '+' :
+ out[j++] = 'Z';
+ out[j++] = 'p';
+ break;
+ case '\'' :
+ out[j++] = 'Z';
+ out[j++] = 'q';
+ break;
+ case '*' :
+ out[j++] = 'Z';
+ out[j++] = 't';
+ break;
+ case '_' :
+ out[j++] = 'Z';
+ out[j++] = 'u';
+ break;
+ default :
+ out[j++] = in[i];
+ break;
+ }
+ }
+ out[j] = '\0';
+}
+#endif
+
+const char *lookupGHCName( void *addr )   /* map a code/data address to its symbol name, or NULL */
+{
+    nat i;
+    for( i = 0; i < table_size && table[i].value != (unsigned) addr; ++i ) {   /* linear scan; NOTE(review): (unsigned) truncates addr on LP64 — values in 'insert' are equally truncated, confirm false positives acceptable for debug output */
+    }
+    if (i < table_size) {
+        return table[i].name;
+    } else {
+        return NULL;
+    }
+}
+
+static void printZcoded( const char *raw )   /* print a Z-encoded symbol, decoding Z-escapes */
+{
+    nat j = 0;
+
+    while ( raw[j] != '\0' ) {
+        if (raw[j] == 'Z') {
+            debugBelch("%c", unZcode(raw[j+1]));
+            j = j + 2;
+        } else {
+            debugBelch("%c", raw[j]);   /* was unZcode(raw[j+1]): off by one AND z-decoded an unescaped char, mangling every name */
+            j = j + 1;
+        }
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Symbol table loading
+ * ------------------------------------------------------------------------*/
+
+/* Causing linking trouble on Win32 plats, so I'm
+ disabling this for now.
+*/
+#ifdef USING_LIBBFD
+
+#include <bfd.h>
+
+/* Fairly ad-hoc piece of code that seems to filter out a lot of
+ * rubbish like the obj-splitting symbols
+ */
+
+static rtsBool isReal( flagword flags STG_UNUSED, const char *name )
+{
+#if 0
+ /* ToDo: make this work on BFD */
+ int tp = type & N_TYPE;
+ if (tp == N_TEXT || tp == N_DATA) {
+ return (name[0] == '_' && name[1] != '_');
+ } else {
+ return rtsFalse;
+ }
+#else
+ if (*name == '\0' ||
+ (name[0] == 'g' && name[1] == 'c' && name[2] == 'c') ||
+ (name[0] == 'c' && name[1] == 'c' && name[2] == '.')) {
+ return rtsFalse;
+ }
+ return rtsTrue;
+#endif
+}
+
+extern void DEBUG_LoadSymbols( char *name )
+{
+ bfd* abfd;
+ char **matching;
+
+ bfd_init();
+ abfd = bfd_openr(name, "default");
+ if (abfd == NULL) {
+ barf("can't open executable %s to get symbol table", name);
+ }
+ if (!bfd_check_format_matches (abfd, bfd_object, &matching)) {
+ barf("mismatch");
+ }
+
+ {
+ long storage_needed;
+ asymbol **symbol_table;
+ long number_of_symbols;
+ long num_real_syms = 0;
+ long i;
+
+ storage_needed = bfd_get_symtab_upper_bound (abfd);
+
+ if (storage_needed < 0) {
+ barf("can't read symbol table");
+ }
+#if 0
+ if (storage_needed == 0) {
+ debugBelch("no storage needed");
+ }
+#endif
+ symbol_table = (asymbol **) stgMallocBytes(storage_needed,"DEBUG_LoadSymbols");
+
+ number_of_symbols = bfd_canonicalize_symtab (abfd, symbol_table);
+
+ if (number_of_symbols < 0) {
+ barf("can't canonicalise symbol table");
+ }
+
+ for( i = 0; i != number_of_symbols; ++i ) {
+ symbol_info info;
+ bfd_get_symbol_info(abfd,symbol_table[i],&info);
+ /*debugBelch("\t%c\t0x%x \t%s\n",info.type,(nat)info.value,info.name); */
+ if (isReal(info.type, info.name)) {
+ num_real_syms += 1;
+ }
+ }
+
+ IF_DEBUG(interpreter,
+ debugBelch("Loaded %ld symbols. Of which %ld are real symbols\n",
+ number_of_symbols, num_real_syms)
+ );
+
+ reset_table( num_real_syms );
+
+ for( i = 0; i != number_of_symbols; ++i ) {
+ symbol_info info;
+ bfd_get_symbol_info(abfd,symbol_table[i],&info);
+ if (isReal(info.type, info.name)) {
+ insert( info.value, info.name );
+ }
+ }
+
+ stgFree(symbol_table);
+ }
+ prepare_table();
+}
+
+#else /* HAVE_BFD_H */
+
+extern void DEBUG_LoadSymbols( char *name STG_UNUSED )
+{
+ /* nothing, yet */
+}
+
+#endif /* HAVE_BFD_H */
+
+void findPtr(P_ p, int); /* keep gcc -Wall happy */
+
+void
+findPtr(P_ p, int follow)
+{
+ nat s, g;
+ P_ q, r;
+ bdescr *bd;
+#if defined(__GNUC__)
+ const int arr_size = 1024;
+#else
+#define arr_size 1024
+#endif
+ StgPtr arr[arr_size];
+ int i = 0;
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ bd = generations[g].steps[s].blocks;
+ for (; bd; bd = bd->link) {
+ for (q = bd->start; q < bd->free; q++) {
+ if (*q == (W_)p) {
+ if (i < arr_size) {
+ r = q;
+ while (!LOOKS_LIKE_INFO_PTR(*r) || (P_)*r == NULL) {
+ r--;
+ }
+ debugBelch("%p = ", r);
+ printClosure((StgClosure *)r);
+ arr[i++] = r;
+ } else {
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (follow && i == 1) {
+ debugBelch("-->\n");
+ findPtr(arr[0], 1);
+ }
+}
+
+#else /* DEBUG */
+void printPtr( StgPtr p )
+{
+ debugBelch("ptr 0x%p (enable -DDEBUG for more info) " , p );
+}
+
+void printObj( StgClosure *obj )
+{
+ debugBelch("obj 0x%p (enable -DDEBUG for more info) " , obj );
+}
+#endif /* DEBUG */
diff --git a/rts/Printer.h b/rts/Printer.h
new file mode 100644
index 0000000000..54bf611250
--- /dev/null
+++ b/rts/Printer.h
@@ -0,0 +1,31 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Prototypes for functions in Printer.c
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PRINTER_H
+#define PRINTER_H
+
+extern void printPtr ( StgPtr p );
+extern void printObj ( StgClosure *obj );
+
+#ifdef DEBUG
+extern void printClosure ( StgClosure *obj );
+extern StgStackPtr printStackObj ( StgStackPtr sp );
+extern void printStackChunk ( StgStackPtr sp, StgStackPtr spLim );
+extern void printTSO ( StgTSO *tso );
+
+void info_hdr_type ( StgClosure *closure, char *res );
+char * info_type ( StgClosure *closure );
+char * info_type_by_ip ( StgInfoTable *ip );
+
+extern void DEBUG_LoadSymbols( char *name );
+
+extern const char *lookupGHCName( void *addr );
+#endif
+
+#endif /* PRINTER_H */
+
diff --git a/rts/ProfHeap.c b/rts/ProfHeap.c
new file mode 100644
index 0000000000..312bee735c
--- /dev/null
+++ b/rts/ProfHeap.c
@@ -0,0 +1,1156 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2003
+ *
+ * Support for heap profiling
+ *
+ * ---------------------------------------------------------------------------*/
+
+#if defined(DEBUG) && !defined(PROFILING)
+#define DEBUG_HEAP_PROF
+#else
+#undef DEBUG_HEAP_PROF
+#endif
+
+#if defined(PROFILING) || defined(DEBUG_HEAP_PROF)
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "Profiling.h"
+#include "Storage.h"
+#include "ProfHeap.h"
+#include "Stats.h"
+#include "Hash.h"
+#include "RetainerProfile.h"
+#include "LdvProfile.h"
+#include "Arena.h"
+#include "Printer.h"
+
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* -----------------------------------------------------------------------------
+ * era stores the current time period. It is the same as the
+ * number of censuses that have been performed.
+ *
+ * RESTRICTION:
+ * era must be no longer than LDV_SHIFT (15 or 30) bits.
+ * Invariants:
+ * era is initialized to 1 in initHeapProfiling().
+ *
+ * max_era is initialized to 2^LDV_SHIFT in initHeapProfiling().
+ * When era reaches max_era, the profiling stops because a closure can
+ * store only up to (max_era - 1) as its creation or last use time.
+ * -------------------------------------------------------------------------- */
+unsigned int era;
+static nat max_era;
+
+/* -----------------------------------------------------------------------------
+ * Counters
+ *
+ * For most heap profiles each closure identity gets a simple count
+ * of live words in the heap at each census. However, if we're
+ * selecting by biography, then we have to keep the various
+ * lag/drag/void counters for each identity.
+ * -------------------------------------------------------------------------- */
+typedef struct _counter {
+ void *identity;
+ union {
+ nat resid;
+ struct {
+ int prim; // total size of 'inherently used' closures
+ int not_used; // total size of 'never used' closures
+ int used; // total size of 'used at least once' closures
+ int void_total; // current total size of 'destroyed without being used' closures
+ int drag_total; // current total size of 'used at least once and waiting to die'
+ } ldv;
+ } c;
+ struct _counter *next;
+} counter;
+
+STATIC_INLINE void
+initLDVCtr( counter *ctr )
+{
+ ctr->c.ldv.prim = 0;
+ ctr->c.ldv.not_used = 0;
+ ctr->c.ldv.used = 0;
+ ctr->c.ldv.void_total = 0;
+ ctr->c.ldv.drag_total = 0;
+}
+
+typedef struct {
+ double time; // the time in MUT time when the census is made
+ HashTable * hash;
+ counter * ctrs;
+ Arena * arena;
+
+ // for LDV profiling, when just displaying by LDV
+ int prim;
+ int not_used;
+ int used;
+ int void_total;
+ int drag_total;
+} Census;
+
+static Census *censuses = NULL;
+static nat n_censuses = 0;
+
+#ifdef PROFILING
+static void aggregateCensusInfo( void );
+#endif
+
+static void dumpCensus( Census *census );
+
+/* -----------------------------------------------------------------------------
+ Closure Type Profiling;
+
+ PROBABLY TOTALLY OUT OF DATE -- ToDo (SDM)
+ -------------------------------------------------------------------------- */
+
+#ifdef DEBUG_HEAP_PROF
+static char *type_names[] = {
+ "INVALID_OBJECT"
+ , "CONSTR"
+ , "CONSTR_INTLIKE"
+ , "CONSTR_CHARLIKE"
+ , "CONSTR_STATIC"
+ , "CONSTR_NOCAF_STATIC"
+
+ , "FUN"
+ , "FUN_STATIC"
+
+ , "THUNK"
+ , "THUNK_STATIC"
+ , "THUNK_SELECTOR"
+
+ , "BCO"
+ , "AP_STACK"
+ , "AP"
+
+ , "PAP"
+
+ , "IND"
+ , "IND_OLDGEN"
+ , "IND_PERM"
+ , "IND_OLDGEN_PERM"
+ , "IND_STATIC"
+
+ , "RET_BCO"
+ , "RET_SMALL"
+ , "RET_VEC_SMALL"
+ , "RET_BIG"
+ , "RET_VEC_BIG"
+ , "RET_DYN"
+ , "UPDATE_FRAME"
+ , "CATCH_FRAME"
+ , "STOP_FRAME"
+
+ , "BLACKHOLE"
+ , "MVAR"
+
+ , "ARR_WORDS"
+
+ , "MUT_ARR_PTRS_CLEAN"
+ , "MUT_ARR_PTRS_DIRTY"
+ , "MUT_ARR_PTRS_FROZEN"
+ , "MUT_VAR_CLEAN"
+ , "MUT_VAR_DIRTY"
+
+ , "WEAK"
+
+ , "TSO"
+
+ , "BLOCKED_FETCH"
+ , "FETCH_ME"
+
+ , "EVACUATED"
+};
+
+#endif /* DEBUG_HEAP_PROF */
+
+/* -----------------------------------------------------------------------------
+ * Find the "closure identity", which is a unique pointer representing
+ * the band to which this closure's heap space is attributed in the
+ * heap profile.
+ * ------------------------------------------------------------------------- */
+STATIC_INLINE void *
+closureIdentity( StgClosure *p )
+{
+ switch (RtsFlags.ProfFlags.doHeapProfile) {
+
+#ifdef PROFILING
+ case HEAP_BY_CCS:
+ return p->header.prof.ccs;
+ case HEAP_BY_MOD:
+ return p->header.prof.ccs->cc->module;
+ case HEAP_BY_DESCR:
+ return get_itbl(p)->prof.closure_desc;
+ case HEAP_BY_TYPE:
+ return get_itbl(p)->prof.closure_type;
+ case HEAP_BY_RETAINER:
+ // AFAIK, the only closures in the heap which might not have a
+ // valid retainer set are DEAD_WEAK closures.
+ if (isRetainerSetFieldValid(p))
+ return retainerSetOf(p);
+ else
+ return NULL;
+
+#else // DEBUG
+ case HEAP_BY_INFOPTR:
+ return (void *)((StgClosure *)p)->header.info;
+ case HEAP_BY_CLOSURE_TYPE:
+ return type_names[get_itbl(p)->type];
+
+#endif
+ default:
+ barf("closureIdentity");
+ }
+}
+
+/* --------------------------------------------------------------------------
+ * Profiling type predicates
+ * ----------------------------------------------------------------------- */
+#ifdef PROFILING
+STATIC_INLINE rtsBool
+doingLDVProfiling( void )
+{
+ return (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
+ || RtsFlags.ProfFlags.bioSelector != NULL);
+}
+
+STATIC_INLINE rtsBool
+doingRetainerProfiling( void )
+{
+ return (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER
+ || RtsFlags.ProfFlags.retainerSelector != NULL);
+}
+#endif /* PROFILING */
+
+// Processes a closure 'c' being destroyed whose size is 'size'.
+// Make sure that LDV_recordDead() is not invoked on 'inherently used' closures
+// such as TSO; they should not be involved in computing dragNew or voidNew.
+//
+// Even though era is checked in both LdvCensusForDead() and
+// LdvCensusKillAll(), we still need to make sure that era is > 0 because
+// LDV_recordDead() may be called from elsewhere in the runtime system. E.g.,
+// when a thunk is replaced by an indirection object.
+
+#ifdef PROFILING
+void
+LDV_recordDead( StgClosure *c, nat size )
+{
+ void *id;
+ nat t;
+ counter *ctr;
+
+ if (era > 0 && closureSatisfiesConstraints(c)) {
+ size -= sizeofW(StgProfHeader);
+ ASSERT(LDVW(c) != 0);
+ if ((LDVW((c)) & LDV_STATE_MASK) == LDV_STATE_CREATE) {
+ t = (LDVW((c)) & LDV_CREATE_MASK) >> LDV_SHIFT;
+ if (t < era) {
+ if (RtsFlags.ProfFlags.bioSelector == NULL) {
+ censuses[t].void_total += (int)size;
+ censuses[era].void_total -= (int)size;
+ ASSERT(censuses[t].void_total < censuses[t].not_used);
+ } else {
+ id = closureIdentity(c);
+ ctr = lookupHashTable(censuses[t].hash, (StgWord)id);
+ ASSERT( ctr != NULL );
+ ctr->c.ldv.void_total += (int)size;
+ ctr = lookupHashTable(censuses[era].hash, (StgWord)id);
+ if (ctr == NULL) {
+ ctr = arenaAlloc(censuses[era].arena, sizeof(counter));
+ initLDVCtr(ctr);
+ insertHashTable(censuses[era].hash, (StgWord)id, ctr);
+ ctr->identity = id;
+ ctr->next = censuses[era].ctrs;
+ censuses[era].ctrs = ctr;
+ }
+ ctr->c.ldv.void_total -= (int)size;
+ }
+ }
+ } else {
+ t = LDVW((c)) & LDV_LAST_MASK;
+ if (t + 1 < era) {
+ if (RtsFlags.ProfFlags.bioSelector == NULL) {
+ censuses[t+1].drag_total += size;
+ censuses[era].drag_total -= size;
+ } else {
+ void *id;
+ id = closureIdentity(c);
+ ctr = lookupHashTable(censuses[t+1].hash, (StgWord)id);
+ ASSERT( ctr != NULL );
+ ctr->c.ldv.drag_total += (int)size;
+ ctr = lookupHashTable(censuses[era].hash, (StgWord)id);
+ if (ctr == NULL) {
+ ctr = arenaAlloc(censuses[era].arena, sizeof(counter));
+ initLDVCtr(ctr);
+ insertHashTable(censuses[era].hash, (StgWord)id, ctr);
+ ctr->identity = id;
+ ctr->next = censuses[era].ctrs;
+ censuses[era].ctrs = ctr;
+ }
+ ctr->c.ldv.drag_total -= (int)size;
+ }
+ }
+ }
+ }
+}
+#endif
+
+/* --------------------------------------------------------------------------
+ * Initialize censuses[era];
+ * ----------------------------------------------------------------------- */
+STATIC_INLINE void
+initEra(Census *census)
+{
+ census->hash = allocHashTable();
+ census->ctrs = NULL;
+ census->arena = newArena();
+
+ census->not_used = 0;
+ census->used = 0;
+ census->prim = 0;
+ census->void_total = 0;
+ census->drag_total = 0;
+}
+
+/* --------------------------------------------------------------------------
+ * Increases era by 1 and initialize census[era].
+ * Reallocates gi[] and increases its size if needed.
+ * ----------------------------------------------------------------------- */
+static void
+nextEra( void )
+{
+#ifdef PROFILING
+ if (doingLDVProfiling()) {
+ era++;
+
+ if (era == max_era) {
+ errorBelch("maximum number of censuses reached; use +RTS -i to reduce");
+ stg_exit(EXIT_FAILURE);
+ }
+
+ if (era == n_censuses) {
+ n_censuses *= 2;
+ censuses = stgReallocBytes(censuses, sizeof(Census) * n_censuses,
+ "nextEra");
+ }
+ }
+#endif /* PROFILING */
+
+ initEra( &censuses[era] );
+}
+
+/* -----------------------------------------------------------------------------
+ * DEBUG heap profiling, by info table
+ * -------------------------------------------------------------------------- */
+
+#ifdef DEBUG_HEAP_PROF
+FILE *hp_file;
+static char *hp_filename;
+
+void initProfiling1( void )
+{
+}
+
+void initProfiling2( void )
+{
+ if (RtsFlags.ProfFlags.doHeapProfile) {
+ /* Initialise the log file name */
+ hp_filename = stgMallocBytes(strlen(prog_name) + 6, "hpFileName");
+ sprintf(hp_filename, "%s.hp", prog_name);
+
+ /* open the log file */
+ if ((hp_file = fopen(hp_filename, "w")) == NULL) {
+ debugBelch("Can't open profiling report file %s\n",
+ hp_filename);
+ RtsFlags.ProfFlags.doHeapProfile = 0;
+ return;
+ }
+ }
+
+ initHeapProfiling();
+}
+
+void endProfiling( void )
+{
+ endHeapProfiling();
+}
+#endif /* DEBUG_HEAP_PROF */
+
+static void
+printSample(rtsBool beginSample, StgDouble sampleValue)
+{
+ StgDouble fractionalPart, integralPart;
+ fractionalPart = modf(sampleValue, &integralPart);
+ fprintf(hp_file, "%s %d.%02d\n",
+ (beginSample ? "BEGIN_SAMPLE" : "END_SAMPLE"),
+ (int)integralPart, (int)(fractionalPart * 100));
+}
+
+/* --------------------------------------------------------------------------
+ * Initialize the heap profiler
+ * ----------------------------------------------------------------------- */
+nat
+initHeapProfiling(void)
+{
+ if (! RtsFlags.ProfFlags.doHeapProfile) {
+ return 0;
+ }
+
+#ifdef PROFILING
+ if (doingLDVProfiling() && doingRetainerProfiling()) {
+ errorBelch("cannot mix -hb and -hr");
+ stg_exit(EXIT_FAILURE);
+ }
+#endif
+
+ // we only count eras if we're doing LDV profiling. Otherwise era
+ // is fixed at zero.
+#ifdef PROFILING
+ if (doingLDVProfiling()) {
+ era = 1;
+ } else
+#endif
+ {
+ era = 0;
+ }
+
+ { // max_era = 2^LDV_SHIFT
+ nat p;
+ max_era = 1;
+ for (p = 0; p < LDV_SHIFT; p++)
+ max_era *= 2;
+ }
+
+ n_censuses = 32;
+ censuses = stgMallocBytes(sizeof(Census) * n_censuses, "initHeapProfiling");
+
+ initEra( &censuses[era] );
+
+ /* initProfilingLogFile(); */
+ fprintf(hp_file, "JOB \"%s", prog_name);
+
+#ifdef PROFILING
+ {
+ int count;
+ for(count = 1; count < prog_argc; count++)
+ fprintf(hp_file, " %s", prog_argv[count]);
+ fprintf(hp_file, " +RTS");
+ for(count = 0; count < rts_argc; count++)
+ fprintf(hp_file, " %s", rts_argv[count]);
+ }
+#endif /* PROFILING */
+
+ fprintf(hp_file, "\"\n" );
+
+ fprintf(hp_file, "DATE \"%s\"\n", time_str());
+
+ fprintf(hp_file, "SAMPLE_UNIT \"seconds\"\n");
+ fprintf(hp_file, "VALUE_UNIT \"bytes\"\n");
+
+ printSample(rtsTrue, 0);
+ printSample(rtsFalse, 0);
+
+#ifdef DEBUG_HEAP_PROF
+ DEBUG_LoadSymbols(prog_name);
+#endif
+
+#ifdef PROFILING
+ if (doingRetainerProfiling()) {
+ initRetainerProfiling();
+ }
+#endif
+
+ return 0;
+}
+
+void
+endHeapProfiling(void)
+{
+ StgDouble seconds;
+
+ if (! RtsFlags.ProfFlags.doHeapProfile) {
+ return;
+ }
+
+#ifdef PROFILING
+ if (doingRetainerProfiling()) {
+ endRetainerProfiling();
+ }
+#endif
+
+#ifdef PROFILING
+ if (doingLDVProfiling()) {
+ nat t;
+ LdvCensusKillAll();
+ aggregateCensusInfo();
+ for (t = 1; t < era; t++) {
+ dumpCensus( &censuses[t] );
+ }
+ }
+#endif
+
+ seconds = mut_user_time();
+ printSample(rtsTrue, seconds);
+ printSample(rtsFalse, seconds);
+ fclose(hp_file);
+}
+
+
+
+#ifdef PROFILING
+static size_t
+buf_append(char *p, const char *q, char *end)
+{
+ int m;
+
+ for (m = 0; p < end; p++, q++, m++) {
+ *p = *q;
+ if (*q == '\0') { break; }
+ }
+ return m;
+}
+
+static void
+fprint_ccs(FILE *fp, CostCentreStack *ccs, nat max_length)
+{
+ char buf[max_length+1], *p, *buf_end;
+
+ // MAIN on its own gets printed as "MAIN", otherwise we ignore MAIN.
+ if (ccs == CCS_MAIN) {
+ fprintf(fp, "MAIN");
+ return;
+ }
+
+ fprintf(fp, "(%ld)", ccs->ccsID);
+
+ p = buf;
+ buf_end = buf + max_length + 1;
+
+ // keep printing components of the stack until we run out of space
+ // in the buffer. If we run out of space, end with "...".
+ for (; ccs != NULL && ccs != CCS_MAIN; ccs = ccs->prevStack) {
+
+ // CAF cost centres print as M.CAF, but we leave the module
+ // name out of all the others to save space.
+ if (!strcmp(ccs->cc->label,"CAF")) {
+ p += buf_append(p, ccs->cc->module, buf_end);
+ p += buf_append(p, ".CAF", buf_end);
+ } else {
+ if (ccs->prevStack != NULL && ccs->prevStack != CCS_MAIN) {
+ p += buf_append(p, "/", buf_end);
+ }
+ p += buf_append(p, ccs->cc->label, buf_end);
+ }
+
+ if (p >= buf_end) {
+ sprintf(buf+max_length-4, "...");
+ break;
+ }
+ }
+ fprintf(fp, "%s", buf);
+}
+#endif /* PROFILING */
+
+rtsBool
+strMatchesSelector( char* str, char* sel )
+{
+ char* p;
+ // debugBelch("str_matches_selector %s %s\n", str, sel);
+ while (1) {
+ // Compare str against wherever we've got to in sel.
+ p = str;
+ while (*p != '\0' && *sel != ',' && *sel != '\0' && *p == *sel) {
+ p++; sel++;
+ }
+ // Match if all of str used and have reached the end of a sel fragment.
+ if (*p == '\0' && (*sel == ',' || *sel == '\0'))
+ return rtsTrue;
+
+ // No match. Advance sel to the start of the next elem.
+ while (*sel != ',' && *sel != '\0') sel++;
+ if (*sel == ',') sel++;
+
+ /* Run out of sel ?? */
+ if (*sel == '\0') return rtsFalse;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Figure out whether a closure should be counted in this census, by
+ * testing against all the specified constraints.
+ * -------------------------------------------------------------------------- */
+rtsBool
+closureSatisfiesConstraints( StgClosure* p )
+{
+#ifdef DEBUG_HEAP_PROF
+ (void)p; /* keep gcc -Wall happy */
+ return rtsTrue;
+#else
+ rtsBool b;
+
+ // The CCS has a selected field to indicate whether this closure is
+ // deselected by not being mentioned in the module, CC, or CCS
+ // selectors.
+ if (!p->header.prof.ccs->selected) {
+ return rtsFalse;
+ }
+
+ if (RtsFlags.ProfFlags.descrSelector) {
+ b = strMatchesSelector( (get_itbl((StgClosure *)p))->prof.closure_desc,
+ RtsFlags.ProfFlags.descrSelector );
+ if (!b) return rtsFalse;
+ }
+ if (RtsFlags.ProfFlags.typeSelector) {
+ b = strMatchesSelector( (get_itbl((StgClosure *)p))->prof.closure_type,
+ RtsFlags.ProfFlags.typeSelector );
+ if (!b) return rtsFalse;
+ }
+ if (RtsFlags.ProfFlags.retainerSelector) {
+ RetainerSet *rs;
+ nat i;
+ // We must check that the retainer set is valid here. One
+ // reason it might not be valid is if this closure is a
+ // a newly deceased weak pointer (i.e. a DEAD_WEAK), since
+ // these aren't reached by the retainer profiler's traversal.
+ if (isRetainerSetFieldValid((StgClosure *)p)) {
+ rs = retainerSetOf((StgClosure *)p);
+ if (rs != NULL) {
+ for (i = 0; i < rs->num; i++) {
+ b = strMatchesSelector( rs->element[i]->cc->label,
+ RtsFlags.ProfFlags.retainerSelector );
+ if (b) return rtsTrue;
+ }
+ }
+ }
+ return rtsFalse;
+ }
+ return rtsTrue;
+#endif /* DEBUG_HEAP_PROF */
+}
+
+/* -----------------------------------------------------------------------------
+ * Aggregate the heap census info for biographical profiling
+ * -------------------------------------------------------------------------- */
+#ifdef PROFILING
+static void
+aggregateCensusInfo( void )
+{
+ HashTable *acc;
+ nat t;
+ counter *c, *d, *ctrs;
+ Arena *arena;
+
+ if (!doingLDVProfiling()) return;
+
+ // Aggregate the LDV counters when displaying by biography.
+ if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV) {
+ int void_total, drag_total;
+
+ // Now we compute void_total and drag_total for each census
+ // After the program has finished, the void_total field of
+ // each census contains the count of words that were *created*
+ // in this era and were eventually void. Conversely, if a
+ // void closure was destroyed in this era, it will be
+ // represented by a negative count of words in void_total.
+ //
+ // To get the count of live words that are void at each
+ // census, just propagate the void_total count forwards:
+
+ void_total = 0;
+ drag_total = 0;
+ for (t = 1; t < era; t++) { // note: start at 1, not 0
+ void_total += censuses[t].void_total;
+ drag_total += censuses[t].drag_total;
+ censuses[t].void_total = void_total;
+ censuses[t].drag_total = drag_total;
+
+ ASSERT( censuses[t].void_total <= censuses[t].not_used );
+ // should be true because: void_total is the count of
+ // live words that are void at this census, which *must*
+ // be less than the number of live words that have not
+ // been used yet.
+
+ ASSERT( censuses[t].drag_total <= censuses[t].used );
+ // similar reasoning as above.
+ }
+
+ return;
+ }
+
+ // otherwise... we're doing a heap profile that is restricted to
+ // some combination of lag, drag, void or use. We've kept all the
+ // census info for all censuses so far, but we still need to
+ // aggregate the counters forwards.
+
+ arena = newArena();
+ acc = allocHashTable();
+ ctrs = NULL;
+
+ for (t = 1; t < era; t++) {
+
+ // first look through all the counters we're aggregating
+ for (c = ctrs; c != NULL; c = c->next) {
+ // if one of the totals is non-zero, then this closure
+ // type must be present in the heap at this census time...
+ d = lookupHashTable(censuses[t].hash, (StgWord)c->identity);
+
+ if (d == NULL) {
+ // if this closure identity isn't present in the
+ // census for this time period, then our running
+ // totals *must* be zero.
+ ASSERT(c->c.ldv.void_total == 0 && c->c.ldv.drag_total == 0);
+
+ // debugCCS(c->identity);
+ // debugBelch(" census=%d void_total=%d drag_total=%d\n",
+ // t, c->c.ldv.void_total, c->c.ldv.drag_total);
+ } else {
+ d->c.ldv.void_total += c->c.ldv.void_total;
+ d->c.ldv.drag_total += c->c.ldv.drag_total;
+ c->c.ldv.void_total = d->c.ldv.void_total;
+ c->c.ldv.drag_total = d->c.ldv.drag_total;
+
+ ASSERT( c->c.ldv.void_total >= 0 );
+ ASSERT( c->c.ldv.drag_total >= 0 );
+ }
+ }
+
+ // now look through the counters in this census to find new ones
+ for (c = censuses[t].ctrs; c != NULL; c = c->next) {
+ d = lookupHashTable(acc, (StgWord)c->identity);
+ if (d == NULL) {
+ d = arenaAlloc( arena, sizeof(counter) );
+ initLDVCtr(d);
+ insertHashTable( acc, (StgWord)c->identity, d );
+ d->identity = c->identity;
+ d->next = ctrs;
+ ctrs = d;
+ d->c.ldv.void_total = c->c.ldv.void_total;
+ d->c.ldv.drag_total = c->c.ldv.drag_total;
+ }
+ ASSERT( c->c.ldv.void_total >= 0 );
+ ASSERT( c->c.ldv.drag_total >= 0 );
+ }
+ }
+
+ freeHashTable(acc, NULL);
+ arenaFree(arena);
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Print out the results of a heap census.
+ * -------------------------------------------------------------------------- */
+static void
+dumpCensus( Census *census )
+{
+ counter *ctr;
+ int count;
+
+ printSample(rtsTrue, census->time);
+
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV) {
+ fprintf(hp_file, "VOID\t%lu\n", (unsigned long)(census->void_total) * sizeof(W_));
+ fprintf(hp_file, "LAG\t%lu\n",
+ (unsigned long)(census->not_used - census->void_total) * sizeof(W_));
+ fprintf(hp_file, "USE\t%lu\n",
+ (unsigned long)(census->used - census->drag_total) * sizeof(W_));
+ fprintf(hp_file, "INHERENT_USE\t%lu\n",
+ (unsigned long)(census->prim) * sizeof(W_));
+ fprintf(hp_file, "DRAG\t%lu\n",
+ (unsigned long)(census->drag_total) * sizeof(W_));
+ printSample(rtsFalse, census->time);
+ return;
+ }
+#endif
+
+ for (ctr = census->ctrs; ctr != NULL; ctr = ctr->next) {
+
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.bioSelector != NULL) {
+ count = 0;
+ if (strMatchesSelector("lag", RtsFlags.ProfFlags.bioSelector))
+ count += ctr->c.ldv.not_used - ctr->c.ldv.void_total;
+ if (strMatchesSelector("drag", RtsFlags.ProfFlags.bioSelector))
+ count += ctr->c.ldv.drag_total;
+ if (strMatchesSelector("void", RtsFlags.ProfFlags.bioSelector))
+ count += ctr->c.ldv.void_total;
+ if (strMatchesSelector("use", RtsFlags.ProfFlags.bioSelector))
+ count += ctr->c.ldv.used - ctr->c.ldv.drag_total;
+ } else
+#endif
+ {
+ count = ctr->c.resid;
+ }
+
+ ASSERT( count >= 0 );
+
+ if (count == 0) continue;
+
+#ifdef DEBUG_HEAP_PROF
+ switch (RtsFlags.ProfFlags.doHeapProfile) {
+ case HEAP_BY_INFOPTR:
+ fprintf(hp_file, "%s", lookupGHCName(ctr->identity));
+ break;
+ case HEAP_BY_CLOSURE_TYPE:
+ fprintf(hp_file, "%s", (char *)ctr->identity);
+ break;
+ }
+#endif
+
+#ifdef PROFILING
+ switch (RtsFlags.ProfFlags.doHeapProfile) {
+ case HEAP_BY_CCS:
+ fprint_ccs(hp_file, (CostCentreStack *)ctr->identity, 25);
+ break;
+ case HEAP_BY_MOD:
+ case HEAP_BY_DESCR:
+ case HEAP_BY_TYPE:
+ fprintf(hp_file, "%s", (char *)ctr->identity);
+ break;
+ case HEAP_BY_RETAINER:
+ {
+ RetainerSet *rs = (RetainerSet *)ctr->identity;
+
+ // it might be the distinguished retainer set rs_MANY:
+ if (rs == &rs_MANY) {
+ fprintf(hp_file, "MANY");
+ break;
+ }
+
+ // Mark this retainer set by negating its id, because it
+ // has appeared in at least one census. We print the
+ // values of all such retainer sets into the log file at
+ // the end. A retainer set may exist but not feature in
+ // any censuses if it arose as the intermediate retainer
+ // set for some closure during retainer set calculation.
+ if (rs->id > 0)
+ rs->id = -(rs->id);
+
+ // report in the unit of bytes: * sizeof(StgWord)
+ printRetainerSetShort(hp_file, rs);
+ break;
+ }
+ default:
+ barf("dumpCensus; doHeapProfile");
+ }
+#endif
+
+ fprintf(hp_file, "\t%lu\n", (unsigned long)count * sizeof(W_));
+ }
+
+ printSample(rtsFalse, census->time);
+}
+
+/* -----------------------------------------------------------------------------
+ * Code to perform a heap census.
+ * -------------------------------------------------------------------------- */
+static void
+heapCensusChain( Census *census, bdescr *bd )
+{
+ StgPtr p;
+ StgInfoTable *info;
+ void *identity;
+ nat size;
+ counter *ctr;
+ nat real_size;
+ rtsBool prim;
+
+ for (; bd != NULL; bd = bd->link) {
+
+ // HACK: ignore pinned blocks, because they contain gaps.
+ // It's not clear exactly what we'd like to do here, since we
+ // can't tell which objects in the block are actually alive.
+ // Perhaps the whole block should be counted as SYSTEM memory.
+ if (bd->flags & BF_PINNED) {
+ continue;
+ }
+
+ p = bd->start;
+ while (p < bd->free) {
+ info = get_itbl((StgClosure *)p);
+ prim = rtsFalse;
+
+ switch (info->type) {
+
+ case THUNK:
+ size = thunk_sizeW_fromITBL(info);
+ break;
+
+ case THUNK_1_1:
+ case THUNK_0_2:
+ case THUNK_2_0:
+ size = sizeofW(StgThunkHeader) + 2;
+ break;
+
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_SELECTOR:
+ size = sizeofW(StgThunkHeader) + 1;
+ break;
+
+ case CONSTR:
+ case FUN:
+ case IND_PERM:
+ case IND_OLDGEN:
+ case IND_OLDGEN_PERM:
+ case CAF_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case SE_BLACKHOLE:
+ case BLACKHOLE:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case FUN_1_0:
+ case FUN_0_1:
+ case FUN_1_1:
+ case FUN_0_2:
+ case FUN_2_0:
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ case CONSTR_2_0:
+ size = sizeW_fromITBL(info);
+ break;
+
+ case IND:
+ // Special case/Delicate Hack: INDs don't normally
+ // appear, since we're doing this heap census right
+ // after GC. However, GarbageCollect() also does
+ // resurrectThreads(), which can update some
+ // blackholes when it calls raiseAsync() on the
+ // resurrected threads. So we know that any IND will
+ // be the size of a BLACKHOLE.
+ size = BLACKHOLE_sizeW();
+ break;
+
+ case BCO:
+ prim = rtsTrue;
+ size = bco_sizeW((StgBCO *)p);
+ break;
+
+ case MVAR:
+ case WEAK:
+ case STABLE_NAME:
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY:
+ prim = rtsTrue;
+ size = sizeW_fromITBL(info);
+ break;
+
+ case AP:
+ size = ap_sizeW((StgAP *)p);
+ break;
+
+ case PAP:
+ size = pap_sizeW((StgPAP *)p);
+ break;
+
+ case AP_STACK:
+ size = ap_stack_sizeW((StgAP_STACK *)p);
+ break;
+
+ case ARR_WORDS:
+ prim = rtsTrue;
+ size = arr_words_sizeW(stgCast(StgArrWords*,p));
+ break;
+
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+ prim = rtsTrue;
+ size = mut_arr_ptrs_sizeW((StgMutArrPtrs *)p);
+ break;
+
+ case TSO:
+ prim = rtsTrue;
+#ifdef DEBUG_HEAP_PROF
+ size = tso_sizeW((StgTSO *)p);
+ break;
+#else
+ if (RtsFlags.ProfFlags.includeTSOs) {
+ size = tso_sizeW((StgTSO *)p);
+ break;
+ } else {
+ // Skip this TSO and move on to the next object
+ p += tso_sizeW((StgTSO *)p);
+ continue;
+ }
+#endif
+
+ case TREC_HEADER:
+ prim = rtsTrue;
+ size = sizeofW(StgTRecHeader);
+ break;
+
+ case TVAR_WAIT_QUEUE:
+ prim = rtsTrue;
+ size = sizeofW(StgTVarWaitQueue);
+ break;
+
+ case TVAR:
+ prim = rtsTrue;
+ size = sizeofW(StgTVar);
+ break;
+
+ case TREC_CHUNK:
+ prim = rtsTrue;
+ size = sizeofW(StgTRecChunk);
+ break;
+
+ default:
+ barf("heapCensus, unknown object: %d", info->type);
+ }
+
+ identity = NULL;
+
+#ifdef DEBUG_HEAP_PROF
+ real_size = size;
+#else
+ // subtract the profiling overhead
+ real_size = size - sizeofW(StgProfHeader);
+#endif
+
+ if (closureSatisfiesConstraints((StgClosure*)p)) {
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV) {
+ if (prim)
+ census->prim += real_size;
+ else if ((LDVW(p) & LDV_STATE_MASK) == LDV_STATE_CREATE)
+ census->not_used += real_size;
+ else
+ census->used += real_size;
+ } else
+#endif
+ {
+ identity = closureIdentity((StgClosure *)p);
+
+ if (identity != NULL) {
+ ctr = lookupHashTable( census->hash, (StgWord)identity );
+ if (ctr != NULL) {
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.bioSelector != NULL) {
+ if (prim)
+ ctr->c.ldv.prim += real_size;
+ else if ((LDVW(p) & LDV_STATE_MASK) == LDV_STATE_CREATE)
+ ctr->c.ldv.not_used += real_size;
+ else
+ ctr->c.ldv.used += real_size;
+ } else
+#endif
+ {
+ ctr->c.resid += real_size;
+ }
+ } else {
+ ctr = arenaAlloc( census->arena, sizeof(counter) );
+ initLDVCtr(ctr);
+ insertHashTable( census->hash, (StgWord)identity, ctr );
+ ctr->identity = identity;
+ ctr->next = census->ctrs;
+ census->ctrs = ctr;
+
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.bioSelector != NULL) {
+ if (prim)
+ ctr->c.ldv.prim = real_size;
+ else if ((LDVW(p) & LDV_STATE_MASK) == LDV_STATE_CREATE)
+ ctr->c.ldv.not_used = real_size;
+ else
+ ctr->c.ldv.used = real_size;
+ } else
+#endif
+ {
+ ctr->c.resid = real_size;
+ }
+ }
+ }
+ }
+ }
+
+ p += size;
+ }
+ }
+}
+
+void
+heapCensus( void )
+{
+ nat g, s;
+ Census *census;
+
+ census = &censuses[era];
+ census->time = mut_user_time();
+
+ // calculate retainer sets if necessary
+#ifdef PROFILING
+ if (doingRetainerProfiling()) {
+ retainerProfile();
+ }
+#endif
+
+#ifdef PROFILING
+ stat_startHeapCensus();
+#endif
+
+ // Traverse the heap, collecting the census info
+
+ // First the small_alloc_list: we have to fix the free pointer at
+ // the end by calling tidyAllocateLists() first.
+ tidyAllocateLists();
+ heapCensusChain( census, small_alloc_list );
+
+ // Now traverse the heap in each generation/step.
+ if (RtsFlags.GcFlags.generations == 1) {
+ heapCensusChain( census, g0s0->blocks );
+ } else {
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ heapCensusChain( census, generations[g].steps[s].blocks );
+ // Are we interested in large objects? might be
+ // confusing to include the stack in a heap profile.
+ heapCensusChain( census, generations[g].steps[s].large_objects );
+ }
+ }
+ }
+
+ // dump out the census info
+#ifdef PROFILING
+ // We can't generate any info for LDV profiling until
+ // the end of the run...
+ if (!doingLDVProfiling())
+ dumpCensus( census );
+#else
+ dumpCensus( census );
+#endif
+
+
+ // free our storage, unless we're keeping all the census info for
+ // future restriction by biography.
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.bioSelector == NULL)
+#endif
+ {
+ freeHashTable( census->hash, NULL/* don't free the elements */ );
+ arenaFree( census->arena );
+ census->hash = NULL;
+ census->arena = NULL;
+ }
+
+ // we're into the next time period now
+ nextEra();
+
+#ifdef PROFILING
+ stat_endHeapCensus();
+#endif
+}
+
+#endif /* PROFILING || DEBUG_HEAP_PROF */
+
diff --git a/rts/ProfHeap.h b/rts/ProfHeap.h
new file mode 100644
index 0000000000..0251416762
--- /dev/null
+++ b/rts/ProfHeap.h
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Support for heap profiling
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PROFHEAP_H
+#define PROFHEAP_H
+
+extern void heapCensus( void );
+extern nat initHeapProfiling( void );
+extern void endHeapProfiling( void );
+extern rtsBool closureSatisfiesConstraints( StgClosure* p );
+extern void LDV_recordDead( StgClosure *c, nat size );
+extern rtsBool strMatchesSelector( char* str, char* sel );
+
+#endif /* PROFHEAP_H */
diff --git a/rts/Profiling.c b/rts/Profiling.c
new file mode 100644
index 0000000000..028dc5a509
--- /dev/null
+++ b/rts/Profiling.c
@@ -0,0 +1,941 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2000
+ *
+ * Support for profiling
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef PROFILING
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "Profiling.h"
+#include "Storage.h"
+#include "Proftimer.h"
+#include "Timer.h"
+#include "ProfHeap.h"
+#include "Arena.h"
+#include "RetainerProfile.h"
+#include "LdvProfile.h"
+
+#include <string.h>
+
+/*
+ * Profiling allocation arena.
+ */
+Arena *prof_arena;
+
+/*
+ * Global variables used to assign unique IDs to cc's, ccs's, and
+ * closure_cats
+ */
+
+unsigned int CC_ID;
+unsigned int CCS_ID;
+unsigned int HP_ID;
+
+/* figures for the profiling report.
+ */
+static ullong total_alloc;
+static lnat total_prof_ticks;
+
+/* Globals for opening the profiling log file(s)
+ */
+static char *prof_filename; /* prof report file name = <program>.prof */
+FILE *prof_file;
+
+static char *hp_filename; /* heap profile (hp2ps style) log file */
+FILE *hp_file;
+
+/* The Current Cost Centre Stack (for attributing costs)
+ */
+CostCentreStack *CCCS;
+
+/* Linked lists to keep track of cc's and ccs's that haven't
+ * been declared in the log file yet
+ */
+CostCentre *CC_LIST;
+CostCentreStack *CCS_LIST;
+
+/*
+ * Built-in cost centres and cost-centre stacks:
+ *
+ * MAIN is the root of the cost-centre stack tree. If there are
+ * no _scc_s in the program, all costs will be attributed
+ * to MAIN.
+ *
+ * SYSTEM is the RTS in general (scheduler, etc.). All costs for
+ * RTS operations apart from garbage collection are attributed
+ * to SYSTEM.
+ *
+ * GC is the storage manager / garbage collector.
+ *
+ * OVERHEAD gets all costs generated by the profiling system
+ * itself. These are costs that would not be incurred
+ * during non-profiled execution of the program.
+ *
+ * SUBSUMED is the one-and-only CCS placed on top-level functions.
+ * It indicates that all costs are to be attributed to the
+ * enclosing cost centre stack. SUBSUMED never accumulates
+ * any costs. The is_caf flag is set on the subsumed cost
+ * centre.
+ *
+ * DONT_CARE is a placeholder cost-centre we assign to static
+ * constructors. It should *never* accumulate any costs.
+ */
+
+CC_DECLARE(CC_MAIN, "MAIN", "MAIN", CC_IS_BORING, );
+CC_DECLARE(CC_SYSTEM, "SYSTEM", "MAIN", CC_IS_BORING, );
+CC_DECLARE(CC_GC, "GC", "GC", CC_IS_BORING, );
+CC_DECLARE(CC_OVERHEAD, "OVERHEAD_of", "PROFILING", CC_IS_CAF, );
+CC_DECLARE(CC_SUBSUMED, "SUBSUMED", "MAIN", CC_IS_CAF, );
+CC_DECLARE(CC_DONT_CARE, "DONT_CARE", "MAIN", CC_IS_BORING, );
+
+CCS_DECLARE(CCS_MAIN, CC_MAIN, );
+CCS_DECLARE(CCS_SYSTEM, CC_SYSTEM, );
+CCS_DECLARE(CCS_GC, CC_GC, );
+CCS_DECLARE(CCS_OVERHEAD, CC_OVERHEAD, );
+CCS_DECLARE(CCS_SUBSUMED, CC_SUBSUMED, );
+CCS_DECLARE(CCS_DONT_CARE, CC_DONT_CARE, );
+
+/*
+ * Uniques for the XML log-file format
+ */
+#define CC_UQ 1
+#define CCS_UQ 2
+#define TC_UQ 3
+#define HEAP_OBJ_UQ 4
+#define TIME_UPD_UQ 5
+#define HEAP_UPD_UQ 6
+
+/*
+ * Static Functions
+ */
+
+static CostCentreStack * ActualPush_ ( CostCentreStack *ccs, CostCentre *cc,
+ CostCentreStack *new_ccs );
+static rtsBool ccs_to_ignore ( CostCentreStack *ccs );
+static void count_ticks ( CostCentreStack *ccs );
+static void inherit_costs ( CostCentreStack *ccs );
+static void reportCCS ( CostCentreStack *ccs, nat indent );
+static void DecCCS ( CostCentreStack *ccs );
+static void DecBackEdge ( CostCentreStack *ccs,
+ CostCentreStack *oldccs );
+static CostCentreStack * CheckLoop ( CostCentreStack *ccs, CostCentre *cc );
+static CostCentreStack * pruneCCSTree ( CostCentreStack *ccs );
+static CostCentreStack * ActualPush ( CostCentreStack *, CostCentre * );
+static CostCentreStack * IsInIndexTable ( IndexTable *, CostCentre * );
+static IndexTable * AddToIndexTable ( IndexTable *, CostCentreStack *,
+ CostCentre *, unsigned int );
+static void ccsSetSelected ( CostCentreStack *ccs );
+
+static void initTimeProfiling ( void );
+static void initProfilingLogFile( void );
+
+static void reportCCS_XML ( CostCentreStack *ccs );
+
+/* -----------------------------------------------------------------------------
+ Initialise the profiling environment
+ -------------------------------------------------------------------------- */
+
+void
+initProfiling1 (void)
+{
+ // initialise our arena
+ prof_arena = newArena();
+
+ /* for the benefit of allocate()... */
+ CCCS = CCS_SYSTEM;
+
+ /* Initialize counters for IDs */
+ CC_ID = 1;
+ CCS_ID = 1;
+ HP_ID = 1;
+
+ /* Initialize Declaration lists to NULL */
+ CC_LIST = NULL;
+ CCS_LIST = NULL;
+
+ /* Register all the cost centres / stacks in the program
+ * CC_MAIN gets link = 0, all others have non-zero link.
+ */
+ REGISTER_CC(CC_MAIN);
+ REGISTER_CC(CC_SYSTEM);
+ REGISTER_CC(CC_GC);
+ REGISTER_CC(CC_OVERHEAD);
+ REGISTER_CC(CC_SUBSUMED);
+ REGISTER_CC(CC_DONT_CARE);
+ REGISTER_CCS(CCS_MAIN);
+ REGISTER_CCS(CCS_SYSTEM);
+ REGISTER_CCS(CCS_GC);
+ REGISTER_CCS(CCS_OVERHEAD);
+ REGISTER_CCS(CCS_SUBSUMED);
+ REGISTER_CCS(CCS_DONT_CARE);
+
+ CCCS = CCS_OVERHEAD;
+
+ /* cost centres are registered by the per-module
+ * initialisation code now...
+ */
+}
+
+void
+initProfiling2 (void)
+{
+ CostCentreStack *ccs, *next;
+
+ CCCS = CCS_SYSTEM;
+
+ /* Set up the log file, and dump the header and cost centre
+ * information into it. */
+ initProfilingLogFile();
+
+ /* find all the "special" cost centre stacks, and make them children
+ * of CCS_MAIN.
+ */
+ ASSERT(CCS_MAIN->prevStack == 0);
+ CCS_MAIN->root = CC_MAIN;
+ ccsSetSelected(CCS_MAIN);
+ DecCCS(CCS_MAIN);
+
+ for (ccs = CCS_LIST; ccs != CCS_MAIN; ) {
+ next = ccs->prevStack;
+ ccs->prevStack = 0;
+ ActualPush_(CCS_MAIN,ccs->cc,ccs);
+ ccs->root = ccs->cc;
+ ccs = next;
+ }
+
+ if (RtsFlags.CcFlags.doCostCentres) {
+ initTimeProfiling();
+ }
+
+ if (RtsFlags.ProfFlags.doHeapProfile) {
+ initHeapProfiling();
+ }
+}
+
+// Decide whether closures with this CCS should contribute to the heap
+// profile.
+static void
+ccsSetSelected( CostCentreStack *ccs )
+{
+ if (RtsFlags.ProfFlags.modSelector) {
+ if (! strMatchesSelector( ccs->cc->module,
+ RtsFlags.ProfFlags.modSelector ) ) {
+ ccs->selected = 0;
+ return;
+ }
+ }
+ if (RtsFlags.ProfFlags.ccSelector) {
+ if (! strMatchesSelector( ccs->cc->label,
+ RtsFlags.ProfFlags.ccSelector ) ) {
+ ccs->selected = 0;
+ return;
+ }
+ }
+ if (RtsFlags.ProfFlags.ccsSelector) {
+ CostCentreStack *c;
+ for (c = ccs; c != NULL; c = c->prevStack) {
+ if ( strMatchesSelector( c->cc->label,
+ RtsFlags.ProfFlags.ccsSelector )) {
+ break;
+ }
+ }
+ if (c == NULL) {
+ ccs->selected = 0;
+ return;
+ }
+ }
+
+ ccs->selected = 1;
+ return;
+}
+
+
+static void
+initProfilingLogFile(void)
+{
+ /* Initialise the log file name */
+ prof_filename = arenaAlloc(prof_arena, strlen(prog_name) + 6);
+ sprintf(prof_filename, "%s.prof", prog_name);
+
+ /* open the log file */
+ if ((prof_file = fopen(prof_filename, "w")) == NULL) {
+ debugBelch("Can't open profiling report file %s\n", prof_filename);
+ RtsFlags.CcFlags.doCostCentres = 0;
+ // The following line was added by Sung; retainer/LDV profiling may need
+ // two output files, i.e., <program>.prof/hp.
+ if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER)
+ RtsFlags.ProfFlags.doHeapProfile = 0;
+ return;
+ }
+
+ if (RtsFlags.CcFlags.doCostCentres == COST_CENTRES_XML) {
+ /* dump the time, and the profiling interval */
+ fprintf(prof_file, "\"%s\"\n", time_str());
+ fprintf(prof_file, "\"%d ms\"\n", TICK_MILLISECS);
+
+ /* declare all the cost centres */
+ {
+ CostCentre *cc;
+ for (cc = CC_LIST; cc != NULL; cc = cc->link) {
+ fprintf(prof_file, "%d %d \"%s\" \"%s\"\n",
+ CC_UQ, cc->ccID, cc->label, cc->module);
+ }
+ }
+ }
+
+ if (RtsFlags.ProfFlags.doHeapProfile) {
+ /* Initialise the log file name */
+ hp_filename = arenaAlloc(prof_arena, strlen(prog_name) + 6);
+ sprintf(hp_filename, "%s.hp", prog_name);
+
+ /* open the log file */
+ if ((hp_file = fopen(hp_filename, "w")) == NULL) {
+ debugBelch("Can't open profiling report file %s\n",
+ hp_filename);
+ RtsFlags.ProfFlags.doHeapProfile = 0;
+ return;
+ }
+ }
+}
+
+void
+initTimeProfiling(void)
+{
+ /* Start ticking */
+ startProfTimer();
+};
+
+void
+endProfiling ( void )
+{
+ if (RtsFlags.CcFlags.doCostCentres) {
+ stopProfTimer();
+ }
+ if (RtsFlags.ProfFlags.doHeapProfile) {
+ endHeapProfiling();
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Set cost centre stack when entering a function.
+ -------------------------------------------------------------------------- */
+rtsBool entering_PAP;
+
+void
+EnterFunCCS ( CostCentreStack *ccsfn )
+{
+ /* PAP_entry has already set CCCS for us */
+ if (entering_PAP) {
+ entering_PAP = rtsFalse;
+ return;
+ }
+
+ if (ccsfn->root->is_caf == CC_IS_CAF) {
+ CCCS = AppendCCS(CCCS,ccsfn);
+ } else {
+ CCCS = ccsfn;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Cost-centre stack manipulation
+ -------------------------------------------------------------------------- */
+
#ifdef DEBUG
/* Debug tracing wrapper for PushCostCentre.  The #define placed between
 * this wrapper's signature and its body renames every *subsequent*
 * occurrence of PushCostCentre (including the apparent recursive call
 * below, and the real definition that follows) to _PushCostCentre, so
 * the wrapper keeps the public name and the real implementation gets
 * the underscored one.  Fragile but deliberate. */
CostCentreStack * _PushCostCentre ( CostCentreStack *ccs, CostCentre *cc );
CostCentreStack *
PushCostCentre ( CostCentreStack *ccs, CostCentre *cc )
#define PushCostCentre _PushCostCentre
{
    IF_DEBUG(prof,
             debugBelch("Pushing %s on ", cc->label);
             debugCCS(ccs);
             debugBelch("\n"));
    /* this expands to _PushCostCentre(ccs,cc): the real implementation */
    return PushCostCentre(ccs,cc);
}
#endif
+
+CostCentreStack *
+PushCostCentre ( CostCentreStack *ccs, CostCentre *cc )
+{
+ CostCentreStack *temp_ccs;
+
+ if (ccs == EMPTY_STACK)
+ return ActualPush(ccs,cc);
+ else {
+ if (ccs->cc == cc)
+ return ccs;
+ else {
+ /* check if we've already memoized this stack */
+ temp_ccs = IsInIndexTable(ccs->indexTable,cc);
+
+ if (temp_ccs != EMPTY_STACK)
+ return temp_ccs;
+ else {
+ temp_ccs = CheckLoop(ccs,cc);
+ if (temp_ccs != NULL) {
+ /* we have recursed to an older CCS. Mark this in
+ * the index table, and emit a "back edge" into the
+ * log file.
+ */
+ ccs->indexTable = AddToIndexTable(ccs->indexTable,temp_ccs,cc,1);
+ DecBackEdge(temp_ccs,ccs);
+ return temp_ccs;
+ } else {
+ return ActualPush(ccs,cc);
+ }
+ }
+ }
+ }
+}
+
+static CostCentreStack *
+CheckLoop ( CostCentreStack *ccs, CostCentre *cc )
+{
+ while (ccs != EMPTY_STACK) {
+ if (ccs->cc == cc)
+ return ccs;
+ ccs = ccs->prevStack;
+ }
+ return NULL;
+}
+
+/* Append ccs1 to ccs2 (ignoring any CAF cost centre at the root of ccs2) */
+
#ifdef DEBUG
/* Debug tracing wrapper for AppendCCS, using the same rename trick as
 * the PushCostCentre wrapper above: the #define below the signature
 * redirects later uses of AppendCCS to _AppendCCS, so this wrapper
 * owns the public name and calls through to the real implementation. */
CostCentreStack *_AppendCCS ( CostCentreStack *ccs1, CostCentreStack *ccs2 );
CostCentreStack *
AppendCCS ( CostCentreStack *ccs1, CostCentreStack *ccs2 )
#define AppendCCS _AppendCCS
{
    IF_DEBUG(prof,
             if (ccs1 != ccs2) {
                 debugBelch("Appending ");
                 debugCCS(ccs1);
                 debugBelch(" to ");
                 debugCCS(ccs2);
                 debugBelch("\n");});
    /* expands to _AppendCCS(ccs1,ccs2): the real implementation */
    return AppendCCS(ccs1,ccs2);
}
#endif
+
+CostCentreStack *
+AppendCCS ( CostCentreStack *ccs1, CostCentreStack *ccs2 )
+{
+ CostCentreStack *ccs = NULL;
+
+ if (ccs1 == ccs2) {
+ return ccs1;
+ }
+
+ if (ccs2->cc->is_caf == CC_IS_CAF) {
+ return ccs1;
+ }
+
+ if (ccs2->prevStack != NULL) {
+ ccs = AppendCCS(ccs1, ccs2->prevStack);
+ }
+
+ return PushCostCentre(ccs,ccs2->cc);
+}
+
+static CostCentreStack *
+ActualPush ( CostCentreStack *ccs, CostCentre *cc )
+{
+ CostCentreStack *new_ccs;
+
+ /* allocate space for a new CostCentreStack */
+ new_ccs = (CostCentreStack *) arenaAlloc(prof_arena, sizeof(CostCentreStack));
+
+ return ActualPush_(ccs, cc, new_ccs);
+}
+
/* Initialise the freshly-allocated cell new_ccs as the stack obtained
 * by pushing cc onto ccs, zeroing its statistics, memoizing it in the
 * parent's index table, and declaring it to the log file. */
static CostCentreStack *
ActualPush_ ( CostCentreStack *ccs, CostCentre *cc, CostCentreStack *new_ccs )
{
    /* assign values to each member of the structure */
    new_ccs->ccsID = CCS_ID++;
    new_ccs->cc = cc;
    new_ccs->prevStack = ccs;

    new_ccs->indexTable = EMPTY_TABLE;

    /* Initialise the various _scc_ counters to zero
     */
    new_ccs->scc_count = 0;

    /* Initialize all other stats here.  There should be a quick way
     * that's easily used elsewhere too
     */
    new_ccs->time_ticks = 0;
    new_ccs->mem_alloc = 0;
    new_ccs->inherited_ticks = 0;
    new_ccs->inherited_alloc = 0;

    /* the root is inherited from the parent stack.
     * NOTE(review): if ccs == EMPTY_STACK is NULL this dereference would
     * crash, and PushCostCentre does route empty-stack pushes here —
     * presumably EMPTY_STACK pushes never happen in practice or
     * EMPTY_STACK is non-NULL; verify. */
    new_ccs->root = ccs->root;

    // Set the selected field.
    ccsSetSelected(new_ccs);

    /* update the memoization table for the parent stack */
    if (ccs != EMPTY_STACK)
        ccs->indexTable = AddToIndexTable(ccs->indexTable, new_ccs, cc,
                                          0/*not a back edge*/);

    /* make sure this CC is declared at the next heap/time sample */
    DecCCS(new_ccs);

    /* return a pointer to the new stack */
    return new_ccs;
}
+
+
+static CostCentreStack *
+IsInIndexTable(IndexTable *it, CostCentre *cc)
+{
+ while (it!=EMPTY_TABLE)
+ {
+ if (it->cc==cc)
+ return it->ccs;
+ else
+ it = it->next;
+ }
+
+ /* otherwise we never found it so return EMPTY_TABLE */
+ return EMPTY_TABLE;
+}
+
+
+static IndexTable *
+AddToIndexTable(IndexTable *it, CostCentreStack *new_ccs,
+ CostCentre *cc, unsigned int back_edge)
+{
+ IndexTable *new_it;
+
+ new_it = arenaAlloc(prof_arena, sizeof(IndexTable));
+
+ new_it->cc = cc;
+ new_it->ccs = new_ccs;
+ new_it->next = it;
+ new_it->back_edge = back_edge;
+ return new_it;
+}
+
+
/* Declare a cost-centre stack to the XML log (only when -px output is
 * active): a root stack is written with its own id and cc, a non-root
 * stack additionally records its parent stack's id. */
static void
DecCCS(CostCentreStack *ccs)
{
    if (prof_file && RtsFlags.CcFlags.doCostCentres == COST_CENTRES_XML) {
        if (ccs->prevStack == EMPTY_STACK)
            fprintf(prof_file, "%d %d 1 %d\n", CCS_UQ,
                    ccs->ccsID, ccs->cc->ccID);
        else
            fprintf(prof_file, "%d %d 2 %d %d\n", CCS_UQ,
                    ccs->ccsID, ccs->cc->ccID, ccs->prevStack->ccsID);
    }
}
+
/* Like DecCCS, but for a back edge: the parent id written for a
 * non-root stack is oldccs (the stack the recursion came from) rather
 * than ccs's own prevStack. */
static void
DecBackEdge( CostCentreStack *ccs, CostCentreStack *oldccs )
{
    if (prof_file && RtsFlags.CcFlags.doCostCentres == COST_CENTRES_XML) {
        if (ccs->prevStack == EMPTY_STACK)
            fprintf(prof_file, "%d %d 1 %d\n", CCS_UQ,
                    ccs->ccsID, ccs->cc->ccID);
        else
            fprintf(prof_file, "%d %d 2 %d %d\n", CCS_UQ,
                    ccs->ccsID, ccs->cc->ccID, oldccs->ccsID);
    }
}
+
+/* -----------------------------------------------------------------------------
+ Generating a time & allocation profiling report.
+ -------------------------------------------------------------------------- */
+
+/* We omit certain system-related CCs and CCSs from the default
+ * reports, so as not to cause confusion.
+ */
+static rtsBool
+cc_to_ignore (CostCentre *cc)
+{
+ if ( cc == CC_OVERHEAD
+ || cc == CC_DONT_CARE
+ || cc == CC_GC
+ || cc == CC_SYSTEM) {
+ return rtsTrue;
+ } else {
+ return rtsFalse;
+ }
+}
+
+static rtsBool
+ccs_to_ignore (CostCentreStack *ccs)
+{
+ if ( ccs == CCS_OVERHEAD
+ || ccs == CCS_DONT_CARE
+ || ccs == CCS_GC
+ || ccs == CCS_SYSTEM) {
+ return rtsTrue;
+ } else {
+ return rtsFalse;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Generating the aggregated per-cost-centre time/alloc report.
+ -------------------------------------------------------------------------- */
+
+static CostCentre *sorted_cc_list;
+
+static void
+aggregate_cc_costs( CostCentreStack *ccs )
+{
+ IndexTable *i;
+
+ ccs->cc->mem_alloc += ccs->mem_alloc;
+ ccs->cc->time_ticks += ccs->time_ticks;
+
+ for (i = ccs->indexTable; i != 0; i = i->next) {
+ if (!i->back_edge) {
+ aggregate_cc_costs(i->ccs);
+ }
+ }
+}
+
+static void
+insert_cc_in_sorted_list( CostCentre *new_cc )
+{
+ CostCentre **prev, *cc;
+
+ prev = &sorted_cc_list;
+ for (cc = sorted_cc_list; cc != NULL; cc = cc->link) {
+ if (new_cc->time_ticks > cc->time_ticks) {
+ new_cc->link = cc;
+ *prev = new_cc;
+ return;
+ } else {
+ prev = &(cc->link);
+ }
+ }
+ new_cc->link = NULL;
+ *prev = new_cc;
+}
+
/* Emit the flat per-cost-centre section of the report: aggregate the
 * CCS tree into per-CC totals, select the significant cost centres,
 * sort them by time, and print one line for each. */
static void
report_per_cc_costs( void )
{
    CostCentre *cc, *next;

    aggregate_cc_costs(CCS_MAIN);
    sorted_cc_list = NULL;

    for (cc = CC_LIST; cc != NULL; cc = next) {
        next = cc->link;  /* save: insert_cc_in_sorted_list rewrites cc->link */
        /* keep CCs with >1% of ticks or allocation; -P keeps everything */
        if (cc->time_ticks > total_prof_ticks/100
            || cc->mem_alloc > total_alloc/100
            || RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_ALL) {
            insert_cc_in_sorted_list(cc);
        }
    }

    fprintf(prof_file, "%-30s %-20s", "COST CENTRE", "MODULE");
    fprintf(prof_file, "%6s %6s", "%time", "%alloc");
    if (RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_VERBOSE) {
        fprintf(prof_file, " %5s %9s", "ticks", "bytes");
    }
    fprintf(prof_file, "\n\n");

    for (cc = sorted_cc_list; cc != NULL; cc = cc->link) {
        /* omit system CCs (OVERHEAD, DONT_CARE, GC, SYSTEM) */
        if (cc_to_ignore(cc)) {
            continue;
        }
        fprintf(prof_file, "%-30s %-20s", cc->label, cc->module);
        /* percentages, guarding against division by zero */
        fprintf(prof_file, "%6.1f %6.1f",
                total_prof_ticks == 0 ? 0.0 : (cc->time_ticks / (StgFloat) total_prof_ticks * 100),
                total_alloc == 0 ? 0.0 : (cc->mem_alloc / (StgFloat)
                                          total_alloc * 100)
            );

        if (RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_VERBOSE) {
            fprintf(prof_file, " %5llu %9llu", (StgWord64)(cc->time_ticks), cc->mem_alloc);
        }
        fprintf(prof_file, "\n");
    }

    fprintf(prof_file,"\n\n");
}
+
+/* -----------------------------------------------------------------------------
+ Generate the cost-centre-stack time/alloc report
+ -------------------------------------------------------------------------- */
+
/* Print the two column-header lines for the cost-centre-stack report;
 * the verbose (-P) variant adds raw tick/byte (and, when compiled in,
 * detail-count) columns. */
static void
fprint_header( void )
{
    fprintf(prof_file, "%-24s %-10s individual inherited\n", "", "");

    fprintf(prof_file, "%-24s %-50s", "COST CENTRE", "MODULE");
    fprintf(prof_file, "%6s %10s %5s %5s %5s %5s", "no.", "entries", "%time", "%alloc", "%time", "%alloc");

    if (RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_VERBOSE) {
        fprintf(prof_file, " %5s %9s", "ticks", "bytes");
#if defined(PROFILING_DETAIL_COUNTS)
        fprintf(prof_file, " %8s %8s %8s %8s %8s %8s %8s",
                "closures", "thunks", "funcs", "PAPs", "subfuns", "subcafs", "cafssub");
#endif
    }

    fprintf(prof_file, "\n\n");
}
+
/* Produce the final time/allocation profiling report on prof_file
 * (opened earlier during profiling initialisation).  XML output (-px)
 * is delegated to gen_XML_logfile; if cost-centre profiling is off,
 * nothing is written. */
void
reportCCSProfiling( void )
{
    nat count;
    char temp[128]; /* sigh: magic constant */

    stopProfTimer();

    /* total ticks and allocation over the whole CCS tree */
    total_prof_ticks = 0;
    total_alloc = 0;
    count_ticks(CCS_MAIN);

    switch (RtsFlags.CcFlags.doCostCentres) {
    case 0:
        return;
    case COST_CENTRES_XML:
        gen_XML_logfile();
        return;
    default:
        break;
    }

    fprintf(prof_file, "\t%s Time and Allocation Profiling Report (%s)\n",
            time_str(), "Final");

    /* reconstruct the command line: program, RTS args, program args */
    fprintf(prof_file, "\n\t ");
    fprintf(prof_file, " %s", prog_name);
    fprintf(prof_file, " +RTS");
    for (count = 0; rts_argv[count]; count++)
        fprintf(prof_file, " %s", rts_argv[count]);
    fprintf(prof_file, " -RTS");
    for (count = 1; prog_argv[count]; count++)  /* skip argv[0] = program name */
        fprintf(prof_file, " %s", prog_argv[count]);
    fprintf(prof_file, "\n\n");

    fprintf(prof_file, "\ttotal time = %11.2f secs (%lu ticks @ %d ms)\n",
            total_prof_ticks / (StgFloat) TICK_FREQUENCY,
            total_prof_ticks, TICK_MILLISECS);

    /* mem_alloc counts words, hence the sizeof(W_) scaling to bytes */
    fprintf(prof_file, "\ttotal alloc = %11s bytes",
            ullong_format_string(total_alloc * sizeof(W_),
                                 temp, rtsTrue/*commas*/));

#if defined(PROFILING_DETAIL_COUNTS)
    fprintf(prof_file, " (%lu closures)", total_allocs);
#endif
    fprintf(prof_file, " (excludes profiling overheads)\n\n");

    report_per_cc_costs();

    /* fill in inherited_ticks/inherited_alloc before the tree report */
    inherit_costs(CCS_MAIN);

    fprint_header();
    reportCCS(pruneCCSTree(CCS_MAIN), 0);
}
+
/* Print one report line for ccs, indented by its depth in the tree,
 * then recurse over its tree children (back edges skipped).  System
 * CCSs are omitted unless -P -P (COST_CENTRES_ALL) forces them out. */
static void
reportCCS(CostCentreStack *ccs, nat indent)
{
    CostCentre *cc;
    IndexTable *i;

    cc = ccs->cc;

    /* Only print cost centres with non 0 data ! */

    if ( RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_ALL ||
         ! ccs_to_ignore(ccs))
        /* force printing of *all* cost centres if -P -P */
    {

        /* indent by depth, then pad the label out to column 24 */
        fprintf(prof_file, "%-*s%-*s %-50s",
                indent, "", 24-indent, cc->label, cc->module);

        /* id, entry count, then individual and inherited %time/%alloc
         * (guarding every division against zero totals) */
        fprintf(prof_file, "%6d %11.0f %5.1f %5.1f %5.1f %5.1f",
                ccs->ccsID, (double) ccs->scc_count,
                total_prof_ticks == 0 ? 0.0 : ((double)ccs->time_ticks / (double)total_prof_ticks * 100.0),
                total_alloc == 0 ? 0.0 : ((double)ccs->mem_alloc / (double)total_alloc * 100.0),
                total_prof_ticks == 0 ? 0.0 : ((double)ccs->inherited_ticks / (double)total_prof_ticks * 100.0),
                total_alloc == 0 ? 0.0 : ((double)ccs->inherited_alloc / (double)total_alloc * 100.0)
            );

        if (RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_VERBOSE) {
            /* raw ticks and bytes (mem_alloc is in words) */
            fprintf(prof_file, " %5llu %9llu", (StgWord64)(ccs->time_ticks), ccs->mem_alloc*sizeof(W_));
#if defined(PROFILING_DETAIL_COUNTS)
            fprintf(prof_file, " %8ld %8ld %8ld %8ld %8ld %8ld %8ld",
                    ccs->mem_allocs, ccs->thunk_count,
                    ccs->function_count, ccs->pap_count,
                    ccs->subsumed_fun_count, ccs->subsumed_caf_count,
                    ccs->caffun_subsumed);
#endif
        }
        fprintf(prof_file, "\n");
    }

    for (i = ccs->indexTable; i != 0; i = i->next) {
        if (!i->back_edge) {
            reportCCS(i->ccs, indent+1);
        }
    }
}
+
+
+/* Traverse the cost centre stack tree and accumulate
+ * ticks/allocations.
+ */
+static void
+count_ticks(CostCentreStack *ccs)
+{
+ IndexTable *i;
+
+ if (!ccs_to_ignore(ccs)) {
+ total_alloc += ccs->mem_alloc;
+ total_prof_ticks += ccs->time_ticks;
+ }
+ for (i = ccs->indexTable; i != NULL; i = i->next)
+ if (!i->back_edge) {
+ count_ticks(i->ccs);
+ }
+}
+
+/* Traverse the cost centre stack tree and inherit ticks & allocs.
+ */
+static void
+inherit_costs(CostCentreStack *ccs)
+{
+ IndexTable *i;
+
+ if (ccs_to_ignore(ccs)) { return; }
+
+ ccs->inherited_ticks += ccs->time_ticks;
+ ccs->inherited_alloc += ccs->mem_alloc;
+
+ for (i = ccs->indexTable; i != NULL; i = i->next)
+ if (!i->back_edge) {
+ inherit_costs(i->ccs);
+ ccs->inherited_ticks += i->ccs->inherited_ticks;
+ ccs->inherited_alloc += i->ccs->inherited_alloc;
+ }
+
+ return;
+}
+
/* Prune subtrees that recorded no entries, ticks or allocation (all
 * kept anyway under -P -P).  Returns ccs if it survives, else NULL. */
static CostCentreStack *
pruneCCSTree( CostCentreStack *ccs )
{
    CostCentreStack *ccs1;
    IndexTable *i, **prev;

    /* prev is the link slot to patch when a child gets unlinked */
    prev = &ccs->indexTable;
    for (i = ccs->indexTable; i != 0; i = i->next) {
        if (i->back_edge) { continue; }
        /* NOTE(review): prev is not advanced past back-edge entries, so
         * unlinking a later pruned sibling (*prev = i->next below) also
         * drops any back-edge entries skipped since the last kept
         * child — verify this is intended. */

        ccs1 = pruneCCSTree(i->ccs);
        if (ccs1 == NULL) {
            *prev = i->next;
        } else {
            prev = &(i->next);
        }
    }

    if ( (RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_ALL
          /* force printing of *all* cost centres if -P -P */ )

         || ( ccs->indexTable != 0 )
         || ( ccs->scc_count || ccs->time_ticks || ccs->mem_alloc )
        ) {
        return ccs;
    } else {
        return NULL;
    }
}
+
+/* -----------------------------------------------------------------------------
+ Generate the XML time/allocation profile
+ -------------------------------------------------------------------------- */
+
/* Emit the XML-style time/allocation profile: a TIME_UPD record with
 * the total tick count, one record per surviving CCS, a terminating
 * " 0", then close the log file. */
void
gen_XML_logfile( void )
{
    fprintf(prof_file, "%d %lu", TIME_UPD_UQ, total_prof_ticks);

    reportCCS_XML(pruneCCSTree(CCS_MAIN));

    fprintf(prof_file, " 0\n");

    fclose(prof_file);
}
+
+static void
+reportCCS_XML(CostCentreStack *ccs)
+{
+ CostCentre *cc;
+ IndexTable *i;
+
+ if (ccs_to_ignore(ccs)) { return; }
+
+ cc = ccs->cc;
+
+ fprintf(prof_file, " 1 %d %llu %llu %llu",
+ ccs->ccsID, ccs->scc_count, (StgWord64)(ccs->time_ticks), ccs->mem_alloc);
+
+ for (i = ccs->indexTable; i != 0; i = i->next) {
+ if (!i->back_edge) {
+ reportCCS_XML(i->ccs);
+ }
+ }
+}
+
+void
+fprintCCS( FILE *f, CostCentreStack *ccs )
+{
+ fprintf(f,"<");
+ for (; ccs && ccs != CCS_MAIN; ccs = ccs->prevStack ) {
+ fprintf(f,"%s.%s", ccs->cc->module, ccs->cc->label);
+ if (ccs->prevStack && ccs->prevStack != CCS_MAIN) {
+ fprintf(f,",");
+ }
+ }
+ fprintf(f,">");
+}
+
/* For calling from .cmm code, where we can't reliably refer to stderr */
/* Thin wrapper: delegates to fprintCCS with stderr as the stream. */
void
fprintCCS_stderr( CostCentreStack *ccs )
{
    fprintCCS(stderr, ccs);
}
+
#ifdef DEBUG
/* Debug analogue of fprintCCS: print ccs via debugBelch in the same
 * "<Mod.label,...>" format. */
void
debugCCS( CostCentreStack *ccs )
{
    CostCentreStack *frame;

    debugBelch("<");
    for (frame = ccs; frame != NULL && frame != CCS_MAIN; frame = frame->prevStack) {
        debugBelch("%s.%s", frame->cc->module, frame->cc->label);
        if (frame->prevStack != NULL && frame->prevStack != CCS_MAIN) {
            debugBelch(",");
        }
    }
    debugBelch(">");
}
#endif /* DEBUG */
+
+#endif /* PROFILING */
diff --git a/rts/Profiling.h b/rts/Profiling.h
new file mode 100644
index 0000000000..d968349a52
--- /dev/null
+++ b/rts/Profiling.h
@@ -0,0 +1,39 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Support for profiling
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PROFILING_H
+#define PROFILING_H
+
+#include <stdio.h>
+
+#if defined(PROFILING) || defined(DEBUG)
+void initProfiling1 ( void );
+void initProfiling2 ( void );
+void endProfiling ( void );
+
+extern FILE *prof_file;
+extern FILE *hp_file;
+#endif
+
+#ifdef PROFILING
+
+void gen_XML_logfile ( void );
+void reportCCSProfiling ( void );
+
+void PrintNewStackDecls ( void );
+
+extern void fprintCCS( FILE *f, CostCentreStack *ccs );
+extern void fprintCCS_stderr( CostCentreStack *ccs );
+
+#ifdef DEBUG
+extern void debugCCS( CostCentreStack *ccs );
+#endif
+
+#endif
+
+#endif /* PROFILING_H */
diff --git a/rts/Proftimer.c b/rts/Proftimer.c
new file mode 100644
index 0000000000..3b499152d6
--- /dev/null
+++ b/rts/Proftimer.c
@@ -0,0 +1,85 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-1999
+ *
+ * Profiling interval timer
+ *
+ * ---------------------------------------------------------------------------*/
+
+#if defined (PROFILING)
+
+#include "PosixSource.h"
+
+#include "Rts.h"
+#include "Profiling.h"
+#include "Timer.h"
+#include "Proftimer.h"
+#include "RtsFlags.h"
+
+static rtsBool do_prof_ticks = rtsFalse; // enable profiling ticks
+static rtsBool do_heap_prof_ticks = rtsFalse; // enable heap profiling ticks
+
+// Number of ticks until next heap census
+static int ticks_to_heap_profile;
+
+// Time for a heap profile on the next context switch
+rtsBool performHeapProfile;
+
/* Stop delivering time-profiling ticks to the cost-centre profiler. */
void
stopProfTimer( void )
{
    do_prof_ticks = rtsFalse;
}
+
/* Resume delivering time-profiling ticks to the cost-centre profiler. */
void
startProfTimer( void )
{
    do_prof_ticks = rtsTrue;
}
+
/* Stop counting ticks towards the next heap census. */
void
stopHeapProfTimer( void )
{
    do_heap_prof_ticks = rtsFalse;
}
+
/* Start counting ticks towards the next heap census, but only when
 * heap profiling is enabled and a positive census interval (in ticks)
 * has been configured. */
void
startHeapProfTimer( void )
{
    if (RtsFlags.ProfFlags.doHeapProfile &&
        RtsFlags.ProfFlags.profileIntervalTicks > 0) {
        do_heap_prof_ticks = rtsTrue;
    }
}
+
/* Initialise the profiling timer state: convert the configured census
 * interval (presumably milliseconds — see TICK_MILLISECS) into timer
 * ticks, reset the countdown, and start the heap-profile timer. */
void
initProfTimer( void )
{
    performHeapProfile = rtsFalse;

    /* NOTE(review): integer division — an interval smaller than
     * TICK_MILLISECS truncates to 0 ticks; verify callers guard this. */
    RtsFlags.ProfFlags.profileIntervalTicks =
        RtsFlags.ProfFlags.profileInterval / TICK_MILLISECS;

    ticks_to_heap_profile = RtsFlags.ProfFlags.profileIntervalTicks;

    startHeapProfTimer();
}
+
+
+void
+handleProfTick(void)
+{
+ if (do_prof_ticks) {
+ CCCS->time_ticks++;
+ }
+
+ if (do_heap_prof_ticks) {
+ ticks_to_heap_profile--;
+ if (ticks_to_heap_profile <= 0) {
+ ticks_to_heap_profile = RtsFlags.ProfFlags.profileIntervalTicks;
+ performHeapProfile = rtsTrue;
+ }
+ }
+}
+
+#endif /* PROFILING */
diff --git a/rts/Proftimer.h b/rts/Proftimer.h
new file mode 100644
index 0000000000..c837b855f9
--- /dev/null
+++ b/rts/Proftimer.h
@@ -0,0 +1,22 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Profiling interval timer
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PROFTIMER_H
+#define PROFTIMER_H
+
+extern void initProfTimer ( void );
+extern void handleProfTick ( void );
+
+extern void stopProfTimer ( void );
+extern void startProfTimer ( void );
+extern void stopHeapProfTimer ( void );
+extern void startHeapProfTimer ( void );
+
+extern rtsBool performHeapProfile;
+
+#endif /* PROFTIMER_H */
diff --git a/rts/RetainerProfile.c b/rts/RetainerProfile.c
new file mode 100644
index 0000000000..c5c3de5314
--- /dev/null
+++ b/rts/RetainerProfile.c
@@ -0,0 +1,2338 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Retainer profiling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef PROFILING
+
+// Turn off inlining when debugging - it obfuscates things
+#ifdef DEBUG
+#define INLINE
+#else
+#define INLINE inline
+#endif
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RetainerProfile.h"
+#include "RetainerSet.h"
+#include "Schedule.h"
+#include "Printer.h"
+#include "Storage.h"
+#include "RtsFlags.h"
+#include "Weak.h"
+#include "Sanity.h"
+#include "Profiling.h"
+#include "Stats.h"
+#include "BlockAlloc.h"
+#include "ProfHeap.h"
+#include "Apply.h"
+
+/*
+ Note: what to change in order to plug-in a new retainer profiling scheme?
+ (1) type retainer in ../includes/StgRetainerProf.h
+ (2) retainer function R(), i.e., getRetainerFrom()
+ (3) the two hashing functions, hashKeySingleton() and hashKeyAddElement(),
+ in RetainerSet.h, if needed.
+ (4) printRetainer() and printRetainerSetShort() in RetainerSet.c.
+ */
+
+/* -----------------------------------------------------------------------------
+ * Declarations...
+ * -------------------------------------------------------------------------- */
+
+static nat retainerGeneration; // generation
+
+static nat numObjectVisited; // total number of objects visited
+static nat timesAnyObjectVisited; // number of times any objects are visited
+
+/*
+ The rs field in the profile header of any object points to its retainer
+ set in an indirect way: if flip is 0, it points to the retainer set;
+ if flip is 1, it points to the next byte after the retainer set (even
+ for NULL pointers). Therefore, with flip 1, (rs ^ 1) is the actual
+ pointer. See retainerSetOf().
+ */
+
+StgWord flip = 0; // flip bit
+ // must be 0 if DEBUG_RETAINER is on (for static closures)
+
+#define setRetainerSetToNull(c) \
+ (c)->header.prof.hp.rs = (RetainerSet *)((StgWord)NULL | flip)
+
+static void retainStack(StgClosure *, retainer, StgPtr, StgPtr);
+static void retainClosure(StgClosure *, StgClosure *, retainer);
+#ifdef DEBUG_RETAINER
+static void belongToHeap(StgPtr p);
+#endif
+
+#ifdef DEBUG_RETAINER
+/*
+ cStackSize records how many times retainStack() has been invoked recursively,
+ that is, the number of activation records for retainStack() on the C stack.
+ maxCStackSize records its max value.
+ Invariants:
+ cStackSize <= maxCStackSize
+ */
+static nat cStackSize, maxCStackSize;
+
+static nat sumOfNewCost; // sum of the cost of each object, computed
+ // when the object is first visited
+static nat sumOfNewCostExtra; // for those objects not visited during
+ // retainer profiling, e.g., MUT_VAR
+static nat costArray[N_CLOSURE_TYPES];
+
+nat sumOfCostLinear; // sum of the costs of all object, computed
+ // when linearly traversing the heap after
+ // retainer profiling
+nat costArrayLinear[N_CLOSURE_TYPES];
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Retainer stack - header
+ * Note:
+ * Although the retainer stack implementation could be separated
+ * from the retainer profiling engine, there does not seem to be
+ * any advantage in doing that; retainer stack is an integral part
+ * of the retainer profiling engine and cannot be used elsewhere at
+ * all.
+ * -------------------------------------------------------------------------- */
+
+typedef enum {
+ posTypeStep,
+ posTypePtrs,
+ posTypeSRT,
+ posTypeLargeSRT,
+} nextPosType;
+
+typedef union {
+ // fixed layout or layout specified by a field in the closure
+ StgWord step;
+
+ // layout.payload
+ struct {
+ // See StgClosureInfo in InfoTables.h
+#if SIZEOF_VOID_P == 8
+ StgWord32 pos;
+ StgWord32 ptrs;
+#else
+ StgWord16 pos;
+ StgWord16 ptrs;
+#endif
+ StgPtr payload;
+ } ptrs;
+
+ // SRT
+ struct {
+ StgClosure **srt;
+ StgWord srt_bitmap;
+ } srt;
+
+ // Large SRT
+ struct {
+ StgLargeSRT *srt;
+ StgWord offset;
+ } large_srt;
+
+} nextPos;
+
+typedef struct {
+ nextPosType type;
+ nextPos next;
+} stackPos;
+
+typedef struct {
+ StgClosure *c;
+ retainer c_child_r;
+ stackPos info;
+} stackElement;
+
+/*
+ Invariants:
+ firstStack points to the first block group.
+ currentStack points to the block group currently being used.
+ currentStack->free == stackLimit.
+ stackTop points to the topmost byte in the stack of currentStack.
+ Unless the whole stack is empty, stackTop must point to the topmost
+ object (or byte) in the whole stack. Thus, it is only when the whole stack
+ is empty that stackTop == stackLimit (not during the execution of push()
+ and pop()).
+ stackBottom == currentStack->start.
+ stackLimit == currentStack->start + BLOCK_SIZE_W * currentStack->blocks.
+ Note:
+ When a current stack becomes empty, stackTop is set to point to
+ the topmost element on the previous block group so as to satisfy
+ the invariants described above.
+ */
+static bdescr *firstStack = NULL;
+static bdescr *currentStack;
+static stackElement *stackBottom, *stackTop, *stackLimit;
+
+/*
+ currentStackBoundary is used to mark the current stack chunk.
+ If stackTop == currentStackBoundary, it means that the current stack chunk
+ is empty. It is the responsibility of the user to keep currentStackBoundary
+ valid all the time if it is to be employed.
+ */
+static stackElement *currentStackBoundary;
+
+/*
+ stackSize records the current size of the stack.
+ maxStackSize records its high water mark.
+ Invariants:
+ stackSize <= maxStackSize
+ Note:
+ stackSize is just an estimate measure of the depth of the graph. The reason
+ is that some heap objects have only a single child and may not result
+ in a new element being pushed onto the stack. Therefore, at the end of
+ retainer profiling, maxStackSize + maxCStackSize is some value no greater
+ than the actual depth of the graph.
+ */
+#ifdef DEBUG_RETAINER
+static int stackSize, maxStackSize;
+#endif
+
+// number of blocks allocated for one stack
+#define BLOCKS_IN_STACK 1
+
+/* -----------------------------------------------------------------------------
+ * Add a new block group to the stack.
+ * Invariants:
+ * currentStack->link == s.
+ * -------------------------------------------------------------------------- */
/* Make bd the current chunk of the traverse stack.  stackTop starts at
 * the end of the block group and the stack grows towards stackBottom
 * (see the invariants above), so a fresh chunk has
 * stackTop == stackLimit. */
static INLINE void
newStackBlock( bdescr *bd )
{
    currentStack = bd;
    stackTop = (stackElement *)(bd->start + BLOCK_SIZE_W * bd->blocks);
    stackBottom = (stackElement *)bd->start;
    stackLimit = (stackElement *)stackTop;
    bd->free = (StgPtr)stackLimit;
}
+
+/* -----------------------------------------------------------------------------
+ * Return to the previous block group.
+ * Invariants:
+ * s->link == currentStack.
+ * -------------------------------------------------------------------------- */
/* Switch back to the previous block group bd (bd->link == currentStack).
 * bd->free presumably records the old top of stack saved when we moved
 * off bd — it is restored into stackTop here, then reset to the
 * empty-chunk position. */
static INLINE void
returnToOldStack( bdescr *bd )
{
    currentStack = bd;
    stackTop = (stackElement *)bd->free;
    stackBottom = (stackElement *)bd->start;
    stackLimit = (stackElement *)(bd->start + BLOCK_SIZE_W * bd->blocks);
    bd->free = (StgPtr)stackLimit;
}
+
+/* -----------------------------------------------------------------------------
+ * Initializes the traverse stack.
+ * -------------------------------------------------------------------------- */
/* (Re)initialise the traverse stack to a single fresh block group,
 * discarding any blocks left over from a previous traversal. */
static void
initializeTraverseStack( void )
{
    if (firstStack != NULL) {
        freeChain(firstStack);
    }

    firstStack = allocGroup(BLOCKS_IN_STACK);
    firstStack->link = NULL;
    firstStack->u.back = NULL;

    newStackBlock(firstStack);
}
+
+/* -----------------------------------------------------------------------------
+ * Frees all the block groups in the traverse stack.
+ * Invariants:
+ * firstStack != NULL
+ * -------------------------------------------------------------------------- */
/* Free every block group in the traverse stack.
 * Invariant: firstStack != NULL. */
static void
closeTraverseStack( void )
{
    freeChain(firstStack);
    firstStack = NULL;
}
+
+/* -----------------------------------------------------------------------------
+ * Returns rtsTrue if the whole stack is empty.
+ * -------------------------------------------------------------------------- */
+static INLINE rtsBool
+isEmptyRetainerStack( void )
+{
+ return (firstStack == currentStack) && stackTop == stackLimit;
+}
+
+/* -----------------------------------------------------------------------------
+ * Returns size of stack
+ * -------------------------------------------------------------------------- */
#ifdef DEBUG
/* Total number of blocks currently allocated to the traverse stack. */
lnat
retainerStackBlocks( void )
{
    lnat total = 0;
    bdescr *bd;

    for (bd = firstStack; bd != NULL; bd = bd->link) {
        total += bd->blocks;
    }

    return total;
}
#endif
+
+/* -----------------------------------------------------------------------------
+ * Returns rtsTrue if stackTop is at the stack boundary of the current stack,
+ * i.e., if the current stack chunk is empty.
+ * -------------------------------------------------------------------------- */
/* True iff stackTop is at the boundary of the current stack chunk,
 * i.e. the current chunk is empty.  The caller is responsible for
 * keeping currentStackBoundary valid (see its declaration above). */
static INLINE rtsBool
isOnBoundary( void )
{
    return stackTop == currentStackBoundary;
}
+
+/* -----------------------------------------------------------------------------
+ * Initializes *info from ptrs and payload.
+ * Invariants:
+ * payload[] begins with ptrs pointers followed by non-pointers.
+ * -------------------------------------------------------------------------- */
/* Initialise *info to iterate over 'ptrs' pointer fields starting at
 * payload.  Invariant: payload[] begins with ptrs pointers followed by
 * non-pointers. */
static INLINE void
init_ptrs( stackPos *info, nat ptrs, StgPtr payload )
{
    info->type = posTypePtrs;
    info->next.ptrs.pos = 0;
    info->next.ptrs.ptrs = ptrs;
    info->next.ptrs.payload = payload;
}
+
+/* -----------------------------------------------------------------------------
+ * Find the next object from *info.
+ * -------------------------------------------------------------------------- */
+static INLINE StgClosure *
+find_ptrs( stackPos *info )
+{
+ if (info->next.ptrs.pos < info->next.ptrs.ptrs) {
+ return (StgClosure *)info->next.ptrs.payload[info->next.ptrs.pos++];
+ } else {
+ return NULL;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Initializes *info from SRT information stored in *infoTable.
+ * -------------------------------------------------------------------------- */
/* Initialise *info to iterate over the SRT of a function info table.
 * An srt_bitmap of all ones ((StgHalfWord)(-1)) marks the large-SRT
 * representation; otherwise the small bitmap is used directly. */
static INLINE void
init_srt_fun( stackPos *info, StgFunInfoTable *infoTable )
{
    if (infoTable->i.srt_bitmap == (StgHalfWord)(-1)) {
        info->type = posTypeLargeSRT;
        info->next.large_srt.srt = (StgLargeSRT *)GET_FUN_SRT(infoTable);
        info->next.large_srt.offset = 0;
    } else {
        info->type = posTypeSRT;
        info->next.srt.srt = (StgClosure **)GET_FUN_SRT(infoTable);
        info->next.srt.srt_bitmap = infoTable->i.srt_bitmap;
    }
}
+
/* As init_srt_fun, but for a thunk info table (uses GET_SRT rather
 * than GET_FUN_SRT). */
static INLINE void
init_srt_thunk( stackPos *info, StgThunkInfoTable *infoTable )
{
    if (infoTable->i.srt_bitmap == (StgHalfWord)(-1)) {
        info->type = posTypeLargeSRT;
        info->next.large_srt.srt = (StgLargeSRT *)GET_SRT(infoTable);
        info->next.large_srt.offset = 0;
    } else {
        info->type = posTypeSRT;
        info->next.srt.srt = (StgClosure **)GET_SRT(infoTable);
        info->next.srt.srt_bitmap = infoTable->i.srt_bitmap;
    }
}
+
+/* -----------------------------------------------------------------------------
+ * Find the next object from *info.
+ * -------------------------------------------------------------------------- */
/* Return the next SRT entry indicated by *info, advancing the cursor,
 * or NULL when the SRT is exhausted.  Handles both the small-bitmap
 * and large-bitmap SRT representations. */
static INLINE StgClosure *
find_srt( stackPos *info )
{
    StgClosure *c;
    StgWord bitmap;

    if (info->type == posTypeSRT) {
        // Small SRT bitmap
        bitmap = info->next.srt.srt_bitmap;
        // scan for the next set bit; each shift moves to the next SRT slot
        while (bitmap != 0) {
            if ((bitmap & 1) != 0) {
#ifdef ENABLE_WIN32_DLL_SUPPORT
                // a tagged (odd) entry is an indirection through the DLL
                // import table; untag and follow it
                if ((unsigned long)(*(info->next.srt.srt)) & 0x1)
                    c = (* (StgClosure **)((unsigned long)*(info->next.srt.srt)) & ~0x1);
                else
                    c = *(info->next.srt.srt);
#else
                c = *(info->next.srt.srt);
#endif
                // save the cursor (one past this entry) before returning
                bitmap = bitmap >> 1;
                info->next.srt.srt++;
                info->next.srt.srt_bitmap = bitmap;
                return c;
            }
            bitmap = bitmap >> 1;
            info->next.srt.srt++;
        }
        // bitmap is now zero...
        return NULL;
    }
    else {
        // Large SRT bitmap
        nat i = info->next.large_srt.offset;
        StgWord bitmap;   // NOTE(review): shadows the outer 'bitmap' above

        // Follow the pattern from GC.c:scavenge_large_srt_bitmap().
        bitmap = info->next.large_srt.srt->l.bitmap[i / BITS_IN(W_)];
        bitmap = bitmap >> (i % BITS_IN(StgWord));
        while (i < info->next.large_srt.srt->l.size) {
            if ((bitmap & 1) != 0) {
                c = ((StgClosure **)info->next.large_srt.srt->srt)[i];
                i++;
                info->next.large_srt.offset = i;
                return c;
            }
            i++;
            // reload the bitmap word at word boundaries, otherwise shift
            if (i % BITS_IN(W_) == 0) {
                bitmap = info->next.large_srt.srt->l.bitmap[i / BITS_IN(W_)];
            } else {
                bitmap = bitmap >> 1;
            }
        }
        // reached the end of this bitmap.
        info->next.large_srt.offset = i;
        return NULL;
    }
}
+
+/* -----------------------------------------------------------------------------
+ * push() pushes a stackElement representing the next child of *c
+ * onto the traverse stack. If *c has no child, *first_child is set
+ * to NULL and nothing is pushed onto the stack. If *c has only one
+ * child, *c_child is set to that child and nothing is pushed onto
+ * the stack. If *c has more than two children, *first_child is set
+ * to the first child and a stackElement representing the second
+ * child is pushed onto the stack.
+
+ * Invariants:
+ * *c_child_r is the most recent retainer of *c's children.
+ * *c is not any of TSO, AP, PAP, AP_STACK, which means that
+ * there cannot be any stack objects.
+ * Note: SRTs are considered to be children as well.
+ * -------------------------------------------------------------------------- */
+static INLINE void
+push( StgClosure *c, retainer c_child_r, StgClosure **first_child )
+{
+ stackElement se;
+ bdescr *nbd; // Next Block Descriptor
+
+#ifdef DEBUG_RETAINER
+ // debugBelch("push(): stackTop = 0x%x, currentStackBoundary = 0x%x\n", stackTop, currentStackBoundary);
+#endif
+
+ ASSERT(get_itbl(c)->type != TSO);
+ ASSERT(get_itbl(c)->type != AP_STACK);
+
+ //
+ // fill in se
+ //
+
+ se.c = c;
+ se.c_child_r = c_child_r;
+
+ // fill in se.info
+ switch (get_itbl(c)->type) {
+ // no child, no SRT
+ case CONSTR_0_1:
+ case CONSTR_0_2:
+ case CAF_BLACKHOLE:
+ case BLACKHOLE:
+ case SE_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case ARR_WORDS:
+ *first_child = NULL;
+ return;
+
+ // one child (fixed), no SRT
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY:
+ *first_child = ((StgMutVar *)c)->var;
+ return;
+ case THUNK_SELECTOR:
+ *first_child = ((StgSelector *)c)->selectee;
+ return;
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_OLDGEN:
+ *first_child = ((StgInd *)c)->indirectee;
+ return;
+ case CONSTR_1_0:
+ case CONSTR_1_1:
+ *first_child = c->payload[0];
+ return;
+
+ // For CONSTR_2_0 and MVAR, we use se.info.step to record the position
+ // of the next child. We do not write a separate initialization code.
+ // Also we do not have to initialize info.type;
+
+ // two children (fixed), no SRT
+ // need to push a stackElement, but nothing to store in se.info
+ case CONSTR_2_0:
+ *first_child = c->payload[0]; // return the first pointer
+ // se.info.type = posTypeStep;
+ // se.info.next.step = 2; // 2 = second
+ break;
+
+ // three children (fixed), no SRT
+ // need to push a stackElement
+ case MVAR:
+ // head must be TSO and the head of a linked list of TSOs.
+ // Shoule it be a child? Seems to be yes.
+ *first_child = (StgClosure *)((StgMVar *)c)->head;
+ // se.info.type = posTypeStep;
+ se.info.next.step = 2; // 2 = second
+ break;
+
+ // three children (fixed), no SRT
+ case WEAK:
+ *first_child = ((StgWeak *)c)->key;
+ // se.info.type = posTypeStep;
+ se.info.next.step = 2;
+ break;
+
+ // layout.payload.ptrs, no SRT
+ case CONSTR:
+ case STABLE_NAME:
+ case BCO:
+ case CONSTR_STATIC:
+ init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs,
+ (StgPtr)c->payload);
+ *first_child = find_ptrs(&se.info);
+ if (*first_child == NULL)
+ return; // no child
+ break;
+
+ // StgMutArrPtr.ptrs, no SRT
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+ init_ptrs(&se.info, ((StgMutArrPtrs *)c)->ptrs,
+ (StgPtr)(((StgMutArrPtrs *)c)->payload));
+ *first_child = find_ptrs(&se.info);
+ if (*first_child == NULL)
+ return;
+ break;
+
+ // layout.payload.ptrs, SRT
+ case FUN: // *c is a heap object.
+ case FUN_2_0:
+ init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs, (StgPtr)c->payload);
+ *first_child = find_ptrs(&se.info);
+ if (*first_child == NULL)
+ // no child from ptrs, so check SRT
+ goto fun_srt_only;
+ break;
+
+ case THUNK:
+ case THUNK_2_0:
+ init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs,
+ (StgPtr)((StgThunk *)c)->payload);
+ *first_child = find_ptrs(&se.info);
+ if (*first_child == NULL)
+ // no child from ptrs, so check SRT
+ goto thunk_srt_only;
+ break;
+
+ // 1 fixed child, SRT
+ case FUN_1_0:
+ case FUN_1_1:
+ *first_child = c->payload[0];
+ ASSERT(*first_child != NULL);
+ init_srt_fun(&se.info, get_fun_itbl(c));
+ break;
+
+ case THUNK_1_0:
+ case THUNK_1_1:
+ *first_child = ((StgThunk *)c)->payload[0];
+ ASSERT(*first_child != NULL);
+ init_srt_thunk(&se.info, get_thunk_itbl(c));
+ break;
+
+ case FUN_STATIC: // *c is a heap object.
+ ASSERT(get_itbl(c)->srt_bitmap != 0);
+ case FUN_0_1:
+ case FUN_0_2:
+ fun_srt_only:
+ init_srt_fun(&se.info, get_fun_itbl(c));
+ *first_child = find_srt(&se.info);
+ if (*first_child == NULL)
+ return; // no child
+ break;
+
+ // SRT only
+ case THUNK_STATIC:
+ ASSERT(get_itbl(c)->srt_bitmap != 0);
+ case THUNK_0_1:
+ case THUNK_0_2:
+ thunk_srt_only:
+ init_srt_thunk(&se.info, get_thunk_itbl(c));
+ *first_child = find_srt(&se.info);
+ if (*first_child == NULL)
+ return; // no child
+ break;
+
+ case TVAR_WAIT_QUEUE:
+ *first_child = (StgClosure *)((StgTVarWaitQueue *)c)->waiting_tso;
+ se.info.next.step = 2; // 2 = second
+ break;
+ case TVAR:
+ *first_child = (StgClosure *)((StgTVar *)c)->current_value;
+ break;
+ case TREC_HEADER:
+ *first_child = (StgClosure *)((StgTRecHeader *)c)->enclosing_trec;
+ break;
+ case TREC_CHUNK:
+ *first_child = (StgClosure *)((StgTRecChunk *)c)->prev_chunk;
+ se.info.next.step = 0; // entry no.
+ break;
+
+ // cannot appear
+ case PAP:
+ case AP:
+ case AP_STACK:
+ case TSO:
+ case IND_STATIC:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_NOCAF_STATIC:
+ // stack objects
+ case UPDATE_FRAME:
+ case CATCH_FRAME:
+ case STOP_FRAME:
+ case RET_DYN:
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ // invalid objects
+ case IND:
+ case BLOCKED_FETCH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ case RBH:
+ case REMOTE_REF:
+ case EVACUATED:
+ case INVALID_OBJECT:
+ default:
+ barf("Invalid object *c in push()");
+ return;
+ }
+
+ if (stackTop - 1 < stackBottom) {
+#ifdef DEBUG_RETAINER
+ // debugBelch("push() to the next stack.\n");
+#endif
+ // currentStack->free is updated when the active stack is switched
+ // to the next stack.
+ currentStack->free = (StgPtr)stackTop;
+
+ if (currentStack->link == NULL) {
+ nbd = allocGroup(BLOCKS_IN_STACK);
+ nbd->link = NULL;
+ nbd->u.back = currentStack;
+ currentStack->link = nbd;
+ } else
+ nbd = currentStack->link;
+
+ newStackBlock(nbd);
+ }
+
+ // adjust stackTop (acutal push)
+ stackTop--;
+ // If the size of stackElement was huge, we would better replace the
+ // following statement by either a memcpy() call or a switch statement
+ // on the type of the element. Currently, the size of stackElement is
+ // small enough (5 words) that this direct assignment seems to be enough.
+ *stackTop = se;
+
+#ifdef DEBUG_RETAINER
+ stackSize++;
+ if (stackSize > maxStackSize) maxStackSize = stackSize;
+ // ASSERT(stackSize >= 0);
+ // debugBelch("stackSize = %d\n", stackSize);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * popOff() and popOffReal(): Pop a stackElement off the traverse stack.
+ * Invariants:
+ * stackTop cannot be equal to stackLimit unless the whole stack is
+ * empty, in which case popOff() is not allowed.
+ * Note:
+ * You can think of popOffReal() as a part of popOff() which is
+ * executed at the end of popOff() if necessary. Since popOff() is
+ * likely to be executed quite often while popOffReal() is not, we
+ * separate popOffReal() from popOff(), which is declared as an
+ * INLINE function (for the sake of execution speed). popOffReal()
+ * is called only within popOff() and nowhere else.
+ * -------------------------------------------------------------------------- */
+static void
+popOffReal(void)
+{
+ bdescr *pbd; // Previous Block Descriptor
+
+#ifdef DEBUG_RETAINER
+ // debugBelch("pop() to the previous stack.\n");
+#endif
+
+ // We are popping the last element of the current stack chunk.
+ ASSERT(stackTop + 1 == stackLimit);
+ ASSERT(stackBottom == (stackElement *)currentStack->start);
+
+ if (firstStack == currentStack) {
+ // The stack is completely empty.
+ stackTop++;
+ ASSERT(stackTop == stackLimit);
+#ifdef DEBUG_RETAINER
+ stackSize--;
+ // (updating the max after a decrement is redundant, but harmless)
+ if (stackSize > maxStackSize) maxStackSize = stackSize;
+ /*
+ ASSERT(stackSize >= 0);
+ debugBelch("stackSize = %d\n", stackSize);
+ */
+#endif
+ return;
+ }
+
+ // currentStack->free is updated when the active stack is switched back
+ // to the previous stack.
+ currentStack->free = (StgPtr)stackLimit;
+
+ // find the previous block descriptor
+ pbd = currentStack->u.back;
+ ASSERT(pbd != NULL);
+
+ returnToOldStack(pbd);
+
+#ifdef DEBUG_RETAINER
+ stackSize--;
+ // (updating the max after a decrement is redundant, but harmless)
+ if (stackSize > maxStackSize) maxStackSize = stackSize;
+ /*
+ ASSERT(stackSize >= 0);
+ debugBelch("stackSize = %d\n", stackSize);
+ */
+#endif
+}
+
+// Fast path of the pop operation: simply bump stackTop within the current
+// stack chunk, deferring to popOffReal() only when the chunk is exhausted.
+static INLINE void
+popOff(void) {
+#ifdef DEBUG_RETAINER
+ // debugBelch("\tpopOff(): stackTop = 0x%x, currentStackBoundary = 0x%x\n", stackTop, currentStackBoundary);
+#endif
+
+ ASSERT(stackTop != stackLimit);
+ ASSERT(!isEmptyRetainerStack());
+
+ // <= (instead of <) is wrong!
+ if (stackTop + 1 < stackLimit) {
+ stackTop++;
+#ifdef DEBUG_RETAINER
+ stackSize--;
+ if (stackSize > maxStackSize) maxStackSize = stackSize;
+ /*
+ ASSERT(stackSize >= 0);
+ debugBelch("stackSize = %d\n", stackSize);
+ */
+#endif
+ return;
+ }
+
+ // Popping the last element of the chunk: switch back to the previous one.
+ popOffReal();
+}
+
+/* -----------------------------------------------------------------------------
+ * Finds the next object to be considered for retainer profiling and store
+ * its pointer to *c.
+ * Test if the topmost stack element indicates that more objects are left,
+ * and if so, retrieve the first object and store its pointer to *c. Also,
+ * set *cp and *r appropriately, both of which are stored in the stack element.
+ * The topmost stack element then is overwritten so as for it to now denote
+ * the next object.
+ * If the topmost stack element indicates no more objects are left, pop
+ * off the stack element until either an object can be retrieved or
+ * the current stack chunk becomes empty, indicated by rtsTrue returned by
+ * isOnBoundary(), in which case *c is set to NULL.
+ * Note:
+ * It is okay to call this function even when the current stack chunk
+ * is empty.
+ * -------------------------------------------------------------------------- */
+static INLINE void
+pop( StgClosure **c, StgClosure **cp, retainer *r )
+{
+ stackElement *se;
+
+#ifdef DEBUG_RETAINER
+ // debugBelch("pop(): stackTop = 0x%x, currentStackBoundary = 0x%x\n", stackTop, currentStackBoundary);
+#endif
+
+ // Keep inspecting (and popping) stack elements until one yields a child
+ // or the current stack chunk is exhausted.
+ do {
+ if (isOnBoundary()) { // if the current stack chunk is depleted
+ *c = NULL;
+ return;
+ }
+
+ se = stackTop;
+
+ switch (get_itbl(se->c)->type) {
+ // two children (fixed), no SRT
+ // nothing in se.info
+ case CONSTR_2_0:
+ *c = se->c->payload[1];
+ *cp = se->c;
+ *r = se->c_child_r;
+ popOff();
+ return;
+
+ // three children (fixed), no SRT
+ // need to push a stackElement
+ case MVAR:
+ // se.info.next.step counts which child to visit next:
+ // 2 = tail, 3 = value (the head was returned by push()).
+ if (se->info.next.step == 2) {
+ *c = (StgClosure *)((StgMVar *)se->c)->tail;
+ se->info.next.step++; // move to the next step
+ // no popOff
+ } else {
+ *c = ((StgMVar *)se->c)->value;
+ popOff();
+ }
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+
+ // three children (fixed), no SRT
+ case WEAK:
+ if (se->info.next.step == 2) {
+ *c = ((StgWeak *)se->c)->value;
+ se->info.next.step++;
+ // no popOff
+ } else {
+ *c = ((StgWeak *)se->c)->finalizer;
+ popOff();
+ }
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+
+ case TVAR_WAIT_QUEUE:
+ if (se->info.next.step == 2) {
+ *c = (StgClosure *)((StgTVarWaitQueue *)se->c)->next_queue_entry;
+ se->info.next.step++; // move to the next step
+ // no popOff
+ } else {
+ *c = (StgClosure *)((StgTVarWaitQueue *)se->c)->prev_queue_entry;
+ popOff();
+ }
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+
+ case TVAR:
+ *c = (StgClosure *)((StgTVar *)se->c)->first_wait_queue_entry;
+ *cp = se->c;
+ *r = se->c_child_r;
+ popOff();
+ return;
+
+ case TREC_HEADER:
+ *c = (StgClosure *)((StgTRecHeader *)se->c)->current_chunk;
+ *cp = se->c;
+ *r = se->c_child_r;
+ popOff();
+ return;
+
+ case TREC_CHUNK: {
+ // These are pretty complicated: we have N entries, each
+ // of which contains 3 fields that we want to follow. So
+ // we divide the step counter: the 2 low bits indicate
+ // which field, and the rest of the bits indicate the
+ // entry number (starting from zero).
+ nat entry_no = se->info.next.step >> 2;
+ nat field_no = se->info.next.step & 3;
+ if (entry_no == ((StgTRecChunk *)se->c)->next_entry_idx) {
+ *c = NULL;
+ popOff();
+ return;
+ }
+ TRecEntry *entry = &((StgTRecChunk *)se->c)->entries[entry_no];
+ if (field_no == 0) {
+ *c = (StgClosure *)entry->tvar;
+ } else if (field_no == 1) {
+ *c = entry->expected_value;
+ } else {
+ *c = entry->new_value;
+ }
+ *cp = se->c;
+ *r = se->c_child_r;
+ se->info.next.step++;
+ return;
+ }
+
+ case CONSTR:
+ case STABLE_NAME:
+ case BCO:
+ case CONSTR_STATIC:
+ // StgMutArrPtr.ptrs, no SRT
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+ *c = find_ptrs(&se->info);
+ if (*c == NULL) {
+ // no pointers left in this object; discard the element and
+ // continue with the one below it.
+ popOff();
+ break;
+ }
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+
+ // layout.payload.ptrs, SRT
+ case FUN: // always a heap object
+ case FUN_2_0:
+ if (se->info.type == posTypePtrs) {
+ *c = find_ptrs(&se->info);
+ if (*c != NULL) {
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+ }
+ // pointers exhausted; switch this element over to SRT traversal
+ init_srt_fun(&se->info, get_fun_itbl(se->c));
+ }
+ goto do_srt;
+
+ case THUNK:
+ case THUNK_2_0:
+ if (se->info.type == posTypePtrs) {
+ *c = find_ptrs(&se->info);
+ if (*c != NULL) {
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+ }
+ init_srt_thunk(&se->info, get_thunk_itbl(se->c));
+ }
+ goto do_srt;
+
+ // SRT
+ do_srt:
+ case THUNK_STATIC:
+ case FUN_STATIC:
+ case FUN_0_1:
+ case FUN_0_2:
+ case THUNK_0_1:
+ case THUNK_0_2:
+ case FUN_1_0:
+ case FUN_1_1:
+ case THUNK_1_0:
+ case THUNK_1_1:
+ *c = find_srt(&se->info);
+ if (*c != NULL) {
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+ }
+ popOff();
+ break;
+
+ // no child (fixed), no SRT
+ case CONSTR_0_1:
+ case CONSTR_0_2:
+ case CAF_BLACKHOLE:
+ case BLACKHOLE:
+ case SE_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case ARR_WORDS:
+ // one child (fixed), no SRT
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY:
+ case THUNK_SELECTOR:
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_OLDGEN:
+ case CONSTR_1_1:
+ // cannot appear
+ case PAP:
+ case AP:
+ case AP_STACK:
+ case TSO:
+ case IND_STATIC:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_NOCAF_STATIC:
+ // stack objects
+ case RET_DYN:
+ case UPDATE_FRAME:
+ case CATCH_FRAME:
+ case STOP_FRAME:
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ // invalid objects
+ case IND:
+ case BLOCKED_FETCH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ case RBH:
+ case REMOTE_REF:
+ case EVACUATED:
+ case INVALID_OBJECT:
+ default:
+ // The above closure types never have a stackElement pushed for
+ // them (see push()), so finding one here is a bug.
+ barf("Invalid object *c in pop()");
+ return;
+ }
+ } while (rtsTrue);
+}
+
+/* -----------------------------------------------------------------------------
+ * RETAINER PROFILING ENGINE
+ * -------------------------------------------------------------------------- */
+
+// Prepare for retainer profiling: reset the retainer-set machinery and
+// start counting profiling passes from zero.
+void
+initRetainerProfiling( void )
+{
+ initializeAllRetainerSet();
+ retainerGeneration = 0;
+}
+
+/* -----------------------------------------------------------------------------
+ * This function must be called before f-closing prof_file.
+ * -------------------------------------------------------------------------- */
+void
+endRetainerProfiling( void )
+{
+#ifdef SECOND_APPROACH
+ // With the second approach, all retainer sets are emitted in one go at
+ // the end of the run instead of incrementally during each census.
+ outputAllRetainerSet(prof_file);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * Ensures that the retainer-set field of *c is valid with respect to the
+ * current value of flip.
+ * Side effects:
+ * RSET(c) is reset to NULL if its current value does not
+ * conform to flip (i.e. *c has not yet been visited in this pass).
+ * Note:
+ * We must check conformance to flip each time a closure is examined,
+ * because we do not know when a closure was visited last: the flip bit
+ * is what distinguishes stale retainer sets from current ones.
+ * -------------------------------------------------------------------------- */
+static INLINE void
+maybeInitRetainerSet( StgClosure *c )
+{
+ if (!isRetainerSetFieldValid(c)) {
+ setRetainerSetToNull(c);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Returns rtsTrue if *c is a retainer.
+ * (The result decides, in retainClosure(), whether *c itself becomes the
+ * most recent retainer of its children or whether that role is inherited.)
+ * -------------------------------------------------------------------------- */
+static INLINE rtsBool
+isRetainer( StgClosure *c )
+{
+ switch (get_itbl(c)->type) {
+ //
+ // True case
+ //
+ // TSOs MUST be retainers: they constitute the set of roots.
+ case TSO:
+
+ // mutable objects
+ case MVAR:
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY:
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+
+ // thunks are retainers.
+ case THUNK:
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_2_0:
+ case THUNK_1_1:
+ case THUNK_0_2:
+ case THUNK_SELECTOR:
+ case AP:
+ case AP_STACK:
+
+ // Static thunks, or CAFS, are obviously retainers.
+ case THUNK_STATIC:
+
+ // WEAK objects are roots; there is separate code in which traversing
+ // begins from WEAK objects.
+ case WEAK:
+
+ // Since the other mutvar-type things are retainers, seems
+ // like the right thing to do:
+ case TVAR:
+ return rtsTrue;
+
+ //
+ // False case
+ //
+
+ // constructors
+ case CONSTR:
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_2_0:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ // functions
+ case FUN:
+ case FUN_1_0:
+ case FUN_0_1:
+ case FUN_2_0:
+ case FUN_1_1:
+ case FUN_0_2:
+ // partial applications
+ case PAP:
+ // blackholes
+ case CAF_BLACKHOLE:
+ case BLACKHOLE:
+ case SE_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ // indirection
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_OLDGEN:
+ // static objects
+ case CONSTR_STATIC:
+ case FUN_STATIC:
+ // misc
+ case STABLE_NAME:
+ case BCO:
+ case ARR_WORDS:
+ // STM
+ case TVAR_WAIT_QUEUE:
+ case TREC_HEADER:
+ case TREC_CHUNK:
+ return rtsFalse;
+
+ //
+ // Error case
+ //
+ // IND_STATIC cannot be *c, *cp, *r in the retainer profiling loop.
+ case IND_STATIC:
+ // CONSTR_INTLIKE, CONSTR_CHARLIKE, and CONSTR_NOCAF_STATIC
+ // cannot be *c, *cp, *r in the retainer profiling loop.
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_NOCAF_STATIC:
+ // Stack objects are invalid because they are never treated as
+ // legal objects during retainer profiling.
+ case UPDATE_FRAME:
+ case CATCH_FRAME:
+ case STOP_FRAME:
+ case RET_DYN:
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ // other cases
+ case IND:
+ case BLOCKED_FETCH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ case RBH:
+ case REMOTE_REF:
+ case EVACUATED:
+ case INVALID_OBJECT:
+ default:
+ barf("Invalid object in isRetainer(): %d", get_itbl(c)->type);
+ return rtsFalse;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Returns the retainer function value for the closure *c, i.e., R(*c).
+ * This function does NOT return the retainer(s) of *c.
+ * Invariants:
+ * *c must be a retainer.
+ * Note:
+ * Depending on the definition of this function, the maintenance of retainer
+ * sets can be made easier. If most retainer sets are likely to be created
+ * again across garbage collections, refreshAllRetainerSet() in
+ * RetainerSet.c can simply do nothing.
+ * If this is not the case, we can free all the retainer sets and
+ * re-initialize the hash table.
+ * See refreshAllRetainerSet() in RetainerSet.c.
+ * -------------------------------------------------------------------------- */
+static INLINE retainer
+getRetainerFrom( StgClosure *c )
+{
+ ASSERT(isRetainer(c));
+
+#if defined(RETAINER_SCHEME_INFO)
+ // Retainer scheme 1: retainer = info table
+ return get_itbl(c);
+#elif defined(RETAINER_SCHEME_CCS)
+ // Retainer scheme 2: retainer = cost centre stack
+ return c->header.prof.ccs;
+#elif defined(RETAINER_SCHEME_CC)
+ // Retainer scheme 3: retainer = cost centre
+ return c->header.prof.ccs->cc;
+#endif
+ // NOTE(review): if none of the RETAINER_SCHEME_* macros is defined,
+ // control falls off the end of a non-void function (undefined
+ // behaviour) -- presumably the build system guarantees exactly one
+ // scheme is selected; verify.
+}
+
+/* -----------------------------------------------------------------------------
+ * Associates the retainer set *s with the closure *c, that is, *s becomes
+ * the retainer set of *c.
+ * Invariants:
+ * c != NULL
+ * s != NULL
+ * -------------------------------------------------------------------------- */
+static INLINE void
+associate( StgClosure *c, RetainerSet *s )
+{
+ // StgWord has the same size as pointers, so the following type
+ // casting is okay.
+ // The current flip bit is or-ed into the stored word so that the field
+ // is recognized as valid for this pass (cf. maybeInitRetainerSet()).
+ RSET(c) = (RetainerSet *)((StgWord)s | flip);
+}
+
+/* -----------------------------------------------------------------------------
+ Call retainClosure for each of the closures covered by a large bitmap.
+ In stack-frame bitmaps a CLEAR bit marks a pointer word (note the opposite
+ convention in SRT bitmaps, cf. retain_large_srt_bitmap() below).
+ -------------------------------------------------------------------------- */
+
+static void
+retain_large_bitmap (StgPtr p, StgLargeBitmap *large_bitmap, nat size,
+ StgClosure *c, retainer c_child_r)
+{
+ nat i, b;
+ StgWord bitmap;
+
+ b = 0;
+ bitmap = large_bitmap->bitmap[b];
+ for (i = 0; i < size; ) {
+ if ((bitmap & 1) == 0) {
+ retainClosure((StgClosure *)*p, c, c_child_r);
+ }
+ i++;
+ p++;
+ // move to the next word of the bitmap when this one is exhausted
+ if (i % BITS_IN(W_) == 0) {
+ b++;
+ bitmap = large_bitmap->bitmap[b];
+ } else {
+ bitmap = bitmap >> 1;
+ }
+ }
+}
+
+// Walk 'size' words starting at p, retaining each word whose bitmap bit is
+// clear (clear = pointer, as in retain_large_bitmap() above). Returns the
+// first word past the scanned region.
+static INLINE StgPtr
+retain_small_bitmap (StgPtr p, nat size, StgWord bitmap,
+ StgClosure *c, retainer c_child_r)
+{
+ while (size > 0) {
+ if ((bitmap & 1) == 0) {
+ retainClosure((StgClosure *)*p, c, c_child_r);
+ }
+ p++;
+ bitmap = bitmap >> 1;
+ size--;
+ }
+ return p;
+}
+
+/* -----------------------------------------------------------------------------
+ * Call retainClosure for each of the closures in an SRT.
+ * In SRT bitmaps a SET bit marks a live entry (the opposite of stack-frame
+ * bitmaps above).
+ * ------------------------------------------------------------------------- */
+
+static void
+retain_large_srt_bitmap (StgLargeSRT *srt, StgClosure *c, retainer c_child_r)
+{
+ nat i, b, size;
+ StgWord bitmap;
+ StgClosure **p;
+
+ b = 0;
+ p = (StgClosure **)srt->srt;
+ size = srt->l.size;
+ bitmap = srt->l.bitmap[b];
+ for (i = 0; i < size; ) {
+ if ((bitmap & 1) != 0) {
+ retainClosure((StgClosure *)*p, c, c_child_r);
+ }
+ i++;
+ p++;
+ // move to the next word of the bitmap when this one is exhausted
+ if (i % BITS_IN(W_) == 0) {
+ b++;
+ bitmap = srt->l.bitmap[b];
+ } else {
+ bitmap = bitmap >> 1;
+ }
+ }
+}
+
+// Retain every closure referenced from a (small or large) SRT.
+// srt points at the SRT entries; srt_bitmap marks which entries are live.
+// A bitmap of all ones denotes a large SRT, whose real size and bitmap live
+// in an out-of-line StgLargeSRT structure.
+//
+// BUG FIX: the original loop incremented the cursor p but dereferenced
+// *srt on every iteration, so only the FIRST SRT entry was ever retained
+// (repeatedly). We now dereference *p, matching the way
+// retain_large_srt_bitmap() (and scavenge_srt() in GC.c) advance through
+// the entries in step with the bitmap.
+static INLINE void
+retainSRT (StgClosure **srt, nat srt_bitmap, StgClosure *c, retainer c_child_r)
+{
+ nat bitmap;
+ StgClosure **p;
+
+ bitmap = srt_bitmap;
+ p = srt;
+
+ if (bitmap == (StgHalfWord)(-1)) {
+ retain_large_srt_bitmap( (StgLargeSRT *)srt, c, c_child_r );
+ return;
+ }
+
+ while (bitmap != 0) {
+ if ((bitmap & 1) != 0) {
+#ifdef ENABLE_WIN32_DLL_SUPPORT
+ // On Windows DLLs a tagged (odd) SRT entry denotes one extra
+ // level of indirection through an import table.
+ if ( (unsigned long)(*p) & 0x1 ) {
+ retainClosure(*stgCast(StgClosure**,(stgCast(unsigned long, *p) & ~0x1)),
+ c, c_child_r);
+ } else {
+ retainClosure(*p,c,c_child_r);
+ }
+#else
+ retainClosure(*p,c,c_child_r);
+#endif
+ }
+ p++;
+ bitmap = bitmap >> 1;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Process all the objects in the stack chunk from stackStart to stackEnd
+ * with *c and *c_child_r being their parent and their most recent retainer,
+ * respectively. Treat stackOptionalFun as another child of *c if it is
+ * not NULL.
+ * Invariants:
+ * *c is one of the following: TSO, AP_STACK.
+ * If *c is TSO, c == c_child_r.
+ * stackStart < stackEnd.
+ * RSET(c) and RSET(c_child_r) are valid, i.e., their
+ * interpretation conforms to the current value of flip (even when they
+ * are interpreted to be NULL).
+ * If *c is TSO, its state is not any of ThreadRelocated, ThreadComplete,
+ * or ThreadKilled, which means that its stack is ready to process.
+ * Note:
+ * This code was almost plagiarized from GC.c! For each pointer,
+ * retainClosure() is invoked instead of evacuate().
+ * -------------------------------------------------------------------------- */
+static void
+retainStack( StgClosure *c, retainer c_child_r,
+ StgPtr stackStart, StgPtr stackEnd )
+{
+ stackElement *oldStackBoundary;
+ StgPtr p;
+ StgRetInfoTable *info;
+ StgWord32 bitmap;
+ nat size;
+
+#ifdef DEBUG_RETAINER
+ cStackSize++;
+ if (cStackSize > maxCStackSize) maxCStackSize = cStackSize;
+#endif
+
+ /*
+ Each invocation of retainStack() creates a new virtual
+ stack. Since all such stacks share a single common stack, we
+ record the current currentStackBoundary, which will be restored
+ at the exit.
+ */
+ oldStackBoundary = currentStackBoundary;
+ currentStackBoundary = stackTop;
+
+#ifdef DEBUG_RETAINER
+ // debugBelch("retainStack() called: oldStackBoundary = 0x%x, currentStackBoundary = 0x%x\n", oldStackBoundary, currentStackBoundary);
+#endif
+
+ ASSERT(get_itbl(c)->type != TSO ||
+ (((StgTSO *)c)->what_next != ThreadRelocated &&
+ ((StgTSO *)c)->what_next != ThreadComplete &&
+ ((StgTSO *)c)->what_next != ThreadKilled));
+
+ // Walk the stack frame by frame, dispatching on the frame type.
+ p = stackStart;
+ while (p < stackEnd) {
+ info = get_ret_itbl((StgClosure *)p);
+
+ switch(info->i.type) {
+
+ case UPDATE_FRAME:
+ retainClosure(((StgUpdateFrame *)p)->updatee, c, c_child_r);
+ p += sizeofW(StgUpdateFrame);
+ continue;
+
+ case STOP_FRAME:
+ case CATCH_FRAME:
+ case CATCH_STM_FRAME:
+ case CATCH_RETRY_FRAME:
+ case ATOMICALLY_FRAME:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ // small bitmap: pointers identified by an inline bitmap
+ bitmap = BITMAP_BITS(info->i.layout.bitmap);
+ size = BITMAP_SIZE(info->i.layout.bitmap);
+ p++;
+ p = retain_small_bitmap(p, size, bitmap, c, c_child_r);
+
+ follow_srt:
+ retainSRT((StgClosure **)GET_SRT(info), info->i.srt_bitmap, c, c_child_r);
+ continue;
+
+ case RET_BCO: {
+ StgBCO *bco;
+
+ p++;
+ retainClosure((StgClosure *)*p, c, c_child_r);
+ bco = (StgBCO *)*p;
+ p++;
+ size = BCO_BITMAP_SIZE(bco);
+ retain_large_bitmap(p, BCO_BITMAP(bco), size, c, c_child_r);
+ p += size;
+ continue;
+ }
+
+ // large bitmap (> 32 entries, or > 64 on a 64-bit machine)
+ case RET_BIG:
+ case RET_VEC_BIG:
+ size = GET_LARGE_BITMAP(&info->i)->size;
+ p++;
+ retain_large_bitmap(p, GET_LARGE_BITMAP(&info->i),
+ size, c, c_child_r);
+ p += size;
+ // and don't forget to follow the SRT
+ goto follow_srt;
+
+ // Dynamic bitmap: the mask is stored on the stack
+ case RET_DYN: {
+ StgWord dyn;
+ dyn = ((StgRetDyn *)p)->liveness;
+
+ // traverse the bitmap first
+ bitmap = RET_DYN_LIVENESS(dyn);
+ p = (P_)&((StgRetDyn *)p)->payload[0];
+ size = RET_DYN_BITMAP_SIZE;
+ p = retain_small_bitmap(p, size, bitmap, c, c_child_r);
+
+ // skip over the non-ptr words
+ p += RET_DYN_NONPTRS(dyn) + RET_DYN_NONPTR_REGS_SIZE;
+
+ // follow the ptr words
+ for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
+ retainClosure((StgClosure *)*p, c, c_child_r);
+ p++;
+ }
+ continue;
+ }
+
+ case RET_FUN: {
+ StgRetFun *ret_fun = (StgRetFun *)p;
+ StgFunInfoTable *fun_info;
+
+ retainClosure(ret_fun->fun, c, c_child_r);
+ fun_info = get_fun_itbl(ret_fun->fun);
+
+ // the layout of the payload is determined by the function's
+ // argument pattern
+ p = (P_)&ret_fun->payload;
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+ size = BITMAP_SIZE(fun_info->f.b.bitmap);
+ p = retain_small_bitmap(p, size, bitmap, c, c_child_r);
+ break;
+ case ARG_GEN_BIG:
+ size = GET_FUN_LARGE_BITMAP(fun_info)->size;
+ retain_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info),
+ size, c, c_child_r);
+ p += size;
+ break;
+ default:
+ // standard argument pattern: use the precomputed bitmap
+ bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+ size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
+ p = retain_small_bitmap(p, size, bitmap, c, c_child_r);
+ break;
+ }
+ goto follow_srt;
+ }
+
+ default:
+ barf("Invalid object found in retainStack(): %d",
+ (int)(info->i.type));
+ }
+ }
+
+ // restore currentStackBoundary
+ currentStackBoundary = oldStackBoundary;
+#ifdef DEBUG_RETAINER
+ // debugBelch("retainStack() finished: currentStackBoundary = 0x%x\n", currentStackBoundary);
+#endif
+
+#ifdef DEBUG_RETAINER
+ cStackSize--;
+#endif
+}
+
+/* ----------------------------------------------------------------------------
+ * Call retainClosure for each of the children of a PAP/AP.
+ * 'fun' is the applied function, 'payload' the pending arguments (n_args
+ * words); the argument layout is decoded from fun's info table. Returns
+ * the first word past the payload.
+ * ------------------------------------------------------------------------- */
+
+static INLINE StgPtr
+retain_PAP_payload (StgClosure *pap, retainer c_child_r, StgClosure *fun,
+ StgClosure** payload, StgWord n_args)
+{
+ StgPtr p;
+ StgWord bitmap;
+ StgFunInfoTable *fun_info;
+
+ retainClosure(fun, pap, c_child_r);
+ fun_info = get_fun_itbl(fun);
+ ASSERT(fun_info->i.type != PAP);
+
+ p = (StgPtr)payload;
+
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+ p = retain_small_bitmap(p, n_args, bitmap,
+ pap, c_child_r);
+ break;
+ case ARG_GEN_BIG:
+ retain_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info),
+ n_args, pap, c_child_r);
+ p += n_args;
+ break;
+ case ARG_BCO:
+ // byte-code object: the argument bitmap lives in the BCO itself
+ retain_large_bitmap((StgPtr)payload, BCO_BITMAP(fun),
+ n_args, pap, c_child_r);
+ p += n_args;
+ break;
+ default:
+ // standard argument pattern: use the precomputed bitmap
+ bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+ p = retain_small_bitmap(p, n_args, bitmap, pap, c_child_r);
+ break;
+ }
+ return p;
+}
+
+/* -----------------------------------------------------------------------------
+ * Compute the retainer set of *c0 and all its desecents by traversing.
+ * *cp0 is the parent of *c0, and *r0 is the most recent retainer of *c0.
+ * Invariants:
+ * c0 = cp0 = r0 holds only for root objects.
+ * RSET(cp0) and RSET(r0) are valid, i.e., their
+ * interpretation conforms to the current value of flip (even when they
+ * are interpreted to be NULL).
+ * However, RSET(c0) may be corrupt, i.e., it may not conform to
+ * the current value of flip. If it does not, during the execution
+ * of this function, RSET(c0) must be initialized as well as all
+ * its descendants.
+ * Note:
+ * stackTop must be the same at the beginning and the exit of this function.
+ * *c0 can be TSO (as well as AP_STACK).
+ * -------------------------------------------------------------------------- */
+static void
+retainClosure( StgClosure *c0, StgClosure *cp0, retainer r0 )
+{
+ // c = Current closure
+ // cp = Current closure's Parent
+ // r = current closures' most recent Retainer
+ // c_child_r = current closure's children's most recent retainer
+ // first_child = first child of c
+ StgClosure *c, *cp, *first_child;
+ RetainerSet *s, *retainerSetOfc;
+ retainer r, c_child_r;
+ StgWord typeOfc;
+
+#ifdef DEBUG_RETAINER
+ // StgPtr oldStackTop;
+#endif
+
+#ifdef DEBUG_RETAINER
+ // oldStackTop = stackTop;
+ // debugBelch("retainClosure() called: c0 = 0x%x, cp0 = 0x%x, r0 = 0x%x\n", c0, cp0, r0);
+#endif
+
+ // (c, cp, r) = (c0, cp0, r0)
+ c = c0;
+ cp = cp0;
+ r = r0;
+ goto inner_loop;
+
+loop:
+ //debugBelch("loop");
+ // pop to (c, cp, r);
+ pop(&c, &cp, &r);
+
+ if (c == NULL) {
+#ifdef DEBUG_RETAINER
+ // debugBelch("retainClosure() ends: oldStackTop = 0x%x, stackTop = 0x%x\n", oldStackTop, stackTop);
+#endif
+ return;
+ }
+
+ //debugBelch("inner_loop");
+
+inner_loop:
+ // c = current closure under consideration,
+ // cp = current closure's parent,
+ // r = current closure's most recent retainer
+ //
+ // Loop invariants (on the meaning of c, cp, r, and their retainer sets):
+ // RSET(cp) and RSET(r) are valid.
+ // RSET(c) is valid only if c has been visited before.
+ //
+ // Loop invariants (on the relation between c, cp, and r)
+ // if cp is not a retainer, r belongs to RSET(cp).
+ // if cp is a retainer, r == cp.
+
+ typeOfc = get_itbl(c)->type;
+
+#ifdef DEBUG_RETAINER
+ switch (typeOfc) {
+ case IND_STATIC:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_NOCAF_STATIC:
+ case CONSTR_STATIC:
+ case THUNK_STATIC:
+ case FUN_STATIC:
+ break;
+ default:
+ if (retainerSetOf(c) == NULL) { // first visit?
+ costArray[typeOfc] += cost(c);
+ sumOfNewCost += cost(c);
+ }
+ break;
+ }
+#endif
+
+ // special cases
+ switch (typeOfc) {
+ case TSO:
+ if (((StgTSO *)c)->what_next == ThreadComplete ||
+ ((StgTSO *)c)->what_next == ThreadKilled) {
+#ifdef DEBUG_RETAINER
+ debugBelch("ThreadComplete or ThreadKilled encountered in retainClosure()\n");
+#endif
+ goto loop;
+ }
+ if (((StgTSO *)c)->what_next == ThreadRelocated) {
+#ifdef DEBUG_RETAINER
+ debugBelch("ThreadRelocated encountered in retainClosure()\n");
+#endif
+ c = (StgClosure *)((StgTSO *)c)->link;
+ goto inner_loop;
+ }
+ break;
+
+ case IND_STATIC:
+ // We just skip IND_STATIC, so its retainer set is never computed.
+ c = ((StgIndStatic *)c)->indirectee;
+ goto inner_loop;
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ // static objects with no pointers out, so goto loop.
+ case CONSTR_NOCAF_STATIC:
+ // It is not just enough not to compute the retainer set for *c; it is
+ // mandatory because CONSTR_NOCAF_STATIC are not reachable from
+ // scavenged_static_objects, the list from which is assumed to traverse
+ // all static objects after major garbage collections.
+ goto loop;
+ case THUNK_STATIC:
+ case FUN_STATIC:
+ if (get_itbl(c)->srt_bitmap == 0) {
+ // No need to compute the retainer set; no dynamic objects
+ // are reachable from *c.
+ //
+ // Static objects: if we traverse all the live closures,
+ // including static closures, during each heap census then
+ // we will observe that some static closures appear and
+ // disappear. eg. a closure may contain a pointer to a
+ // static function 'f' which is not otherwise reachable
+ // (it doesn't indirectly point to any CAFs, so it doesn't
+ // appear in any SRTs), so we would find 'f' during
+ // traversal. However on the next sweep there may be no
+ // closures pointing to 'f'.
+ //
+ // We must therefore ignore static closures whose SRT is
+ // empty, because these are exactly the closures that may
+ // "appear". A closure with a non-empty SRT, and which is
+ // still required, will always be reachable.
+ //
+ // But what about CONSTR_STATIC? Surely these may be able
+ // to appear, and they don't have SRTs, so we can't
+ // check. So for now, we're calling
+ // resetStaticObjectForRetainerProfiling() from the
+ // garbage collector to reset the retainer sets in all the
+ // reachable static objects.
+ goto loop;
+ }
+ default:
+ break;
+ }
+
+ // The above objects are ignored in computing the average number of times
+ // an object is visited.
+ timesAnyObjectVisited++;
+
+ // If this is the first visit to c, initialize its retainer set.
+ maybeInitRetainerSet(c);
+ retainerSetOfc = retainerSetOf(c);
+
+ // Now compute s:
+ // isRetainer(cp) == rtsTrue => s == NULL
+ // isRetainer(cp) == rtsFalse => s == cp.retainer
+ if (isRetainer(cp))
+ s = NULL;
+ else
+ s = retainerSetOf(cp);
+
+ // (c, cp, r, s) is available.
+
+ // (c, cp, r, s, R_r) is available, so compute the retainer set for *c.
+ if (retainerSetOfc == NULL) {
+ // This is the first visit to *c.
+ numObjectVisited++;
+
+ if (s == NULL)
+ associate(c, singleton(r));
+ else
+ // s is actually the retainer set of *c!
+ associate(c, s);
+
+ // compute c_child_r
+ c_child_r = isRetainer(c) ? getRetainerFrom(c) : r;
+ } else {
+ // This is not the first visit to *c.
+ if (isMember(r, retainerSetOfc))
+ goto loop; // no need to process child
+
+ if (s == NULL)
+ associate(c, addElement(r, retainerSetOfc));
+ else {
+ // s is not NULL and cp is not a retainer. This means that
+ // each time *cp is visited, so is *c. Thus, if s has
+ // exactly one more element in its retainer set than c, s
+ // is also the new retainer set for *c.
+ if (s->num == retainerSetOfc->num + 1) {
+ associate(c, s);
+ }
+ // Otherwise, just add R_r to the current retainer set of *c.
+ else {
+ associate(c, addElement(r, retainerSetOfc));
+ }
+ }
+
+ if (isRetainer(c))
+ goto loop; // no need to process child
+
+ // compute c_child_r
+ c_child_r = r;
+ }
+
+ // now, RSET() of all of *c, *cp, and *r is valid.
+ // (c, c_child_r) are available.
+
+ // process child
+
+ // Special case closures: we process these all in one go rather
+ // than attempting to save the current position, because doing so
+ // would be hard.
+ switch (typeOfc) {
+ case TSO:
+ retainStack(c, c_child_r,
+ ((StgTSO *)c)->sp,
+ ((StgTSO *)c)->stack + ((StgTSO *)c)->stack_size);
+ goto loop;
+
+ case PAP:
+ {
+ StgPAP *pap = (StgPAP *)c;
+ retain_PAP_payload(c, c_child_r, pap->fun, pap->payload, pap->n_args);
+ goto loop;
+ }
+
+ case AP:
+ {
+ StgAP *ap = (StgAP *)c;
+ retain_PAP_payload(c, c_child_r, ap->fun, ap->payload, ap->n_args);
+ goto loop;
+ }
+
+ case AP_STACK:
+ retainClosure(((StgAP_STACK *)c)->fun, c, c_child_r);
+ retainStack(c, c_child_r,
+ (StgPtr)((StgAP_STACK *)c)->payload,
+ (StgPtr)((StgAP_STACK *)c)->payload +
+ ((StgAP_STACK *)c)->size);
+ goto loop;
+ }
+
+ push(c, c_child_r, &first_child);
+
+ // If first_child is null, c has no child.
+ // If first_child is not null, the top stack element points to the next
+ // object. push() may or may not push a stackElement on the stack.
+ if (first_child == NULL)
+ goto loop;
+
+ // (c, cp, r) = (first_child, c, c_child_r)
+ r = c_child_r;
+ cp = c;
+ c = first_child;
+ goto inner_loop;
+}
+
+/* -----------------------------------------------------------------------------
+ * Compute the retainer set for every object reachable from *tl.
+ * -------------------------------------------------------------------------- */
+static void
+retainRoot( StgClosure **tl )
+{
+    // We no longer assume that only TSOs and WEAKs are roots; any closure can
+    // be a root.
+
+    // Each root begins a fresh traversal, so the traverse stack must be empty
+    // and the C-stack boundary is re-anchored here.
+    ASSERT(isEmptyRetainerStack());
+    currentStackBoundary = stackTop;
+
+    // A root serves as its own parent; if it is itself a retainer it also
+    // seeds the initial retainer, otherwise CCS_SYSTEM is used.
+    if (*tl != &stg_END_TSO_QUEUE_closure && isRetainer(*tl)) {
+        retainClosure(*tl, *tl, getRetainerFrom(*tl));
+    } else {
+        retainClosure(*tl, *tl, CCS_SYSTEM);
+    }
+
+    // NOT TRUE: ASSERT(isMember(getRetainerFrom(*tl), retainerSetOf(*tl)));
+    // *tl might be a TSO which is ThreadComplete, in which
+    // case we ignore it for the purposes of retainer profiling.
+}
+
+/* -----------------------------------------------------------------------------
+ * Compute the retainer set for each of the objects in the heap.
+ * -------------------------------------------------------------------------- */
+static void
+computeRetainerSet( void )
+{
+    StgWeak *weak;
+    RetainerSet *rtl;
+    nat g;
+    StgPtr ml;
+    bdescr *bd;
+#ifdef DEBUG_RETAINER
+    RetainerSet tmpRetainerSet;
+#endif
+
+    GetRoots(retainRoot);	// for scheduler roots
+
+    // This function is called after a major GC, when key, value, and finalizer
+    // all are guaranteed to be valid, or reachable.
+    //
+    // The following code assumes that WEAK objects are considered to be roots
+    // for retainer profiling.
+    for (weak = weak_ptr_list; weak != NULL; weak = weak->link)
+	// retainRoot((StgClosure *)weak);
+	// Pass the address of the local 'weak' pointer: retainRoot only
+	// dereferences its argument once, so &weak is a valid StgClosure**.
+	retainRoot((StgClosure **)&weak);
+
+    // Consider roots from the stable ptr table.
+    markStablePtrTable(retainRoot);
+
+    // The following code resets the rs field of each unvisited mutable
+    // object (computing sumOfNewCostExtra and updating costArray[] when
+    // debugging retainer profiler).
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+	// NOT TRUE: even G0 has a block on its mutable list
+	// ASSERT(g != 0 || (generations[g].mut_list == NULL));
+
+	// Traversing through mut_list is necessary
+	// because we can find MUT_VAR objects which have not been
+	// visited during retainer profiling.
+	for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
+	    for (ml = bd->start; ml < bd->free; ml++) {
+
+		maybeInitRetainerSet((StgClosure *)*ml);
+		rtl = retainerSetOf((StgClosure *)*ml);
+
+#ifdef DEBUG_RETAINER
+		if (rtl == NULL) {
+		    // first visit to *ml
+		    // This is a violation of the interface rule!
+		    RSET(ml) = (RetainerSet *)((StgWord)(&tmpRetainerSet) | flip);
+
+		    switch (get_itbl((StgClosure *)ml)->type) {
+		    case IND_STATIC:
+			// no cost involved
+			break;
+		    case CONSTR_INTLIKE:
+		    case CONSTR_CHARLIKE:
+		    case CONSTR_NOCAF_STATIC:
+		    case CONSTR_STATIC:
+		    case THUNK_STATIC:
+		    case FUN_STATIC:
+			barf("Invalid object in computeRetainerSet(): %d", get_itbl((StgClosure*)ml)->type);
+			break;
+		    default:
+			// dynamic objects
+			costArray[get_itbl((StgClosure *)ml)->type] += cost((StgClosure *)ml);
+			sumOfNewCostExtra += cost((StgClosure *)ml);
+			break;
+		    }
+		}
+#endif
+	    }
+	}
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ * Traverse all static objects for which we compute retainer sets,
+ * and reset their rs fields to NULL, which is accomplished by
+ * invoking maybeInitRetainerSet(). This function must be called
+ * before zeroing all objects reachable from scavenged_static_objects
+ * in the case of major garbage collections. See GarbageCollect() in
+ * GC.c.
+ * Note:
+ * The mut_once_list of the oldest generation must also be traversed?
+ * Why? Because if the evacuation of an object pointed to by a static
+ * indirection object fails, it is put back to the mut_once_list of
+ * the oldest generation.
+ * However, this is not necessary because any static indirection objects
+ * are just traversed through to reach dynamic objects. In other words,
+ * they are not taken into consideration in computing retainer sets.
+ * -------------------------------------------------------------------------- */
+void
+resetStaticObjectForRetainerProfiling( void )
+{
+#ifdef DEBUG_RETAINER
+    nat count;
+#endif
+    StgClosure *p;
+
+#ifdef DEBUG_RETAINER
+    count = 0;
+#endif
+    // Walk the scavenged_static_objects list left by the GC and reset the
+    // retainer-set field of every static object we profile.
+    p = scavenged_static_objects;
+    while (p != END_OF_STATIC_LIST) {
+#ifdef DEBUG_RETAINER
+	count++;
+#endif
+	switch (get_itbl(p)->type) {
+	case IND_STATIC:
+	    // Since we do not compute the retainer set of any
+	    // IND_STATIC object, we don't have to reset its retainer
+	    // field.
+	    p = (StgClosure*)*IND_STATIC_LINK(p);
+	    break;
+	case THUNK_STATIC:
+	    maybeInitRetainerSet(p);
+	    p = (StgClosure*)*THUNK_STATIC_LINK(p);
+	    break;
+	case FUN_STATIC:
+	    maybeInitRetainerSet(p);
+	    p = (StgClosure*)*FUN_STATIC_LINK(p);
+	    break;
+	case CONSTR_STATIC:
+	    maybeInitRetainerSet(p);
+	    p = (StgClosure*)*STATIC_LINK(get_itbl(p), p);
+	    break;
+	default:
+	    // The closure type is a small integer, so it must be printed with
+	    // %d; the original "%s" passed an int where a string was expected,
+	    // which is undefined behaviour if this branch is ever reached.
+	    barf("resetStaticObjectForRetainerProfiling: %p (%d)",
+		 p, get_itbl(p)->type);
+	    break;
+	}
+    }
+#ifdef DEBUG_RETAINER
+    // debugBelch("count in scavenged_static_objects = %d\n", count);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * Perform retainer profiling.
+ * N is the oldest generation being profiled, where the generations are
+ * numbered starting at 0.
+ * Invariants:
+ * Note:
+ * This function should be called only immediately after major garbage
+ * collection.
+ * ------------------------------------------------------------------------- */
+void
+retainerProfile(void)
+{
+#ifdef DEBUG_RETAINER
+  nat i;
+  nat totalHeapSize;        // total raw heap size (computed by linear scanning)
+#endif
+
+#ifdef DEBUG_RETAINER
+  debugBelch(" < retainerProfile() invoked : %d>\n", retainerGeneration);
+#endif
+
+  stat_startRP();
+
+  // We haven't flipped the bit yet.
+#ifdef DEBUG_RETAINER
+  debugBelch("Before traversing:\n");
+  sumOfCostLinear = 0;
+  for (i = 0;i < N_CLOSURE_TYPES; i++)
+    costArrayLinear[i] = 0;
+  totalHeapSize = checkHeapSanityForRetainerProfiling();
+
+  debugBelch("\tsumOfCostLinear = %d, totalHeapSize = %d\n", sumOfCostLinear, totalHeapSize);
+  /*
+  debugBelch("costArrayLinear[] = ");
+  for (i = 0;i < N_CLOSURE_TYPES; i++)
+    debugBelch("[%u:%u] ", i, costArrayLinear[i]);
+  debugBelch("\n");
+  */
+
+  ASSERT(sumOfCostLinear == totalHeapSize);
+
+/*
+#define pcostArrayLinear(index) \
+  if (costArrayLinear[index] > 0) \
+    debugBelch("costArrayLinear[" #index "] = %u\n", costArrayLinear[index])
+  pcostArrayLinear(THUNK_STATIC);
+  pcostArrayLinear(FUN_STATIC);
+  pcostArrayLinear(CONSTR_STATIC);
+  pcostArrayLinear(CONSTR_NOCAF_STATIC);
+  pcostArrayLinear(CONSTR_INTLIKE);
+  pcostArrayLinear(CONSTR_CHARLIKE);
+*/
+#endif
+
+  // Now we flip the 'flip' bit: every retainer-set field tagged with the
+  // old value instantly becomes "unvisited" for this census.
+  flip = flip ^ 1;
+
+#ifdef DEBUG_RETAINER
+  stackSize = 0;
+  maxStackSize = 0;
+  cStackSize = 0;
+  maxCStackSize = 0;
+#endif
+  numObjectVisited = 0;
+  timesAnyObjectVisited = 0;
+
+#ifdef DEBUG_RETAINER
+  debugBelch("During traversing:\n");
+  sumOfNewCost = 0;
+  sumOfNewCostExtra = 0;
+  for (i = 0;i < N_CLOSURE_TYPES; i++)
+    costArray[i] = 0;
+#endif
+
+  /*
+    We initialize the traverse stack each time the retainer profiling is
+    performed (because the traverse stack size varies on each retainer profiling
+    and this operation is not costly anyhow). However, we just refresh the
+    retainer sets.
+   */
+  initializeTraverseStack();
+#ifdef DEBUG_RETAINER
+  initializeAllRetainerSet();
+#else
+  refreshAllRetainerSet();
+#endif
+  computeRetainerSet();
+
+#ifdef DEBUG_RETAINER
+  debugBelch("After traversing:\n");
+  sumOfCostLinear = 0;
+  for (i = 0;i < N_CLOSURE_TYPES; i++)
+    costArrayLinear[i] = 0;
+  totalHeapSize = checkHeapSanityForRetainerProfiling();
+
+  debugBelch("\tsumOfCostLinear = %d, totalHeapSize = %d\n", sumOfCostLinear, totalHeapSize);
+  ASSERT(sumOfCostLinear == totalHeapSize);
+
+  // now, compare the two results
+  /*
+    Note:
+      costArray[] must be exactly the same as costArrayLinear[].
+      Known exceptions:
+        1) Dead weak pointers, whose type is CONSTR. These objects are not
+           reachable from any roots.
+  */
+  debugBelch("Comparison:\n");
+  debugBelch("\tcostArrayLinear[] (must be empty) = ");
+  for (i = 0;i < N_CLOSURE_TYPES; i++)
+    if (costArray[i] != costArrayLinear[i])
+      // nothing should be printed except MUT_VAR after major GCs
+      debugBelch("[%u:%u] ", i, costArrayLinear[i]);
+  debugBelch("\n");
+
+  debugBelch("\tsumOfNewCost = %u\n", sumOfNewCost);
+  debugBelch("\tsumOfNewCostExtra = %u\n", sumOfNewCostExtra);
+  debugBelch("\tcostArray[] (must be empty) = ");
+  for (i = 0;i < N_CLOSURE_TYPES; i++)
+    if (costArray[i] != costArrayLinear[i])
+      // nothing should be printed except MUT_VAR after major GCs
+      debugBelch("[%u:%u] ", i, costArray[i]);
+  debugBelch("\n");
+
+  // only for major garbage collection
+  ASSERT(sumOfNewCost + sumOfNewCostExtra == sumOfCostLinear);
+#endif
+
+  // post-processing
+  closeTraverseStack();
+#ifdef DEBUG_RETAINER
+  closeAllRetainerSet();
+#else
+  // Note that there is no post-processing for the retainer sets.
+#endif
+  retainerGeneration++;
+
+  stat_endRP(
+    retainerGeneration - 1,   // retainerGeneration has just been incremented!
+#ifdef DEBUG_RETAINER
+    maxCStackSize, maxStackSize,
+#endif
+    // NOTE(review): if no object was visited this divides 0 by 0 in double
+    // arithmetic (NaN) — confirm stat_endRP tolerates that.
+    (double)timesAnyObjectVisited / numObjectVisited);
+}
+
+/* -----------------------------------------------------------------------------
+ * DEBUGGING CODE
+ * -------------------------------------------------------------------------- */
+
+#ifdef DEBUG_RETAINER
+
+#define LOOKS_LIKE_PTR(r) ((LOOKS_LIKE_STATIC_CLOSURE(r) || \
+ ((HEAP_ALLOCED(r) && ((Bdescr((P_)r)->flags & BF_FREE) == 0)))) && \
+ ((StgWord)(*(StgPtr)r)!=0xaaaaaaaa))
+
+// Debug check on a single heap closure: asserts it looks like a valid,
+// non-static, live closure; reports it if it was not visited during the
+// current census (its RSET tag bit disagrees with 'flip'); returns its
+// size in words so callers can advance through the heap linearly.
+static nat
+sanityCheckHeapClosure( StgClosure *c )
+{
+    // (removed unused local 'StgInfoTable *info;')
+    ASSERT(LOOKS_LIKE_GHC_INFO(c->header.info));
+    ASSERT(!closure_STATIC(c));
+    ASSERT(LOOKS_LIKE_PTR(c));
+
+    if ((((StgWord)RSET(c) & 1) ^ flip) != 0) {
+	// Unvisited. Dead weak pointers are a known, tolerated exception;
+	// their cost is accounted separately.
+	if (get_itbl(c)->type == CONSTR &&
+	    !strcmp(get_itbl(c)->prof.closure_type, "DEAD_WEAK") &&
+	    !strcmp(get_itbl(c)->prof.closure_desc, "DEAD_WEAK")) {
+	    debugBelch("\tUnvisited dead weak pointer object found: c = %p\n", c);
+	    costArray[get_itbl(c)->type] += cost(c);
+	    sumOfNewCost += cost(c);
+	} else
+	    debugBelch(
+		    "Unvisited object: flip = %d, c = %p(%d, %s, %s), rs = %p\n",
+		    flip, c, get_itbl(c)->type,
+		    get_itbl(c)->prof.closure_type, get_itbl(c)->prof.closure_desc,
+		    RSET(c));
+    } else {
+	// debugBelch("sanityCheckHeapClosure) S: flip = %d, c = %p(%d), rs = %p\n", flip, c, get_itbl(c)->type, RSET(c));
+    }
+
+    return closure_sizeW(c);
+}
+
+// Linearly scan a chain of blocks, sanity-checking every closure and
+// accumulating sizes into sumOfCostLinear / costArrayLinear[].
+// Returns the total number of words in the chain.
+static nat
+heapCheck( bdescr *bd )
+{
+    StgPtr p;
+    // NOTE(review): 'static' locals here appear deliberate (debug-only code,
+    // perhaps for debugger visibility) — confirm before changing.
+    static nat costSum, size;
+
+    costSum = 0;
+    while (bd != NULL) {
+	p = bd->start;
+	while (p < bd->free) {
+	    size = sanityCheckHeapClosure((StgClosure *)p);
+	    sumOfCostLinear += size;
+	    costArrayLinear[get_itbl((StgClosure *)p)->type] += size;
+	    p += size;
+	    // no need for slop check; I think slops are not used currently.
+	}
+	ASSERT(p == bd->free);
+	costSum += bd->free - bd->start;
+	bd = bd->link;
+    }
+
+    return costSum;
+}
+
+// Like heapCheck(), but for the small-object allocation pool: the first
+// block is only filled up to the current allocation pointer alloc_Hp,
+// while the remaining blocks are scanned up to bd->free.
+static nat
+smallObjectPoolCheck(void)
+{
+    bdescr *bd;
+    StgPtr p;
+    // NOTE(review): 'static' locals appear deliberate in this debug-only
+    // code — confirm before changing.
+    static nat costSum, size;
+
+    bd = small_alloc_list;
+    costSum = 0;
+
+    // first block
+    if (bd == NULL)
+	return costSum;
+
+    p = bd->start;
+    while (p < alloc_Hp) {
+	size = sanityCheckHeapClosure((StgClosure *)p);
+	sumOfCostLinear += size;
+	costArrayLinear[get_itbl((StgClosure *)p)->type] += size;
+	p += size;
+    }
+    ASSERT(p == alloc_Hp);
+    costSum += alloc_Hp - bd->start;
+
+    bd = bd->link;
+    while (bd != NULL) {
+	p = bd->start;
+	while (p < bd->free) {
+	    size = sanityCheckHeapClosure((StgClosure *)p);
+	    sumOfCostLinear += size;
+	    costArrayLinear[get_itbl((StgClosure *)p)->type] += size;
+	    p += size;
+	}
+	ASSERT(p == bd->free);
+	costSum += bd->free - bd->start;
+	bd = bd->link;
+    }
+
+    return costSum;
+}
+
+// Sanity-check a chain of large-object blocks: each block holds exactly
+// one closure at bd->start. Returns the summed closure sizes in words.
+static nat
+chainCheck(bdescr *bd)
+{
+    nat costSum, size;
+
+    costSum = 0;
+    while (bd != NULL) {
+	// bd->free - bd->start is not an accurate measurement of the
+	// object size.  Actually it is always zero, so we compute its
+	// size explicitly.
+	size = sanityCheckHeapClosure((StgClosure *)bd->start);
+	sumOfCostLinear += size;
+	costArrayLinear[get_itbl((StgClosure *)bd->start)->type] += size;
+	costSum += size;
+	bd = bd->link;
+    }
+
+    return costSum;
+}
+
+// Linearly scan the whole heap (all generations and steps, plus large
+// objects and the small-object pool) for debugging; returns total heap
+// size in words, accumulated via heapCheck/chainCheck/smallObjectPoolCheck.
+static nat
+checkHeapSanityForRetainerProfiling( void )
+{
+    nat costSum, g, s;
+
+    costSum = 0;
+    debugBelch("START: sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+    if (RtsFlags.GcFlags.generations == 1) {
+	costSum += heapCheck(g0s0->to_blocks);
+	debugBelch("heapCheck: sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+	costSum += chainCheck(g0s0->large_objects);
+	debugBelch("chainCheck: sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+    } else {
+	for (g = 0; g < RtsFlags.GcFlags.generations; g++)
+	for (s = 0; s < generations[g].n_steps; s++) {
+	    /*
+	      After all live objects have been scavenged, the garbage
+	      collector may create some objects in
+	      scheduleFinalizers(). These objects are created through
+	      allocate(), so the small object pool or the large object
+	      pool of the g0s0 may not be empty.
+	    */
+	    if (g == 0 && s == 0) {
+		costSum += smallObjectPoolCheck();
+		debugBelch("smallObjectPoolCheck(): sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+		costSum += chainCheck(generations[g].steps[s].large_objects);
+		debugBelch("chainCheck(): sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+	    } else {
+		costSum += heapCheck(generations[g].steps[s].blocks);
+		debugBelch("heapCheck(): sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+		costSum += chainCheck(generations[g].steps[s].large_objects);
+		debugBelch("chainCheck(): sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+	    }
+	}
+    }
+
+    return costSum;
+}
+
+// Debug aid: scan every word of the heap (ordinary blocks and large
+// objects) for occurrences of the word p, and report each hit together
+// with the start of the enclosing closure (found by walking backwards
+// until a plausible info pointer is seen).
+void
+findPointer(StgPtr p)
+{
+    StgPtr q, r, e;
+    bdescr *bd;
+    nat g, s;
+
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+	for (s = 0; s < generations[g].n_steps; s++) {
+	    // if (g == 0 && s == 0) continue;
+	    bd = generations[g].steps[s].blocks;
+	    for (; bd; bd = bd->link) {
+		for (q = bd->start; q < bd->free; q++) {
+		    if (*q == (StgWord)p) {
+			r = q;
+			// back up to what looks like the closure's info pointer
+			while (!LOOKS_LIKE_GHC_INFO(*r)) r--;
+			debugBelch("Found in gen[%d], step[%d]: q = %p, r = %p\n", g, s, q, r);
+			// return;
+		    }
+		}
+	    }
+	    bd = generations[g].steps[s].large_objects;
+	    for (; bd; bd = bd->link) {
+		e = bd->start + cost((StgClosure *)bd->start);
+		for (q = bd->start; q < e; q++) {
+		    if (*q == (StgWord)p) {
+			r = q;
+			while (*r == 0 || !LOOKS_LIKE_GHC_INFO(*r)) r--;
+			debugBelch("Found in gen[%d], large_objects: %p\n", g, r);
+			// return;
+		    }
+		}
+	    }
+	}
+    }
+}
+
+// Debug aid: report which generation/step (or large-object list) the
+// address p falls into, if any. Prints at most one location and returns.
+static void
+belongToHeap(StgPtr p)
+{
+    bdescr *bd;
+    nat g, s;
+
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+	for (s = 0; s < generations[g].n_steps; s++) {
+	    // if (g == 0 && s == 0) continue;
+	    bd = generations[g].steps[s].blocks;
+	    for (; bd; bd = bd->link) {
+		if (bd->start <= p && p < bd->free) {
+		    debugBelch("Belongs to gen[%d], step[%d]", g, s);
+		    return;
+		}
+	    }
+	    bd = generations[g].steps[s].large_objects;
+	    for (; bd; bd = bd->link) {
+		if (bd->start <= p && p < bd->start + getHeapClosureSize((StgClosure *)bd->start)) {
+		    debugBelch("Found in gen[%d], large_objects: %p\n", g, bd->start);
+		    return;
+		}
+	    }
+	}
+    }
+}
+#endif /* DEBUG_RETAINER */
+
+#endif /* PROFILING */
diff --git a/rts/RetainerProfile.h b/rts/RetainerProfile.h
new file mode 100644
index 0000000000..827daa8ef4
--- /dev/null
+++ b/rts/RetainerProfile.h
@@ -0,0 +1,47 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Retainer profiling interface.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef RETAINERPROFILE_H
+#define RETAINERPROFILE_H
+
+#ifdef PROFILING
+
+#include "RetainerSet.h"
+
+extern void initRetainerProfiling ( void );
+extern void endRetainerProfiling ( void );
+extern void printRetainer ( FILE *, retainer );
+extern void retainerProfile ( void );
+extern void resetStaticObjectForRetainerProfiling ( void );
+
+extern StgWord RTS_VAR(flip);
+
+// extract the retainer set field from c
+#define RSET(c) ((c)->header.prof.hp.rs)
+
+#define isRetainerSetFieldValid(c) \
+ ((((StgWord)(c)->header.prof.hp.rs & 1) ^ flip) == 0)
+
+// Return the retainer set of closure c, removing the 'flip' tag bit that
+// marks the field as valid for the current census. The field must be
+// valid (see isRetainerSetFieldValid).
+static inline RetainerSet *
+retainerSetOf( StgClosure *c )
+{
+    ASSERT( isRetainerSetFieldValid(c) );
+    // StgWord has the same size as pointers, so the following type
+    // casting is okay.
+    return (RetainerSet *)((StgWord)RSET(c) ^ flip);
+}
+
+// Used by Storage.c:memInventory()
+#ifdef DEBUG
+extern lnat retainerStackBlocks ( void );
+#endif
+
+#endif /* PROFILING */
+
+#endif /* RETAINERPROFILE_H */
diff --git a/rts/RetainerSet.c b/rts/RetainerSet.c
new file mode 100644
index 0000000000..bfa0bc8acf
--- /dev/null
+++ b/rts/RetainerSet.c
@@ -0,0 +1,498 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Retainer set implementation for retainer profiling (see RetainerProfile.c)
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef PROFILING
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "Stats.h"
+#include "RtsUtils.h"
+#include "RetainerSet.h"
+#include "Arena.h"
+#include "Profiling.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#define HASH_TABLE_SIZE 255
+#define hash(hk) (hk % HASH_TABLE_SIZE)
+static RetainerSet *hashTable[HASH_TABLE_SIZE];
+
+static Arena *arena; // arena in which we store retainer sets
+
+static int nextId; // id of next retainer set
+
+/* -----------------------------------------------------------------------------
+ * rs_MANY is a distinguished retainer set, such that
+ *
+ * isMember(e, rs_MANY) = True
+ *
+ * addElement(e, rs) = rs_MANY, if rs->num >= maxRetainerSetSize
+ * addElement(e, rs_MANY) = rs_MANY
+ *
+ * The point of rs_MANY is to keep the total number of retainer sets
+ * from growing too large.
+ * -------------------------------------------------------------------------- */
+// NOTE(review): this uses the old GNU "field : value" initializer syntax
+// (equivalent to C99 ".field = value") and initializes the flexible array
+// member 'element' — both GCC extensions.
+RetainerSet rs_MANY = {
+    num : 0,
+    hashKey : 0,
+    link : NULL,
+    id : 1,       // ids of ordinary sets start at 2 (see nextId)
+    element : {}
+};
+
+/* -----------------------------------------------------------------------------
+ * calculate the size of a RetainerSet structure
+ * -------------------------------------------------------------------------- */
+// Size in bytes of a RetainerSet holding 'elems' retainers, accounting
+// for the trailing flexible 'element[]' array.
+STATIC_INLINE size_t
+sizeofRetainerSet( int elems )
+{
+    return (sizeof(RetainerSet) + elems * sizeof(retainer));
+}
+
+/* -----------------------------------------------------------------------------
+ * Creates the first pool and initializes hashTable[].
+ * Frees all pools if any.
+ * -------------------------------------------------------------------------- */
+void
+initializeAllRetainerSet(void)
+{
+    int i;
+
+    // All retainer sets are allocated from this single arena.
+    arena = newArena();
+
+    for (i = 0; i < HASH_TABLE_SIZE; i++)
+	hashTable[i] = NULL;
+    nextId = 2;	// Initial value must be positive, 2 is MANY.
+}
+
+/* -----------------------------------------------------------------------------
+ * Refreshes all pools for reuse and initializes hashTable[].
+ * -------------------------------------------------------------------------- */
+void
+refreshAllRetainerSet(void)
+{
+// Without FIRST_APPROACH this function is deliberately a no-op: existing
+// retainer sets (and their ids) are carried over between censuses.
+#ifdef FIRST_APPROACH
+    int i;
+
+    // first approach: completely refresh
+    arenaFree(arena);
+    arena = newArena();
+
+    for (i = 0; i < HASH_TABLE_SIZE; i++)
+	hashTable[i] = NULL;
+    nextId = 2;
+#endif /* FIRST_APPROACH */
+}
+
+/* -----------------------------------------------------------------------------
+ * Frees all pools.
+ * -------------------------------------------------------------------------- */
+void
+closeAllRetainerSet(void)
+{
+    // Discarding the backing arena frees every retainer set at once.
+    arenaFree(arena);
+}
+
+/* -----------------------------------------------------------------------------
+ * Finds or creates if needed a singleton retainer set.
+ * -------------------------------------------------------------------------- */
+RetainerSet *
+singleton(retainer r)
+{
+    RetainerSet *rs;
+    StgWord hk;
+
+    // Probe the hash chain for an existing set {r}.
+    hk = hashKeySingleton(r);
+    for (rs = hashTable[hash(hk)]; rs != NULL; rs = rs->link)
+	if (rs->num == 1 &&  rs->element[0] == r) return rs;  // found it
+
+    // create it
+    rs = arenaAlloc( arena, sizeofRetainerSet(1) );
+    rs->num = 1;
+    rs->hashKey = hk;
+    rs->link = hashTable[hash(hk)];
+    rs->id = nextId++;
+    rs->element[0] = r;
+
+    // The new retainer set is placed at the head of the linked list.
+    hashTable[hash(hk)] = rs;
+
+    return rs;
+}
+
+/* -----------------------------------------------------------------------------
+ * Finds or creates a retainer set *rs augmented with r.
+ * Invariants:
+ * r is not a member of rs, i.e., isMember(r, rs) returns rtsFalse.
+ * rs is not NULL.
+ * Note:
+ * We could check if rs is NULL, in which case this function call
+ * reverts to singleton(). We do not choose this strategy because
+ * in most cases addElement() is invoked with non-NULL rs.
+ * -------------------------------------------------------------------------- */
+RetainerSet *
+addElement(retainer r, RetainerSet *rs)
+{
+    nat i;
+    nat nl;	// Number of retainers in *rs Less than r
+    RetainerSet *nrs;	// New Retainer Set
+    StgWord hk;	// Hash Key
+
+#ifdef DEBUG_RETAINER
+    // debugBelch("addElement(%p, %p) = ", r, rs);
+#endif
+
+    ASSERT(rs != NULL);
+    ASSERT(rs->num <= RtsFlags.ProfFlags.maxRetainerSetSize);
+
+    // Saturate to the distinguished rs_MANY set once the size cap is hit,
+    // which bounds the total number of distinct retainer sets.
+    if (rs == &rs_MANY || rs->num == RtsFlags.ProfFlags.maxRetainerSetSize) {
+	return &rs_MANY;
+    }
+
+    ASSERT(!isMember(r, rs));
+
+    // Elements are kept sorted; find r's insertion position.
+    for (nl = 0; nl < rs->num; nl++)
+	if (r < rs->element[nl]) break;
+    // Now nl is the index for r into the new set.
+    // Also it denotes the number of retainers less than r in *rs.
+    // Thus, compare the first nl retainers, then r itself, and finally the
+    // remaining (rs->num - nl) retainers.
+
+    hk = hashKeyAddElement(r, rs);
+    for (nrs = hashTable[hash(hk)]; nrs != NULL; nrs = nrs->link) {
+	// test *rs and *nrs for equality
+
+	// check their size
+	if (rs->num + 1 != nrs->num) continue;
+
+	// compare the first nl retainers and find the first non-matching one.
+	for (i = 0; i < nl; i++)
+	    if (rs->element[i] != nrs->element[i]) break;
+	if (i < nl) continue;
+
+	// compare r itself
+	if (r != nrs->element[i]) continue;       // i == nl
+
+	// compare the remaining retainers
+	for (; i < rs->num; i++)
+	    if (rs->element[i] != nrs->element[i + 1]) break;
+	if (i < rs->num) continue;
+
+#ifdef DEBUG_RETAINER
+	// debugBelch("%p\n", nrs);
+#endif
+	// The set we are seeking already exists!
+	return nrs;
+    }
+
+    // create a new retainer set
+    nrs = arenaAlloc( arena, sizeofRetainerSet(rs->num + 1) );
+    nrs->num = rs->num + 1;
+    nrs->hashKey = hk;
+    nrs->link = hashTable[hash(hk)];
+    nrs->id = nextId++;
+    for (i = 0; i < nl; i++) {              // copy the first nl retainers
+	nrs->element[i] = rs->element[i];
+    }
+    nrs->element[i] = r;                    // copy r
+    for (; i < rs->num; i++) {              // copy the remaining retainers
+	nrs->element[i + 1] = rs->element[i];
+    }
+
+    hashTable[hash(hk)] = nrs;
+
+#ifdef DEBUG_RETAINER
+    // debugBelch("%p\n", nrs);
+#endif
+    return nrs;
+}
+
+/* -----------------------------------------------------------------------------
+ * Call f() for each retainer set.
+ * -------------------------------------------------------------------------- */
+void
+traverseAllRetainerSet(void (*f)(RetainerSet *))
+{
+    int bucket;
+    RetainerSet *cursor;
+
+    // The distinguished rs_MANY set lives outside the hash table, so it
+    // is visited explicitly first.
+    (*f)(&rs_MANY);
+
+    // Then walk every chain of every hash bucket.
+    for (bucket = 0; bucket < HASH_TABLE_SIZE; bucket++) {
+        for (cursor = hashTable[bucket]; cursor != NULL; cursor = cursor->link) {
+            (*f)(cursor);
+        }
+    }
+}
+
+
+/* -----------------------------------------------------------------------------
+ * printRetainer() prints the full information on a given retainer,
+ * not a retainer set.
+ * -------------------------------------------------------------------------- */
+// Exactly one of the following printRetainer definitions is compiled,
+// chosen by the RETAINER_SCHEME_* macro selected in RetainerSet.h.
+#if defined(RETAINER_SCHEME_INFO)
+// Retainer scheme 1: retainer = info table
+void
+printRetainer(FILE *f, retainer itbl)
+{
+    fprintf(f, "%s[%s]", itbl->prof.closure_desc, itbl->prof.closure_type);
+}
+#elif defined(RETAINER_SCHEME_CCS)
+// Retainer scheme 2: retainer = cost centre stack
+void
+printRetainer(FILE *f, retainer ccs)
+{
+    fprintCCS(f, ccs);
+}
+#elif defined(RETAINER_SCHEME_CC)
+// Retainer scheme 3: retainer = cost centre
+void
+printRetainer(FILE *f, retainer cc)
+{
+    fprintf(f,"%s.%s", cc->module, cc->label);
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * printRetainerSetShort() should always display the same output for
+ * a given retainer set regardless of the time of invocation.
+ * -------------------------------------------------------------------------- */
+#ifdef SECOND_APPROACH
+// printRetainerSetShort(): render a retainer set as "(id)desc1,desc2,..."
+// truncated to MAX_RETAINER_SET_SPACE characters.
+// Fix: the result is printed via fprintf(f, "%s", tmp) rather than
+// fprintf(f, tmp) — closure descriptions / cost-centre labels may contain
+// '%' characters, which would otherwise be interpreted as conversion
+// specifiers (format-string bug).
+#if defined(RETAINER_SCHEME_INFO)
+// Retainer scheme 1: retainer = info table
+void
+printRetainerSetShort(FILE *f, RetainerSet *rs)
+{
+#define MAX_RETAINER_SET_SPACE  24
+    char tmp[MAX_RETAINER_SET_SPACE + 1];
+    int size;
+    nat j;
+
+    ASSERT(rs->id < 0);
+
+    tmp[MAX_RETAINER_SET_SPACE] = '\0';
+
+    // No blank characters are allowed.
+    sprintf(tmp + 0, "(%d)", -(rs->id));
+    size = strlen(tmp);
+    ASSERT(size < MAX_RETAINER_SET_SPACE);
+
+    for (j = 0; j < rs->num; j++) {
+	if (j < rs->num - 1) {
+	    strncpy(tmp + size, rs->element[j]->prof.closure_desc, MAX_RETAINER_SET_SPACE - size);
+	    size = strlen(tmp);
+	    if (size == MAX_RETAINER_SET_SPACE)
+		break;
+	    strncpy(tmp + size, ",", MAX_RETAINER_SET_SPACE - size);
+	    size = strlen(tmp);
+	    if (size == MAX_RETAINER_SET_SPACE)
+		break;
+	}
+	else {
+	    strncpy(tmp + size, rs->element[j]->prof.closure_desc, MAX_RETAINER_SET_SPACE - size);
+	    // size = strlen(tmp);
+	}
+    }
+    fprintf(f, "%s", tmp);
+}
+#elif defined(RETAINER_SCHEME_CC)
+// Retainer scheme 3: retainer = cost centre
+// NOTE(review): this branch is an empty stub and is also shadowed by the
+// second RETAINER_SCHEME_CC branch below; it is dead code under the
+// default RETAINER_SCHEME_CCS configuration.
+void
+printRetainerSetShort(FILE *f, RetainerSet *rs)
+{
+#define MAX_RETAINER_SET_SPACE  24
+    char tmp[MAX_RETAINER_SET_SPACE + 1];
+    int size;
+    nat j;
+
+}
+#elif defined(RETAINER_SCHEME_CCS)
+// Retainer scheme 2: retainer = cost centre stack
+void
+printRetainerSetShort(FILE *f, RetainerSet *rs)
+{
+#define MAX_RETAINER_SET_SPACE  24
+    char tmp[MAX_RETAINER_SET_SPACE + 1];
+    int size;
+    nat j;
+
+    ASSERT(rs->id < 0);
+
+    tmp[MAX_RETAINER_SET_SPACE] = '\0';
+
+    // No blank characters are allowed.
+    sprintf(tmp + 0, "(%d)", -(rs->id));
+    size = strlen(tmp);
+    ASSERT(size < MAX_RETAINER_SET_SPACE);
+
+    for (j = 0; j < rs->num; j++) {
+	if (j < rs->num - 1) {
+	    strncpy(tmp + size, rs->element[j]->cc->label, MAX_RETAINER_SET_SPACE - size);
+	    size = strlen(tmp);
+	    if (size == MAX_RETAINER_SET_SPACE)
+		break;
+	    strncpy(tmp + size, ",", MAX_RETAINER_SET_SPACE - size);
+	    size = strlen(tmp);
+	    if (size == MAX_RETAINER_SET_SPACE)
+		break;
+	}
+	else {
+	    strncpy(tmp + size, rs->element[j]->cc->label, MAX_RETAINER_SET_SPACE - size);
+	    // size = strlen(tmp);
+	}
+    }
+    fprintf(f, "%s", tmp);
+}
+#elif defined(RETAINER_SCHEME_CC)
+// Retainer scheme 3: retainer = cost centre
+static void
+printRetainerSetShort(FILE *f, retainerSet *rs)
+{
+#define MAX_RETAINER_SET_SPACE  24
+    char tmp[MAX_RETAINER_SET_SPACE + 1];
+    int size;
+    nat j;
+
+    ASSERT(rs->id < 0);
+
+    tmp[MAX_RETAINER_SET_SPACE] = '\0';
+
+    // No blank characters are allowed.
+    sprintf(tmp + 0, "(%d)", -(rs->id));
+    size = strlen(tmp);
+    ASSERT(size < MAX_RETAINER_SET_SPACE);
+
+    for (j = 0; j < rs->num; j++) {
+	if (j < rs->num - 1) {
+	    strncpy(tmp + size, rs->element[j]->label,
+		    MAX_RETAINER_SET_SPACE - size);
+	    size = strlen(tmp);
+	    if (size == MAX_RETAINER_SET_SPACE)
+		break;
+	    strncpy(tmp + size, ",", MAX_RETAINER_SET_SPACE - size);
+	    size = strlen(tmp);
+	    if (size == MAX_RETAINER_SET_SPACE)
+		break;
+	}
+	else {
+	    strncpy(tmp + size, rs->element[j]->label,
+		    MAX_RETAINER_SET_SPACE - size);
+	    // size = strlen(tmp);
+	}
+    }
+    fprintf(f, "%s", tmp);
+/*
+  #define MAX_RETAINER_SET_SPACE 24
+  #define DOT_NUMBER 3
+  // 1. 32 > MAX_RETAINER_SET_SPACE + 1 (1 for '\0')
+  // 2. (MAX_RETAINER_SET_SPACE - DOT_NUMBER ) characters should be enough for
+  // printing one natural number (plus '(' and ')').
+  char tmp[32];
+  int size, ts;
+  nat j;
+
+  ASSERT(rs->id < 0);
+
+  // No blank characters are allowed.
+  sprintf(tmp + 0, "(%d)", -(rs->id));
+  size = strlen(tmp);
+  ASSERT(size < MAX_RETAINER_SET_SPACE - DOT_NUMBER);
+
+  for (j = 0; j < rs->num; j++) {
+    ts = strlen(rs->element[j]->label);
+    if (j < rs->num - 1) {
+      if (size + ts + 1 > MAX_RETAINER_SET_SPACE - DOT_NUMBER) {
+        sprintf(tmp + size, "...");
+        break;
+      }
+      sprintf(tmp + size, "%s,", rs->element[j]->label);
+      size += ts + 1;
+    }
+    else {
+      if (size + ts > MAX_RETAINER_SET_SPACE - DOT_NUMBER) {
+        sprintf(tmp + size, "...");
+        break;
+      }
+      sprintf(tmp + size, "%s", rs->element[j]->label);
+      size += ts;
+    }
+  }
+  fprintf(f, tmp);
+*/
+}
+#endif /* RETAINER_SCHEME_CC */
+#endif /* SECOND_APPROACH */
+
+/* -----------------------------------------------------------------------------
+ * Dump the contents of each retainer set into the log file at the end
+ * of the run, so the user can find out for a given retainer set ID
+ * the full contents of that set.
+ * --------------------------------------------------------------------------- */
+#ifdef SECOND_APPROACH
+void
+outputAllRetainerSet(FILE *prof_file)
+{
+    nat i, j;
+    nat numSet;
+    RetainerSet *rs, **rsArray, *tmp;
+
+    // find out the number of retainer sets which have had a non-zero cost at
+    // least once during retainer profiling
+    // (a negative id marks such a set — presumably negated when first
+    // reported; confirm against the census code in Profiling.c)
+    numSet = 0;
+    for (i = 0; i < HASH_TABLE_SIZE; i++)
+	for (rs = hashTable[i]; rs != NULL; rs = rs->link) {
+	    if (rs->id < 0)
+		numSet++;
+	}
+
+    if (numSet == 0)      // retainer profiling was not done at all.
+	return;
+
+    // allocate memory
+    rsArray = stgMallocBytes(numSet * sizeof(RetainerSet *),
+			     "outputAllRetainerSet()");
+
+    // prepare for sorting
+    j = 0;
+    for (i = 0; i < HASH_TABLE_SIZE; i++)
+	for (rs = hashTable[i]; rs != NULL; rs = rs->link) {
+	    if (rs->id < 0) {
+		rsArray[j] = rs;
+		j++;
+	    }
+	}
+
+    ASSERT(j == numSet);
+
+    // sort rsArray[] according to the id of each retainer set
+    // (simple bubble sort; ids are negative so this yields ascending -id)
+    for (i = numSet - 1; i > 0; i--) {
+	for (j = 0; j <= i - 1; j++) {
+	    // if (-(rsArray[j]->id) < -(rsArray[j + 1]->id))
+	    if (rsArray[j]->id < rsArray[j + 1]->id) {
+		tmp = rsArray[j];
+		rsArray[j] = rsArray[j + 1];
+		rsArray[j + 1] = tmp;
+	    }
+	}
+    }
+
+    fprintf(prof_file, "\nRetainer sets created during profiling:\n");
+    for (i = 0;i < numSet; i++) {
+	// NOTE(review): -(rsArray[i]->id) is an int passed for "%u";
+	// same width everywhere, but "%d" would match the type exactly.
+	fprintf(prof_file, "SET %u = {", -(rsArray[i]->id));
+	for (j = 0; j < rsArray[i]->num - 1; j++) {
+	    printRetainer(prof_file, rsArray[i]->element[j]);
+	    fprintf(prof_file, ", ");
+	}
+	printRetainer(prof_file, rsArray[i]->element[j]);
+	fprintf(prof_file, "}\n");
+    }
+
+    stgFree(rsArray);
+}
+#endif /* SECOND_APPROACH */
+
+#endif /* PROFILING */
diff --git a/rts/RetainerSet.h b/rts/RetainerSet.h
new file mode 100644
index 0000000000..6a00e1395e
--- /dev/null
+++ b/rts/RetainerSet.h
@@ -0,0 +1,201 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Retainer set interface for retainer profiling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef RETAINERSET_H
+#define RETAINERSET_H
+
+#include <stdio.h>
+
+#ifdef PROFILING
+
+/*
+ Type 'retainer' defines the retainer identity.
+
+ Invariant:
+ 1. The retainer identity of a given retainer cannot change during
+ program execution, no matter where it is actually stored.
+ For instance, the memory address of a retainer cannot be used as
+ its retainer identity because its location may change during garbage
+ collections.
+ 2. Type 'retainer' must come with comparison operations as well as
+ an equality operation. That is, <, >, and == must be supported -
+ this is necessary to store retainers in a sorted order in retainer sets.
+ Therefore, you cannot use a huge structure type as 'retainer', for instance.
+
+ We illustrate three possibilities of defining 'retainer identity'.
+ Choose one of the following three compiler directives:
+
+ Retainer scheme 1 (RETAINER_SCHEME_INFO) : retainer = info table
+ Retainer scheme 2 (RETAINER_SCHEME_CCS) : retainer = cost centre stack
+ Retainer scheme 3 (RETAINER_SCHEME_CC) : retainer = cost centre
+*/
+
+// #define RETAINER_SCHEME_INFO
+#define RETAINER_SCHEME_CCS
+// #define RETAINER_SCHEME_CC
+
+#ifdef RETAINER_SCHEME_INFO
+struct _StgInfoTable;
+typedef struct _StgInfoTable *retainer;
+#endif
+
+#ifdef RETAINER_SCHEME_CCS
+typedef CostCentreStack *retainer;
+#endif
+
+#ifdef RETAINER_SCHEME_CC
+typedef CostCentre *retainer;
+#endif
+
+/*
+ Type 'retainerSet' defines an abstract datatype for sets of retainers.
+
+ Invariants:
+ A retainer set stores its elements in increasing order (in element[] array).
+ */
+
+typedef struct _RetainerSet {
+ nat num; // number of elements
+ StgWord hashKey; // hash key for this retainer set
+ struct _RetainerSet *link; // link to the next retainer set in the bucket
+ int id; // unique id of this retainer set (used when printing)
+ // Its absolute value is interpreted as its true id; if id is
+ // negative, it indicates that this retainer set has had a positive
+ // cost after some retainer profiling.
+ retainer element[0]; // elements of this retainer set
+ // do not put anything below here!
+} RetainerSet;
+
+/*
+ Note:
+ There are two ways of maintaining all retainer sets. The first is simply by
+ freeing all the retainer sets and re-initialize the hash table at each
+ retainer profiling. The second is by setting the cost field of each
+ retainer set. The second is preferred to the first if most retainer sets
+ are likely to be observed again during the next retainer profiling. Note
+ that in the first approach, we do not free the memory allocated for
+ retainer sets; we just invalidate all retainer sets.
+ */
+#ifdef DEBUG_RETAINER
+// In this case, FIRST_APPROACH must be turned on because the memory pool
+// for retainer sets is freed each time.
+#define FIRST_APPROACH
+#else
+// #define FIRST_APPROACH
+#define SECOND_APPROACH
+#endif
+
+// Creates the first pool and initializes a hash table. Frees all pools if any.
+void initializeAllRetainerSet(void);
+
+// Refreshes all pools for reuse and initializes a hash table.
+void refreshAllRetainerSet(void);
+
+// Frees all pools.
+void closeAllRetainerSet(void);
+
+// Finds or creates if needed a singleton retainer set.
+RetainerSet *singleton(retainer r);
+
+extern RetainerSet rs_MANY;
+
+// Checks if a given retainer is a member of the retainer set.
+//
+// Note & (maybe) Todo:
+// This function needs to be declared as an inline function, so it is declared
+// as an inline static function here.
+// This makes the interface really bad, but isMember() returns a value, so
+// it is not easy either to write it as a macro (due to my lack of C
+// programming experience). Sungwoo
+//
+// rtsBool isMember(retainer, retainerSet *);
+/*
+ Returns rtsTrue if r is a member of *rs.
+ Invariants:
+ rs is not NULL.
+ Note:
+ The efficiency of this function is subject to the typical size of
+ retainer sets. If it is small, linear scan is better. If it
+ is large in most cases, binary scan is better.
+ The current implementation mixes the two search strategies.
+ */
+
+// Sets smaller than this are scanned linearly; larger ones use
+// binary search.
+#define BINARY_SEARCH_THRESHOLD 8
+INLINE_HEADER rtsBool
+isMember(retainer r, RetainerSet *rs)
+{
+ int i, left, right; // must be int, not nat (because -1 can appear)
+ retainer ri;
+
+ // rs_MANY is treated as containing every retainer.
+ if (rs == &rs_MANY) { return rtsTrue; }
+
+ if (rs->num < BINARY_SEARCH_THRESHOLD) {
+ // small set: linear scan; element[] is sorted in increasing
+ // order, so we can stop early once we pass where r would be
+ for (i = 0; i < (int)rs->num; i++) {
+ ri = rs->element[i];
+ if (r == ri) return rtsTrue;
+ else if (r < ri) return rtsFalse;
+ }
+ } else {
+ // large set: classic binary search over the sorted array
+ left = 0;
+ right = rs->num - 1;
+ while (left <= right) {
+ i = (left + right) / 2;
+ ri = rs->element[i];
+ if (r == ri) return rtsTrue;
+ else if (r < ri) right = i - 1;
+ else left = i + 1;
+ }
+ }
+ return rtsFalse;
+}
+
+// Finds or creates a retainer set augmented with a new retainer.
+RetainerSet *addElement(retainer, RetainerSet *);
+
+// Call f() for each retainer set.
+void traverseAllRetainerSet(void (*f)(RetainerSet *));
+
+#ifdef SECOND_APPROACH
+// Prints a single retainer set.
+void printRetainerSetShort(FILE *, RetainerSet *);
+#endif
+
+// Print the statistics on all the retainer sets.
+// store the sum of all costs and the number of all retainer sets.
+void outputRetainerSet(FILE *, nat *, nat *);
+
+#ifdef SECOND_APPROACH
+// Print all retainer sets at the exit of the program.
+void outputAllRetainerSet(FILE *);
+#endif
+
+// Hashing functions
+/*
+ Invariants:
+ Once either initializeAllRetainerSet() or refreshAllRetainerSet()
+ is called, there exists only one copy of any retainer set created
+ through singleton() and addElement(). The pool (the storage for
+ retainer sets) is consumed linearly. All the retainer sets of the
+ same hash function value are linked together from an element in
+ hashTable[]. See the invariants of allocateInPool() for the
+ maximum size of retainer sets. The hashing function is defined by
+ hashKeySingleton() and hashKeyAddElement(). The hash key for a set
+ must be unique regardless of the order its elements are inserted,
+ i.e., the hashing function must be additive(?).
+*/
+#define hashKeySingleton(r) ((StgWord)(r))
+#define hashKeyAddElement(r, s) (hashKeySingleton((r)) + (s)->hashKey)
+
+// Prints the full information on a given retainer.
+// Note: This function is not part of retainerSet interface, but this is
+// the best place to define it.
+void printRetainer(FILE *, retainer);
+
+#endif /* PROFILING */
+#endif /* RETAINERSET_H */
diff --git a/rts/RtsAPI.c b/rts/RtsAPI.c
new file mode 100644
index 0000000000..b1b1d9c52d
--- /dev/null
+++ b/rts/RtsAPI.c
@@ -0,0 +1,597 @@
+/* ----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2001
+ *
+ * API for invoking Haskell functions via the RTS
+ *
+ * --------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "OSThreads.h"
+#include "Storage.h"
+#include "RtsAPI.h"
+#include "SchedAPI.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Prelude.h"
+#include "Schedule.h"
+#include "Capability.h"
+
+#include <stdlib.h>
+
+/* ----------------------------------------------------------------------------
+ Building Haskell objects from C datatypes.
+ ------------------------------------------------------------------------- */
+/* Box an HsChar as a Char# constructor closure (Czh). Heap space
+ * comes from allocateLocal on the given Capability and is attributed
+ * to CCS_SYSTEM when profiling. */
+HaskellObj
+rts_mkChar (Capability *cap, HsChar c)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap, CONSTR_sizeW(0,1));
+ SET_HDR(p, Czh_con_info, CCS_SYSTEM);
+ // double cast: truncate to StgChar first, then widen to a full word
+ p->payload[0] = (StgClosure *)(StgWord)(StgChar)c;
+ return p;
+}
+
+HaskellObj
+rts_mkInt (Capability *cap, HsInt i)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, Izh_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)(StgInt)i;
+ return p;
+}
+
+HaskellObj
+rts_mkInt8 (Capability *cap, HsInt8 i)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, I8zh_con_info, CCS_SYSTEM);
+ /* Make sure we mask out the bits above the lowest 8 */
+ p->payload[0] = (StgClosure *)(StgInt)((unsigned)i & 0xff);
+ return p;
+}
+
+HaskellObj
+rts_mkInt16 (Capability *cap, HsInt16 i)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, I16zh_con_info, CCS_SYSTEM);
+ /* Make sure we mask out the relevant bits */
+ p->payload[0] = (StgClosure *)(StgInt)((unsigned)i & 0xffff);
+ return p;
+}
+
+HaskellObj
+rts_mkInt32 (Capability *cap, HsInt32 i)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, I32zh_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)(StgInt)((unsigned)i & 0xffffffff);
+ return p;
+}
+
+/* Box an HsInt64 as an Int64# constructor closure (I64zh). The
+ * payload is two words (CONSTR_sizeW(0,2)) so the value fits on
+ * 32-bit platforms; it is written through an llong* into the payload.
+ * NOTE(review): assumes the payload is adequately aligned for a
+ * 64-bit store -- confirm on all supported platforms. */
+HaskellObj
+rts_mkInt64 (Capability *cap, HsInt64 i)
+{
+ llong *tmp;
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,2));
+ SET_HDR(p, I64zh_con_info, CCS_SYSTEM);
+ tmp = (llong*)&(p->payload[0]);
+ *tmp = (StgInt64)i;
+ return p;
+}
+
+HaskellObj
+rts_mkWord (Capability *cap, HsWord i)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, Wzh_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)(StgWord)i;
+ return p;
+}
+
+HaskellObj
+rts_mkWord8 (Capability *cap, HsWord8 w)
+{
+ /* see rts_mkInt* comments */
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, W8zh_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)(StgWord)(w & 0xff);
+ return p;
+}
+
+HaskellObj
+rts_mkWord16 (Capability *cap, HsWord16 w)
+{
+ /* see rts_mkInt* comments */
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, W16zh_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)(StgWord)(w & 0xffff);
+ return p;
+}
+
+HaskellObj
+rts_mkWord32 (Capability *cap, HsWord32 w)
+{
+ /* see rts_mkInt* comments */
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, W32zh_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)(StgWord)(w & 0xffffffff);
+ return p;
+}
+
+/* Box an HsWord64 as a Word64# constructor closure (W64zh); two
+ * payload words, written through a ullong* (cf. rts_mkInt64). */
+HaskellObj
+rts_mkWord64 (Capability *cap, HsWord64 w)
+{
+ ullong *tmp;
+
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,2));
+ /* see mk_Int8 comment */
+ SET_HDR(p, W64zh_con_info, CCS_SYSTEM);
+ tmp = (ullong*)&(p->payload[0]);
+ *tmp = (StgWord64)w;
+ return p;
+}
+
+HaskellObj
+rts_mkFloat (Capability *cap, HsFloat f)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, Fzh_con_info, CCS_SYSTEM);
+ ASSIGN_FLT((P_)p->payload, (StgFloat)f);
+ return p;
+}
+
+/* Box an HsDouble as a Double# constructor closure (Dzh). Unlike
+ * the single-word constructors the payload is sizeofW(StgDouble)
+ * words; ASSIGN_DBL stores the value into the payload. */
+HaskellObj
+rts_mkDouble (Capability *cap, HsDouble d)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,sizeofW(StgDouble)));
+ SET_HDR(p, Dzh_con_info, CCS_SYSTEM);
+ ASSIGN_DBL((P_)p->payload, (StgDouble)d);
+ return p;
+}
+
+HaskellObj
+rts_mkStablePtr (Capability *cap, HsStablePtr s)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1);
+ SET_HDR(p, StablePtr_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)s;
+ return p;
+}
+
+HaskellObj
+rts_mkPtr (Capability *cap, HsPtr a)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1);
+ SET_HDR(p, Ptr_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)a;
+ return p;
+}
+
+HaskellObj
+rts_mkFunPtr (Capability *cap, HsFunPtr a)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1);
+ SET_HDR(p, FunPtr_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)a;
+ return p;
+}
+
+/* Map a C truth value onto the statically-allocated True/False
+ * closures; no heap allocation, hence the Capability is unused. */
+HaskellObj
+rts_mkBool (Capability *cap STG_UNUSED, HsBool b)
+{
+ if (b) {
+ return (StgClosure *)True_closure;
+ } else {
+ return (StgClosure *)False_closure;
+ }
+}
+
+/* Build the thunk (unpackCString# s) for a NUL-terminated C string.
+ * NOTE(review): lazy -- s is captured as a Ptr, not copied, so the C
+ * buffer must remain valid until the resulting String is forced. */
+HaskellObj
+rts_mkString (Capability *cap, char *s)
+{
+ return rts_apply(cap, (StgClosure *)unpackCString_closure, rts_mkPtr(cap,s));
+}
+
+/* Build the updatable application thunk (f arg), using the generic
+ * two-argument apply entry code stg_ap_2_upd. */
+HaskellObj
+rts_apply (Capability *cap, HaskellObj f, HaskellObj arg)
+{
+ StgThunk *ap;
+
+ // header + 2 payload words: the function and its argument
+ ap = (StgThunk *)allocateLocal(cap,sizeofW(StgThunk) + 2);
+ SET_HDR(ap, (StgInfoTable *)&stg_ap_2_upd_info, CCS_SYSTEM);
+ ap->payload[0] = f;
+ ap->payload[1] = arg;
+ return (StgClosure *)ap;
+}
+
+/* ----------------------------------------------------------------------------
+ Deconstructing Haskell objects
+
+ We would like to assert that we have the right kind of object in
+ each case, but this is problematic because in GHCi the info table
+ for the D# constructor (say) might be dynamically loaded. Hence we
+ omit these assertions for now.
+ ------------------------------------------------------------------------- */
+
+HsChar
+rts_getChar (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == Czh_con_info ||
+ // p->header.info == Czh_static_info);
+ return (StgChar)(StgWord)(p->payload[0]);
+}
+
+HsInt
+rts_getInt (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == Izh_con_info ||
+ // p->header.info == Izh_static_info);
+ return (HsInt)(p->payload[0]);
+}
+
+HsInt8
+rts_getInt8 (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == I8zh_con_info ||
+ // p->header.info == I8zh_static_info);
+ return (HsInt8)(HsInt)(p->payload[0]);
+}
+
+HsInt16
+rts_getInt16 (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == I16zh_con_info ||
+ // p->header.info == I16zh_static_info);
+ return (HsInt16)(HsInt)(p->payload[0]);
+}
+
+HsInt32
+rts_getInt32 (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == I32zh_con_info ||
+ // p->header.info == I32zh_static_info);
+ return (HsInt32)(HsInt)(p->payload[0]);
+}
+
+HsInt64
+rts_getInt64 (HaskellObj p)
+{
+ HsInt64* tmp;
+ // See comment above:
+ // ASSERT(p->header.info == I64zh_con_info ||
+ // p->header.info == I64zh_static_info);
+ tmp = (HsInt64*)&(p->payload[0]);
+ return *tmp;
+}
+HsWord
+rts_getWord (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == Wzh_con_info ||
+ // p->header.info == Wzh_static_info);
+ return (HsWord)(p->payload[0]);
+}
+
+HsWord8
+rts_getWord8 (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == W8zh_con_info ||
+ // p->header.info == W8zh_static_info);
+ return (HsWord8)(HsWord)(p->payload[0]);
+}
+
+HsWord16
+rts_getWord16 (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == W16zh_con_info ||
+ // p->header.info == W16zh_static_info);
+ return (HsWord16)(HsWord)(p->payload[0]);
+}
+
+HsWord32
+rts_getWord32 (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == W32zh_con_info ||
+ // p->header.info == W32zh_static_info);
+ return (HsWord32)(HsWord)(p->payload[0]);
+}
+
+
+HsWord64
+rts_getWord64 (HaskellObj p)
+{
+ HsWord64* tmp;
+ // See comment above:
+ // ASSERT(p->header.info == W64zh_con_info ||
+ // p->header.info == W64zh_static_info);
+ tmp = (HsWord64*)&(p->payload[0]);
+ return *tmp;
+}
+
+HsFloat
+rts_getFloat (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == Fzh_con_info ||
+ // p->header.info == Fzh_static_info);
+ return (float)(PK_FLT((P_)p->payload));
+}
+
+HsDouble
+rts_getDouble (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == Dzh_con_info ||
+ // p->header.info == Dzh_static_info);
+ return (double)(PK_DBL((P_)p->payload));
+}
+
+HsStablePtr
+rts_getStablePtr (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == StablePtr_con_info ||
+ // p->header.info == StablePtr_static_info);
+ return (StgStablePtr)(p->payload[0]);
+}
+
+/* Unbox a Ptr closure: the payload word is an arbitrary machine
+ * address, returned as a plain void* (HsPtr). The previous
+ * (Capability *) cast was a copy/paste slip -- harmless at runtime,
+ * but misleading. */
+HsPtr
+rts_getPtr (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == Ptr_con_info ||
+ // p->header.info == Ptr_static_info);
+ return (void *)(p->payload[0]);
+}
+
+HsFunPtr
+rts_getFunPtr (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == FunPtr_con_info ||
+ // p->header.info == FunPtr_static_info);
+ return (void *)(p->payload[0]);
+}
+
+/* Unbox a Bool closure. The constructor tag is read from the info
+ * table's srt_bitmap field (as noted inline); tag 0 yields 0, any
+ * other tag yields 1. */
+HsBool
+rts_getBool (HaskellObj p)
+{
+ StgInfoTable *info;
+
+ info = get_itbl((StgClosure *)p);
+ if (info->srt_bitmap == 0) { // srt_bitmap is the constructor tag
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Creating threads
+ -------------------------------------------------------------------------- */
+
+// Push one word onto tso's stack (the stack grows downwards).
+INLINE_HEADER void pushClosure (StgTSO *tso, StgWord c) {
+ tso->sp--;
+ tso->sp[0] = (W_) c;
+}
+
+/* Create a (not yet scheduled) thread that evaluates `closure` to
+ * WHNF: the stack is seeded with the closure and a stg_enter frame.
+ * Under GranSim the thread is created with no priority (NO_PRI). */
+StgTSO *
+createGenThread (Capability *cap, nat stack_size, StgClosure *closure)
+{
+ StgTSO *t;
+#if defined(GRAN)
+ t = createThread (cap, stack_size, NO_PRI);
+#else
+ t = createThread (cap, stack_size);
+#endif
+ pushClosure(t, (W_)closure);
+ pushClosure(t, (W_)&stg_enter_info);
+ return t;
+}
+
+/* Create a thread that runs the IO action `closure`: the seeded
+ * stack enters the closure, applies it via stg_ap_v, and returns
+ * through stg_noforceIO -- i.e. the action's result is NOT forced
+ * (contrast createStrictIOThread below). */
+StgTSO *
+createIOThread (Capability *cap, nat stack_size, StgClosure *closure)
+{
+ StgTSO *t;
+#if defined(GRAN)
+ t = createThread (cap, stack_size, NO_PRI);
+#else
+ t = createThread (cap, stack_size);
+#endif
+ pushClosure(t, (W_)&stg_noforceIO_info);
+ pushClosure(t, (W_)&stg_ap_v_info);
+ pushClosure(t, (W_)closure);
+ pushClosure(t, (W_)&stg_enter_info);
+ return t;
+}
+
+/*
+ * Same as above, but also evaluate the result of the IO action
+ * to whnf while we're at it.
+ */
+
+/* Like createIOThread, but the bottom frame is stg_forceIO instead of
+ * stg_noforceIO, so the IO action's result is evaluated to WHNF
+ * before the thread finishes. */
+StgTSO *
+createStrictIOThread(Capability *cap, nat stack_size, StgClosure *closure)
+{
+ StgTSO *t;
+#if defined(GRAN)
+ t = createThread(cap, stack_size, NO_PRI);
+#else
+ t = createThread(cap, stack_size);
+#endif
+ pushClosure(t, (W_)&stg_forceIO_info);
+ pushClosure(t, (W_)&stg_ap_v_info);
+ pushClosure(t, (W_)closure);
+ pushClosure(t, (W_)&stg_enter_info);
+ return t;
+}
+
+/* ----------------------------------------------------------------------------
+ Evaluating Haskell expressions
+ ------------------------------------------------------------------------- */
+
+/* Evaluate p to WHNF in a fresh thread with the default initial
+ * stack size (via scheduleWaitThread); the result is written through
+ * *ret. Returns the Capability now held, which may differ from the
+ * one passed in. */
+Capability *
+rts_eval (Capability *cap, HaskellObj p, /*out*/HaskellObj *ret)
+{
+ StgTSO *tso;
+
+ tso = createGenThread(cap, RtsFlags.GcFlags.initialStkSize, p);
+ return scheduleWaitThread(tso,ret,cap);
+}
+
+Capability *
+rts_eval_ (Capability *cap, HaskellObj p, unsigned int stack_size,
+ /*out*/HaskellObj *ret)
+{
+ StgTSO *tso;
+
+ tso = createGenThread(cap, stack_size, p);
+ return scheduleWaitThread(tso,ret,cap);
+}
+
+/*
+ * rts_evalIO() evaluates a value of the form (IO a), forcing the action's
+ * result to WHNF before returning.
+ */
+Capability *
+rts_evalIO (Capability *cap, HaskellObj p, /*out*/HaskellObj *ret)
+{
+ StgTSO* tso;
+
+ tso = createStrictIOThread(cap, RtsFlags.GcFlags.initialStkSize, p);
+ return scheduleWaitThread(tso,ret,cap);
+}
+
+/*
+ * rts_evalStableIO() is suitable for calling from Haskell. It
+ * evaluates a value of the form (StablePtr (IO a)), forcing the
+ * action's result to WHNF before returning. The result is returned
+ * in a StablePtr.
+ */
+/* See the comment above: evaluates the IO action behind stable
+ * pointer s and, on success, returns a stable pointer to the result
+ * through *ret. Returns the Capability now held. */
+Capability *
+rts_evalStableIO (Capability *cap, HsStablePtr s, /*out*/HsStablePtr *ret)
+{
+ StgTSO* tso;
+ StgClosure *p, *r;
+ SchedulerStatus stat;
+
+ p = (StgClosure *)deRefStablePtr(s);
+ tso = createStrictIOThread(cap, RtsFlags.GcFlags.initialStkSize, p);
+ cap = scheduleWaitThread(tso,&r,cap);
+ stat = rts_getSchedStatus(cap);
+
+ // only hand back a stable pointer on successful completion; on
+ // failure *ret is left untouched
+ if (stat == Success && ret != NULL) {
+ ASSERT(r != NULL);
+ *ret = getStablePtr((StgPtr)r);
+ }
+
+ return cap;
+}
+
+/*
+ * Like rts_evalIO(), but doesn't force the action's result.
+ */
+Capability *
+rts_evalLazyIO (Capability *cap, HaskellObj p, /*out*/HaskellObj *ret)
+{
+ StgTSO *tso;
+
+ tso = createIOThread(cap, RtsFlags.GcFlags.initialStkSize, p);
+ return scheduleWaitThread(tso,ret,cap);
+}
+
+Capability *
+rts_evalLazyIO_ (Capability *cap, HaskellObj p, unsigned int stack_size,
+ /*out*/HaskellObj *ret)
+{
+ StgTSO *tso;
+
+ tso = createIOThread(cap, stack_size, p);
+ return scheduleWaitThread(tso,ret,cap);
+}
+
+/* Convenience function for decoding the returned status. */
+
+/* Abort the program with a diagnostic unless the last evaluation on
+ * this Capability succeeded. `site` names the calling API for the
+ * error message. NOTE(review): the cases have no break because
+ * stg_exit() is taken to not return -- confirm. */
+void
+rts_checkSchedStatus (char* site, Capability *cap)
+{
+ SchedulerStatus rc = cap->running_task->stat;
+ switch (rc) {
+ case Success:
+ return;
+ case Killed:
+ errorBelch("%s: uncaught exception",site);
+ stg_exit(EXIT_FAILURE);
+ case Interrupted:
+ errorBelch("%s: interrupted", site);
+ stg_exit(EXIT_FAILURE);
+ default:
+ errorBelch("%s: Return code (%d) not ok",(site),(rc));
+ stg_exit(EXIT_FAILURE);
+ }
+}
+
+SchedulerStatus
+rts_getSchedStatus (Capability *cap)
+{
+ return cap->running_task->stat;
+}
+
+/* Enter the RTS from foreign code: allocate a bound Task (under
+ * sched_mutex) and block until a Capability is available to run on.
+ * Pair every call with rts_unlock(). */
+Capability *
+rts_lock (void)
+{
+ Capability *cap;
+ Task *task;
+
+ // ToDo: get rid of this lock in the common case. We could store
+ // a free Task in thread-local storage, for example. That would
+ // leave just one lock on the path into the RTS: cap->lock when
+ // acquiring the Capability.
+ ACQUIRE_LOCK(&sched_mutex);
+ task = newBoundTask();
+ RELEASE_LOCK(&sched_mutex);
+
+ cap = NULL;
+ waitForReturnCapability(&cap, task);
+ return (Capability *)cap; // cast is redundant (cap is already Capability *)
+}
+
+// Exiting the RTS: we hold a Capability that is not necessarily the
+// same one that was originally returned by rts_lock(), because
+// rts_evalIO() etc. may return a new one. Now that we have
+// investigated the return value, we can release the Capability,
+// and free the Task (in that order).
+
+/* Leave the RTS: release the Capability held by this call-in and
+ * retire its Task. See the ordering notes inline -- they matter. */
+void
+rts_unlock (Capability *cap)
+{
+ Task *task;
+
+ task = cap->running_task;
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+
+ // slightly delicate ordering of operations below, pay attention!
+
+ // We are no longer a bound task/thread. This is important,
+ // because the GC can run when we release the Capability below,
+ // and we don't want it to treat this as a live TSO pointer.
+ task->tso = NULL;
+
+ // Now release the Capability. With the capability released, GC
+ // may happen. NB. does not try to put the current Task on the
+ // worker queue.
+ releaseCapability(cap);
+
+ // Finally, we can release the Task to the free list.
+ boundTaskExiting(task);
+}
diff --git a/rts/RtsDllMain.c b/rts/RtsDllMain.c
new file mode 100644
index 0000000000..af3c5090de
--- /dev/null
+++ b/rts/RtsDllMain.c
@@ -0,0 +1,39 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1999-2000
+ *
+ * Entry point for RTS-in-a-DLL
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsAPI.h"
+
+#ifdef HAVE_WINDOWS_H
+#include <windows.h>
+#endif
+
+/* I'd be mildly surprised if this wasn't defined, but still. */
+#ifdef ENABLE_WIN32_DLL_SUPPORT
+
+/* DLL entry point for the RTS-in-a-DLL build. Only
+ * DLL_PROCESS_DETACH is acted on (by shutting the RTS down);
+ * startup is deliberately left to the embedding application, as the
+ * comment in the body explains. Always reports success. */
+BOOL
+WINAPI
+DllMain ( HINSTANCE hInstance
+ , DWORD reason
+ , LPVOID reserved
+ )
+{
+ /*
+ * Note: the DllMain() doesn't call startupHaskell() for you,
+ * that is the task of users of the RTS. The reason is
+ * that *you* want to be able to control the arguments
+ * you pass to the RTS.
+ */
+ switch (reason) {
+ case DLL_PROCESS_DETACH: shutdownHaskell();
+ }
+ return TRUE;
+}
+
+#endif /* ENABLE_WIN32_DLL_SUPPORT */
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
new file mode 100644
index 0000000000..0f83b3356c
--- /dev/null
+++ b/rts/RtsFlags.c
@@ -0,0 +1,2281 @@
+
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The AQUA Project, Glasgow University, 1994-1997
+ * (c) The GHC Team, 1998-1999
+ *
+ * Functions for parsing the argument list.
+ *
+ * ---------------------------------------------------------------------------*/
+
+//@menu
+//* Includes::
+//* Constants::
+//* Static function decls::
+//* Command-line option parsing routines::
+//* GranSim specific options::
+//* Aux fcts::
+//@end menu
+//*/
+
+//@node Includes, Constants
+//@subsection Includes
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "BlockAlloc.h"
+#include "Timer.h" /* CS_MIN_MILLISECS */
+#include "Profiling.h"
+
+#ifdef HAVE_CTYPE_H
+#include <ctype.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+// Flag Structure
+RTS_FLAGS RtsFlags;
+
+/*
+ * Split argument lists
+ */
+int prog_argc = 0; /* an "int" so as to match normal "argc" */
+char **prog_argv = NULL;
+char *prog_name = NULL; /* 'basename' of prog_argv[0] */
+int rts_argc = 0; /* ditto */
+char *rts_argv[MAX_RTS_ARGS];
+
+//@node Constants, Static function decls, Includes
+//@subsection Constants
+
+/*
+ * constants, used later
+ */
+#define RTS 1
+#define PGM 0
+
+#if defined(GRAN)
+
+static char *gran_debug_opts_strs[] = {
+ "DEBUG (-bDe, -bD1): event_trace; printing event trace.\n",
+ "DEBUG (-bDE, -bD2): event_stats; printing event statistics.\n",
+ "DEBUG (-bDb, -bD4): bq; check blocking queues\n",
+ "DEBUG (-bDG, -bD8): pack; routines for (un-)packing graph structures.\n",
+ "DEBUG (-bDq, -bD16): checkSparkQ; check consistency of the spark queues.\n",
+ "DEBUG (-bDf, -bD32): thunkStealing; print forwarding of fetches.\n",
+ "DEBUG (-bDr, -bD64): randomSteal; stealing sparks/threads from random PEs.\n",
+ "DEBUG (-bDF, -bD128): findWork; searching spark-pools (local & remote), thread queues for work.\n",
+ "DEBUG (-bDu, -bD256): unused; currently unused flag.\n",
+ "DEBUG (-bDS, -bD512): pri; priority sparking or scheduling.\n",
+ "DEBUG (-bD:, -bD1024): checkLight; check GranSim-Light setup.\n",
+ "DEBUG (-bDo, -bD2048): sortedQ; check whether spark/thread queues are sorted.\n",
+ "DEBUG (-bDz, -bD4096): blockOnFetch; check for blocked on fetch.\n",
+ "DEBUG (-bDP, -bD8192): packBuffer; routines handling pack buffer (GranSim internal!).\n",
+ "DEBUG (-bDt, -bD16384): blockOnFetch_sanity; check for TSO asleep on fetch.\n",
+};
+
+/* one character codes for the available debug options */
+static char gran_debug_opts_flags[] = {
+ 'e', 'E', 'b', 'G', 'q', 'f', 'r', 'F', 'u', 'S', ':', 'o', 'z', 'P', 't'
+};
+
+#elif defined(PAR)
+
+static char *par_debug_opts_strs[] = {
+ "DEBUG (-qDv, -qD1): verbose; be generally verbose with parallel related stuff.\n",
+ "DEBUG (-qDq, -qD2): bq; print blocking queues.\n",
+ "DEBUG (-qDs, -qD4): schedule; scheduling of parallel threads.\n",
+ "DEBUG (-qDe, -qD8): free; free messages.\n",
+ "DEBUG (-qDr, -qD16): resume; resume messages.\n",
+ "DEBUG (-qDw, -qD32): weight; print weights and distrib GC stuff.\n",
+ "DEBUG (-qDF, -qD64): fetch; fetch messages.\n",
+ // "DEBUG (-qDa, -qD128): ack; ack messages.\n",
+ "DEBUG (-qDf, -qD128): fish; fish messages.\n",
+ //"DEBUG (-qDo, -qD512): forward; forwarding messages to other PEs.\n",
+ "DEBUG (-qDl, -qD256): tables; print internal LAGA etc tables.\n",
+ "DEBUG (-qDo, -qD512): packet; packets and graph structures when packing.\n",
+ "DEBUG (-qDp, -qD1024): pack; packing and unpacking graphs.\n",
+ "DEBUG (-qDz, -qD2048): paranoia; ridiculously detailed output (excellent for filling a partition).\n"
+};
+
+/* one character codes for the available debug options */
+static char par_debug_opts_flags[] = {
+ 'v', 'q', 's', 'e', 'r', 'w', 'F', 'f', 'l', 'o', 'p', 'z'
+};
+
+#endif /* PAR */
+
+//@node Static function decls, Command-line option parsing routines, Constants
+//@subsection Static function decls
+
+/* -----------------------------------------------------------------------------
+ Static function decls
+ -------------------------------------------------------------------------- */
+
+static int /* return NULL on error */
+open_stats_file (
+ I_ arg,
+ int argc, char *argv[],
+ int rts_argc, char *rts_argv[],
+ const char *FILENAME_FMT,
+ FILE **file_ret);
+
+static I_ decode(const char *s);
+static void bad_option(const char *s);
+
+#if defined(GRAN)
+static void enable_GranSimLight(void);
+static void process_gran_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error);
+static void set_GranSim_debug_options(nat n);
+static void help_GranSim_debug_options(nat n);
+#elif defined(PAR)
+static void process_par_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error);
+static void set_par_debug_options(nat n);
+static void help_par_debug_options(nat n);
+#endif
+
+//@node Command-line option parsing routines, GranSim specific options, Static function decls
+//@subsection Command-line option parsing routines
+
+/* -----------------------------------------------------------------------------
+ * Command-line option parsing routines.
+ * ---------------------------------------------------------------------------*/
+
+void initRtsFlagsDefaults(void)
+{
+ RtsFlags.GcFlags.statsFile = NULL;
+ RtsFlags.GcFlags.giveStats = NO_GC_STATS;
+
+ RtsFlags.GcFlags.maxStkSize = (8 * 1024 * 1024) / sizeof(W_);
+ RtsFlags.GcFlags.initialStkSize = 1024 / sizeof(W_);
+
+ RtsFlags.GcFlags.minAllocAreaSize = (512 * 1024) / BLOCK_SIZE;
+ RtsFlags.GcFlags.minOldGenSize = (1024 * 1024) / BLOCK_SIZE;
+ RtsFlags.GcFlags.maxHeapSize = 0; /* off by default */
+ RtsFlags.GcFlags.heapSizeSuggestion = 0; /* none */
+ RtsFlags.GcFlags.pcFreeHeap = 3; /* 3% */
+ RtsFlags.GcFlags.oldGenFactor = 2;
+#if defined(PAR)
+ /* A hack currently needed for GUM -- HWL */
+ RtsFlags.GcFlags.generations = 1;
+ RtsFlags.GcFlags.steps = 2;
+ RtsFlags.GcFlags.squeezeUpdFrames = rtsFalse;
+#else
+ RtsFlags.GcFlags.generations = 2;
+ RtsFlags.GcFlags.steps = 2;
+ RtsFlags.GcFlags.squeezeUpdFrames = rtsTrue;
+#endif
+ RtsFlags.GcFlags.compact = rtsFalse;
+ RtsFlags.GcFlags.compactThreshold = 30.0;
+#ifdef RTS_GTK_FRONTPANEL
+ RtsFlags.GcFlags.frontpanel = rtsFalse;
+#endif
+ RtsFlags.GcFlags.idleGCDelayTicks = 300 / TICK_MILLISECS; /* ticks */
+
+#ifdef DEBUG
+ RtsFlags.DebugFlags.scheduler = rtsFalse;
+ RtsFlags.DebugFlags.interpreter = rtsFalse;
+ RtsFlags.DebugFlags.codegen = rtsFalse;
+ RtsFlags.DebugFlags.weak = rtsFalse;
+ RtsFlags.DebugFlags.gccafs = rtsFalse;
+ RtsFlags.DebugFlags.gc = rtsFalse;
+ RtsFlags.DebugFlags.block_alloc = rtsFalse;
+ RtsFlags.DebugFlags.sanity = rtsFalse;
+ RtsFlags.DebugFlags.stable = rtsFalse;
+ RtsFlags.DebugFlags.stm = rtsFalse;
+ RtsFlags.DebugFlags.prof = rtsFalse;
+ RtsFlags.DebugFlags.gran = rtsFalse;
+ RtsFlags.DebugFlags.par = rtsFalse;
+ RtsFlags.DebugFlags.linker = rtsFalse;
+ RtsFlags.DebugFlags.squeeze = rtsFalse;
+#endif
+
+#if defined(PROFILING) || defined(PAR)
+ RtsFlags.CcFlags.doCostCentres = 0;
+#endif /* PROFILING or PAR */
+
+#ifdef PROFILING
+ RtsFlags.ProfFlags.doHeapProfile = rtsFalse;
+ RtsFlags.ProfFlags.profileInterval = 100;
+ RtsFlags.ProfFlags.includeTSOs = rtsFalse;
+ RtsFlags.ProfFlags.showCCSOnException = rtsFalse;
+ RtsFlags.ProfFlags.maxRetainerSetSize = 8;
+ RtsFlags.ProfFlags.modSelector = NULL;
+ RtsFlags.ProfFlags.descrSelector = NULL;
+ RtsFlags.ProfFlags.typeSelector = NULL;
+ RtsFlags.ProfFlags.ccSelector = NULL;
+ RtsFlags.ProfFlags.ccsSelector = NULL;
+ RtsFlags.ProfFlags.retainerSelector = NULL;
+ RtsFlags.ProfFlags.bioSelector = NULL;
+
+#elif defined(DEBUG)
+ RtsFlags.ProfFlags.doHeapProfile = rtsFalse;
+#endif
+
+ RtsFlags.ConcFlags.ctxtSwitchTime = CS_MIN_MILLISECS; /* In milliseconds */
+
+#ifdef THREADED_RTS
+ RtsFlags.ParFlags.nNodes = 1;
+ RtsFlags.ParFlags.migrate = rtsTrue;
+ RtsFlags.ParFlags.wakeupMigrate = rtsFalse;
+#endif
+
+#ifdef PAR
+ RtsFlags.ParFlags.ParStats.Full = rtsFalse;
+ RtsFlags.ParFlags.ParStats.Suppressed = rtsFalse;
+ RtsFlags.ParFlags.ParStats.Binary = rtsFalse;
+ RtsFlags.ParFlags.ParStats.Sparks = rtsFalse;
+ RtsFlags.ParFlags.ParStats.Heap = rtsFalse;
+ RtsFlags.ParFlags.ParStats.NewLogfile = rtsFalse;
+ RtsFlags.ParFlags.ParStats.Global = rtsFalse;
+
+ RtsFlags.ParFlags.outputDisabled = rtsFalse;
+#ifdef DIST
+ RtsFlags.ParFlags.doFairScheduling = rtsTrue; /* fair sched by def */
+#else
+ RtsFlags.ParFlags.doFairScheduling = rtsFalse; /* unfair sched by def */
+#endif
+ RtsFlags.ParFlags.packBufferSize = 1024;
+ RtsFlags.ParFlags.thunksToPack = 1; /* 0 ... infinity; */
+ RtsFlags.ParFlags.globalising = 1; /* 0 ... everything */
+ RtsFlags.ParFlags.maxThreads = 1024;
+ RtsFlags.ParFlags.maxFishes = MAX_FISHES;
+ RtsFlags.ParFlags.fishDelay = FISH_DELAY;
+#endif
+
+#if defined(PAR) || defined(THREADED_RTS)
+ RtsFlags.ParFlags.maxLocalSparks = 4096;
+#endif /* PAR || THREADED_RTS */
+
+#if defined(GRAN)
+ /* ToDo: check defaults for GranSim and GUM */
+ RtsFlags.GcFlags.maxStkSize = (8 * 1024 * 1024) / sizeof(W_);
+ RtsFlags.GcFlags.initialStkSize = 1024 / sizeof(W_);
+
+ RtsFlags.GranFlags.maxThreads = 65536; // refers to mandatory threads
+ RtsFlags.GranFlags.GranSimStats.Full = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.Suppressed = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.Binary = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.Sparks = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.Heap = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.NewLogfile = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.Global = rtsFalse;
+
+ RtsFlags.GranFlags.packBufferSize = 1024;
+ RtsFlags.GranFlags.packBufferSize_internal = GRANSIM_DEFAULT_PACK_BUFFER_SIZE;
+
+ RtsFlags.GranFlags.proc = MAX_PROC;
+ RtsFlags.GranFlags.Fishing = rtsFalse;
+ RtsFlags.GranFlags.maxFishes = MAX_FISHES;
+ RtsFlags.GranFlags.time_slice = GRAN_TIME_SLICE;
+ RtsFlags.GranFlags.Light = rtsFalse;
+
+ RtsFlags.GranFlags.Costs.latency = LATENCY;
+ RtsFlags.GranFlags.Costs.additional_latency = ADDITIONAL_LATENCY;
+ RtsFlags.GranFlags.Costs.fetchtime = FETCHTIME;
+ RtsFlags.GranFlags.Costs.lunblocktime = LOCALUNBLOCKTIME;
+ RtsFlags.GranFlags.Costs.gunblocktime = GLOBALUNBLOCKTIME;
+ RtsFlags.GranFlags.Costs.mpacktime = MSGPACKTIME;
+ RtsFlags.GranFlags.Costs.munpacktime = MSGUNPACKTIME;
+ RtsFlags.GranFlags.Costs.mtidytime = MSGTIDYTIME;
+
+ RtsFlags.GranFlags.Costs.threadcreatetime = THREADCREATETIME;
+ RtsFlags.GranFlags.Costs.threadqueuetime = THREADQUEUETIME;
+ RtsFlags.GranFlags.Costs.threaddescheduletime = THREADDESCHEDULETIME;
+ RtsFlags.GranFlags.Costs.threadscheduletime = THREADSCHEDULETIME;
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = THREADCONTEXTSWITCHTIME;
+
+ RtsFlags.GranFlags.Costs.arith_cost = ARITH_COST;
+ RtsFlags.GranFlags.Costs.branch_cost = BRANCH_COST;
+ RtsFlags.GranFlags.Costs.load_cost = LOAD_COST;
+ RtsFlags.GranFlags.Costs.store_cost = STORE_COST;
+ RtsFlags.GranFlags.Costs.float_cost = FLOAT_COST;
+
+ RtsFlags.GranFlags.Costs.heapalloc_cost = HEAPALLOC_COST;
+
+ RtsFlags.GranFlags.Costs.pri_spark_overhead = PRI_SPARK_OVERHEAD;
+ RtsFlags.GranFlags.Costs.pri_sched_overhead = PRI_SCHED_OVERHEAD;
+
+ RtsFlags.GranFlags.DoFairSchedule = rtsFalse;
+ RtsFlags.GranFlags.DoAsyncFetch = rtsFalse;
+ RtsFlags.GranFlags.DoStealThreadsFirst = rtsFalse;
+ RtsFlags.GranFlags.DoAlwaysCreateThreads = rtsFalse;
+ RtsFlags.GranFlags.DoBulkFetching = rtsFalse;
+ RtsFlags.GranFlags.DoThreadMigration = rtsFalse;
+ RtsFlags.GranFlags.FetchStrategy = 2;
+ RtsFlags.GranFlags.PreferSparksOfLocalNodes = rtsFalse;
+ RtsFlags.GranFlags.DoPrioritySparking = rtsFalse;
+ RtsFlags.GranFlags.DoPriorityScheduling = rtsFalse;
+ RtsFlags.GranFlags.SparkPriority = 0;
+ RtsFlags.GranFlags.SparkPriority2 = 0;
+ RtsFlags.GranFlags.RandomPriorities = rtsFalse;
+ RtsFlags.GranFlags.InversePriorities = rtsFalse;
+ RtsFlags.GranFlags.IgnorePriorities = rtsFalse;
+ RtsFlags.GranFlags.ThunksToPack = 0;
+ RtsFlags.GranFlags.RandomSteal = rtsTrue;
+#endif
+
+#ifdef TICKY_TICKY
+ RtsFlags.TickyFlags.showTickyStats = rtsFalse;
+ RtsFlags.TickyFlags.tickyFile = NULL;
+#endif
+}
+
/*
 * usage_text: the +RTS help message, printed one entry per line when an
 * unknown/invalid RTS option (or -?) is seen.  NULL-terminated array of
 * lines; conditional entries match the features this RTS was built with.
 */
static const char *
usage_text[] = {
"",
"Usage: <prog> <args> [+RTS <rtsopts> | -RTS <args>] ... --RTS <args>",
"",
" +RTS Indicates run time system options follow",
" -RTS Indicates program arguments follow",
" --RTS Indicates that ALL subsequent arguments will be given to the",
" program (including any of these RTS flags)",
"",
"The following run time system options are available:",
"",
" -? Prints this message and exits; the program is not executed",
"",
" -K<size> Sets the maximum stack size (default 8M) Egs: -K32k -K512k",
" -k<size> Sets the initial thread stack size (default 1k) Egs: -k4k -k2m",
"",
" -A<size> Sets the minimum allocation area size (default 256k) Egs: -A1m -A10k",
" -M<size> Sets the maximum heap size (default unlimited) Egs: -M256k -M1G",
" -H<size> Sets the minimum heap size (default 0M) Egs: -H24m -H1G",
" -m<n> Minimum % of heap which must be available (default 3%)",
" -G<n> Number of generations (default: 2)",
" -T<n> Number of steps in younger generations (default: 2)",
" -c<n> Auto-enable compaction of the oldest generation when live data is",
" at least <n>% of the maximum heap size set with -M (default: 30%)",
" -c Enable compaction for all major collections",
#if defined(THREADED_RTS)
" -I<sec> Perform full GC after <sec> idle time (default: 0.3, 0 == off)",
#endif
"",
" -t<file> One-line GC statistics (default file: <program>.stat)",
" -s<file> Summary GC statistics (with -Sstderr going to stderr)",
" -S<file> Detailed GC statistics",
#ifdef RTS_GTK_FRONTPANEL
" -f Display front panel (requires X11 & GTK+)",
#endif
"",
"",
" -Z Don't squeeze out update frames on stack overflow",
" -B Sound the bell at the start of each garbage collection",
#if defined(PROFILING) || defined(PAR)
"",
" -px Time/allocation profile (XML) (output file <program>.prof)",
" -p Time/allocation profile (output file <program>.prof)",
" -P More detailed Time/Allocation profile",
" -Pa Give information about *all* cost centres",

# if defined(PROFILING)
"",
" -hx Heap residency profile (XML) (output file <program>.prof)",
" -h<break-down> Heap residency profile (hp2ps) (output file <program>.hp)",
" break-down: c = cost centre stack (default)",
" m = module",
" d = closure description",
" y = type description",
" r = retainer",
" b = biography (LAG,DRAG,VOID,USE)",
" A subset of closures may be selected thusly:",
" -hc<cc>,... specific cost centre(s) (top of stack only)",
" -hC<cc>,... specific cost centre(s) (anywhere in stack)",
/* typo fixed: "modules(s)" -> "module(s)" */
" -hm<mod>... all cost centres from the specified module(s)",
" -hd<des>,... closures with specified closure descriptions",
" -hy<typ>... closures with specified type descriptions",
" -hr<cc>... closures with specified retainers",
" -hb<bio>... closures with specified biographies (lag,drag,void,use)",
"",
" -R<size> Set the maximum retainer set size (default: 8)",
"",
" -i<sec> Time between heap samples (seconds, default: 0.1)",
"",
" -xt Include threads (TSOs) in a heap profile",
"",
" -xc Show current cost centre stack on raising an exception",
# endif
#endif /* PROFILING or PAR */
#if !defined(PROFILING) && defined(DEBUG)
"",
" -h<break-down> Debugging Heap residency profile",
" (output file <program>.hp)",
" break-down: L = closure label (default)",
" T = closure type (constructor, thunk etc.)",
#endif
"",
#if defined(TICKY_TICKY)
" -r<file> Produce reduction profiling statistics (with -rstderr for stderr)",
"",
#endif
#if defined(PAR)
" -N<n> Use <n> PVMish processors in parallel (default: 2)",
/* NB: the -N<n> is implemented by the driver!! */
#endif
" -C<secs> Context-switch interval in seconds",
" (0 or no argument means switch as often as possible)",
" the default is .02 sec; resolution is .02 sec",
"",
#if defined(DEBUG)
" -Ds DEBUG: scheduler",
" -Di DEBUG: interpreter",
" -Dc DEBUG: codegen",
" -Dw DEBUG: weak",
" -DG DEBUG: gccafs",
" -Dg DEBUG: gc",
" -Db DEBUG: block",
" -DS DEBUG: sanity",
" -Dt DEBUG: stable",
" -Dp DEBUG: prof",
" -Dr DEBUG: gran",
" -DP DEBUG: par",
" -Dl DEBUG: linker",
" -Dm DEBUG: stm",
/* typo fixed: "squezing" -> "squeezing" */
" -Dz DEBUG: stack squeezing",
"",
#endif /* DEBUG */
#if defined(THREADED_RTS)
" -N<n> Use <n> OS threads (default: 1)",
" -qm Don't automatically migrate threads between CPUs",
" -qw Migrate a thread to the current CPU when it is woken up",
#endif
/* The stale duplicate "-e<size>  Size of spark pools (default 100)" entry
 * was removed here: the -e handler sets ParFlags.maxLocalSparks (default
 * 4096), which is what the single entry below documents. */
#if defined(PAR)
" -t<num> Set maximum number of advisory threads per PE (default 32)",
" -qP Enable activity profile (output files in ~/<program>*.gr)",
" -qQ<size> Set pack-buffer size (default: 1024)",
" -qd Turn on PVM-ish debugging",
" -qO Disable output for performance measurement",
#endif
#if defined(THREADED_RTS) || defined(PAR)
" -e<n> Maximum number of outstanding local sparks (default: 4096)",
#endif
#if defined(PAR)
" -d Turn on PVM-ish debugging",
" -O Disable output for performance measurement",
#endif /* PAR */
#if defined(GRAN) /* ToDo: fill in decent Docu here */
" -b... All GranSim options start with -b; see GranSim User's Guide for details",
#endif
"",
"RTS options may also be specified using the GHCRTS environment variable.",
"",
"Other RTS options may be available for programs compiled a different way.",
"The GHC User's Guide has full details.",
"",
0
};
+
+STATIC_INLINE rtsBool
+strequal(const char *a, const char * b)
+{
+ return(strcmp(a, b) == 0);
+}
+
+static void
+splitRtsFlags(char *s, int *rts_argc, char *rts_argv[])
+{
+ char *c1, *c2;
+
+ c1 = s;
+ do {
+ while (isspace(*c1)) { c1++; };
+ c2 = c1;
+ while (!isspace(*c2) && *c2 != '\0') { c2++; };
+
+ if (c1 == c2) { break; }
+
+ if (*rts_argc < MAX_RTS_ARGS-1) {
+ s = stgMallocBytes(c2-c1+1, "RtsFlags.c:splitRtsFlags()");
+ strncpy(s, c1, c2-c1);
+ s[c2-c1] = '\0';
+ rts_argv[(*rts_argc)++] = s;
+ } else {
+ barf("too many RTS arguments (max %d)", MAX_RTS_ARGS-1);
+ }
+
+ c1 = c2;
+ } while (*c1 != '\0');
+}
+
/*
 * setupRtsFlags: split the command line into program arguments and RTS
 * arguments, then parse the RTS ones into the global RtsFlags.
 *
 * On entry argv/argc hold the full command line.  On exit argv/argc hold
 * only the program's own arguments (argv[0] is left in place), while
 * rts_argv/rts_argc hold the RTS arguments gathered -- in increasing order
 * of precedence -- from the ghc_rts_opts global, the GHCRTS environment
 * variable, and any +RTS ... -RTS command-line sections.  On any parse
 * error the usage text is printed and the process exits via stg_exit().
 */
void
setupRtsFlags(int *argc, char *argv[], int *rts_argc, char *rts_argv[])
{
    rtsBool error = rtsFalse;
    I_ mode;
    I_ arg, total_arg;

    setProgName (argv);
    total_arg = *argc;
    arg = 1;

    *argc = 1;
    *rts_argc = 0;

    // process arguments from the ghc_rts_opts global variable first.
    // (arguments from the GHCRTS environment variable and the command
    // line override these).
    {
        if (ghc_rts_opts != NULL) {
            splitRtsFlags(ghc_rts_opts, rts_argc, rts_argv);
        }
    }

    // process arguments from the GHCRTS environment variable next
    // (arguments from the command line override these).
    {
        char *ghc_rts = getenv("GHCRTS");

        if (ghc_rts != NULL) {
            splitRtsFlags(ghc_rts, rts_argc, rts_argv);
        }
    }

    // Split arguments (argv) into PGM (argv) and RTS (rts_argv) parts
    // argv[0] must be PGM argument -- leave in argv

    for (mode = PGM; arg < total_arg; arg++) {
        // The '--RTS' argument disables all future +RTS ... -RTS processing.
        if (strequal("--RTS", argv[arg])) {
            arg++;
            break;
        }
        // The '--' argument is passed through to the program, but
        // disables all further +RTS ... -RTS processing.
        else if (strequal("--", argv[arg])) {
            break;
        }
        else if (strequal("+RTS", argv[arg])) {
            mode = RTS;
        }
        else if (strequal("-RTS", argv[arg])) {
            mode = PGM;
        }
        else if (mode == RTS && *rts_argc < MAX_RTS_ARGS-1) {
            rts_argv[(*rts_argc)++] = argv[arg];
        }
        else if (mode == PGM) {
            argv[(*argc)++] = argv[arg];
        }
        else {
            barf("too many RTS arguments (max %d)", MAX_RTS_ARGS-1);
        }
    }
    // process remaining program arguments (everything after '--RTS')
    for (; arg < total_arg; arg++) {
        argv[(*argc)++] = argv[arg];
    }
    // both vectors are kept NULL-terminated, like a conventional argv
    argv[*argc] = (char *) 0;
    rts_argv[*rts_argc] = (char *) 0;

    // Process RTS (rts_argv) part: mainly to determine statsfile
    for (arg = 0; arg < *rts_argc; arg++) {
        if (rts_argv[arg][0] != '-') {
            fflush(stdout);
            errorBelch("unexpected RTS argument: %s", rts_argv[arg]);
            error = rtsTrue;

        } else {
            switch(rts_argv[arg][1]) {

              /* process: general args, then PROFILING-only ones,
                 then CONCURRENT-only, PARallel-only, GRAN-only,
                 TICKY-only (same order as defined in RtsFlags.lh);
                 within those groups, mostly in case-insensitive
                 alphabetical order.
                 Final group is x*, which allows for more options.
              */

/* Each *_BUILD_ONLY(x) macro expands to its argument when this RTS was
 * built with the corresponding feature, and to an error message plus
 * `error = rtsTrue;` otherwise.  Note the deliberately unbraced
 * expansion: the `) break;` at each use site closes the macro call. */
#ifdef TICKY_TICKY
# define TICKY_BUILD_ONLY(x) x
#else
# define TICKY_BUILD_ONLY(x) \
errorBelch("not built for: ticky-ticky stats"); \
error = rtsTrue;
#endif

#if defined(PROFILING)
# define COST_CENTRE_USING_BUILD_ONLY(x) x
#else
# define COST_CENTRE_USING_BUILD_ONLY(x) \
errorBelch("not built for: -prof or -parallel"); \
error = rtsTrue;
#endif

#ifdef PROFILING
# define PROFILING_BUILD_ONLY(x) x
#else
# define PROFILING_BUILD_ONLY(x) \
errorBelch("not built for: -prof"); \
error = rtsTrue;
#endif

#ifdef PAR
# define PAR_BUILD_ONLY(x) x
#else
# define PAR_BUILD_ONLY(x) \
errorBelch("not built for: -parallel"); \
error = rtsTrue;
#endif

#ifdef THREADED_RTS
# define THREADED_BUILD_ONLY(x) x
#else
# define THREADED_BUILD_ONLY(x) \
errorBelch("not built for: -smp"); \
error = rtsTrue;
#endif

#if defined(THREADED_RTS) || defined(PAR)
# define PAR_OR_THREADED_BUILD_ONLY(x) x
#else
# define PAR_OR_THREADED_BUILD_ONLY(x) \
errorBelch("not built for: -parallel or -smp"); \
error = rtsTrue;
#endif

#ifdef GRAN
# define GRAN_BUILD_ONLY(x) x
#else
# define GRAN_BUILD_ONLY(x) \
errorBelch("not built for: -gransim"); \
error = rtsTrue;
#endif

              /* =========== GENERAL ========================== */
              case '?':
                error = rtsTrue;
                break;

              case 'A':
                /* user gives bytes; minAllocAreaSize is in blocks */
                RtsFlags.GcFlags.minAllocAreaSize
                  = decode(rts_argv[arg]+2) / BLOCK_SIZE;
                if (RtsFlags.GcFlags.minAllocAreaSize <= 0) {
                    bad_option(rts_argv[arg]);
                }
                break;

              case 'B':
                RtsFlags.GcFlags.ringBell = rtsTrue;
                break;

              case 'c':
                if (rts_argv[arg][2] != '\0') {
                    /* -c<n>: compact when residency exceeds <n>% */
                    RtsFlags.GcFlags.compactThreshold =
                        atof(rts_argv[arg]+2);
                } else {
                    /* bare -c: always compact in major GCs */
                    RtsFlags.GcFlags.compact = rtsTrue;
                }
                break;

              case 'F':
                RtsFlags.GcFlags.oldGenFactor = atof(rts_argv[arg]+2);

                if (RtsFlags.GcFlags.oldGenFactor < 0)
                  bad_option( rts_argv[arg] );
                break;

#ifdef DEBUG
              case 'D':
                /* -D takes a string of single-letter debug classes,
                 * e.g. -DsG enables scheduler and gccafs tracing */
                {
                    char *c;

                    for (c = rts_argv[arg] + 2; *c != '\0'; c++) {
                        switch (*c) {
                        case 's':
                            RtsFlags.DebugFlags.scheduler = rtsTrue;
                            break;
                        case 'i':
                            RtsFlags.DebugFlags.interpreter = rtsTrue;
                            break;
                        case 'c':
                            RtsFlags.DebugFlags.codegen = rtsTrue;
                            break;
                        case 'w':
                            RtsFlags.DebugFlags.weak = rtsTrue;
                            break;
                        case 'G':
                            RtsFlags.DebugFlags.gccafs = rtsTrue;
                            break;
                        case 'g':
                            RtsFlags.DebugFlags.gc = rtsTrue;
                            break;
                        case 'b':
                            RtsFlags.DebugFlags.block_alloc = rtsTrue;
                            break;
                        case 'S':
                            RtsFlags.DebugFlags.sanity = rtsTrue;
                            break;
                        case 't':
                            RtsFlags.DebugFlags.stable = rtsTrue;
                            break;
                        case 'p':
                            RtsFlags.DebugFlags.prof = rtsTrue;
                            break;
                        case 'r':
                            RtsFlags.DebugFlags.gran = rtsTrue;
                            break;
                        case 'P':
                            RtsFlags.DebugFlags.par = rtsTrue;
                            break;
                        case 'l':
                            RtsFlags.DebugFlags.linker = rtsTrue;
                            break;
                        case 'a':
                            RtsFlags.DebugFlags.apply = rtsTrue;
                            break;
                        case 'm':
                            RtsFlags.DebugFlags.stm = rtsTrue;
                            break;
                        case 'z':
                            RtsFlags.DebugFlags.squeeze = rtsTrue;
                            break;
                        default:
                            bad_option( rts_argv[arg] );
                        }
                    }
                    break;
                }
#endif

              case 'K':
                RtsFlags.GcFlags.maxStkSize =
                  decode(rts_argv[arg]+2) / sizeof(W_);

                if (RtsFlags.GcFlags.maxStkSize == 0)
                  bad_option( rts_argv[arg] );
                break;

              case 'k':
                RtsFlags.GcFlags.initialStkSize =
                  decode(rts_argv[arg]+2) / sizeof(W_);

                if (RtsFlags.GcFlags.initialStkSize == 0)
                  bad_option( rts_argv[arg] );
                break;

              case 'M':
                RtsFlags.GcFlags.maxHeapSize =
                  decode(rts_argv[arg]+2) / BLOCK_SIZE;
                /* user give size in *bytes* but "maxHeapSize" is in *blocks* */

                if (RtsFlags.GcFlags.maxHeapSize <= 0) {
                  bad_option(rts_argv[arg]);
                }
                break;

              case 'm':
                RtsFlags.GcFlags.pcFreeHeap = atof(rts_argv[arg]+2);

                if (RtsFlags.GcFlags.pcFreeHeap < 0 ||
                    RtsFlags.GcFlags.pcFreeHeap > 100)
                  bad_option( rts_argv[arg] );
                break;

              case 'G':
                RtsFlags.GcFlags.generations = decode(rts_argv[arg]+2);
                if (RtsFlags.GcFlags.generations < 1) {
                  bad_option(rts_argv[arg]);
                }
                break;

              case 'T':
                RtsFlags.GcFlags.steps = decode(rts_argv[arg]+2);
                if (RtsFlags.GcFlags.steps < 1) {
                  bad_option(rts_argv[arg]);
                }
                break;

              case 'H':
                RtsFlags.GcFlags.heapSizeSuggestion =
                  decode(rts_argv[arg]+2) / BLOCK_SIZE;

                if (RtsFlags.GcFlags.heapSizeSuggestion <= 0) {
                  bad_option(rts_argv[arg]);
                }
                break;

#ifdef RTS_GTK_FRONTPANEL
              case 'f':
                RtsFlags.GcFlags.frontpanel = rtsTrue;
                break;
#endif

              case 'I': /* idle GC delay */
                if (rts_argv[arg][2] == '\0') {
                  /* use default */
                } else {
                    I_ cst; /* tmp */

                    /* Convert to ticks; a non-zero value below one tick
                     * is rounded up to a single tick */
                    cst = (I_) ((atof(rts_argv[arg]+2) * 1000));
                    if (cst > 0 && cst < TICK_MILLISECS) {
                        cst = TICK_MILLISECS;
                    } else {
                        cst = cst / TICK_MILLISECS;
                    }
                    RtsFlags.GcFlags.idleGCDelayTicks = cst;
                }
                break;

              case 'S':
                  RtsFlags.GcFlags.giveStats = VERBOSE_GC_STATS;
                  goto stats;

              case 's':
                  RtsFlags.GcFlags.giveStats = SUMMARY_GC_STATS;
                  goto stats;

              case 't':
                  RtsFlags.GcFlags.giveStats = ONELINE_GC_STATS;
                  goto stats;

              /* shared tail for -S/-s/-t: pick the stats output file */
              stats:
#ifdef PAR
                /* Opening all those files would almost certainly fail... */
                // RtsFlags.ParFlags.ParStats.Full = rtsTrue;
                RtsFlags.GcFlags.statsFile = NULL; /* temporary; ToDo: rm */
#else
                {
                    int r;
                    r = open_stats_file(arg, *argc, argv,
                                        *rts_argc, rts_argv, STAT_FILENAME_FMT,
                                        &RtsFlags.GcFlags.statsFile);
                    if (r == -1) { error = rtsTrue; }
                }
#endif
                  break;

              case 'Z':
                RtsFlags.GcFlags.squeezeUpdFrames = rtsFalse;
                break;

              /* =========== PROFILING ========================== */

              case 'P': /* detailed cost centre profiling (time/alloc) */
              case 'p': /* cost centre profiling (time/alloc) */
                COST_CENTRE_USING_BUILD_ONLY(
                switch (rts_argv[arg][2]) {
                  case 'x':
                    RtsFlags.CcFlags.doCostCentres = COST_CENTRES_XML;
                    break;
                  case 'a':
                    RtsFlags.CcFlags.doCostCentres = COST_CENTRES_ALL;
                    break;
                  default:
                      if (rts_argv[arg][1] == 'P') {
                          RtsFlags.CcFlags.doCostCentres =
                              COST_CENTRES_VERBOSE;
                      } else {
                          RtsFlags.CcFlags.doCostCentres =
                              COST_CENTRES_SUMMARY;
                      }
                      break;
                }
                ) break;

              case 'R':
                  PROFILING_BUILD_ONLY(
                      /* NOTE(review): atof for an integer-valued field;
                       * presumably relies on implicit truncation */
                      RtsFlags.ProfFlags.maxRetainerSetSize = atof(rts_argv[arg]+2);
                  ) break;

              case 'h': /* serial heap profile */
#if !defined(PROFILING) && defined(DEBUG)
                switch (rts_argv[arg][2]) {
                  case '\0':
                  case 'L':
                    RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_INFOPTR;
                    break;
                  case 'T':
                    RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_CLOSURE_TYPE;
                    break;
                  default:
                    errorBelch("invalid heap profile option: %s",rts_argv[arg]);
                    error = rtsTrue;
                }
#else
                PROFILING_BUILD_ONLY(
                switch (rts_argv[arg][2]) {
                case '\0':
                case 'C':
                case 'c':
                case 'M':
                case 'm':
                case 'D':
                case 'd':
                case 'Y':
                case 'y':
                case 'R':
                case 'r':
                case 'B':
                case 'b':
                    /* e.g. -hc{foo,bar}: a selector string follows the
                     * break-down letter; curly braces are optional */
                    if (rts_argv[arg][2] != '\0' && rts_argv[arg][3] != '\0') {
                        {
                            char *left  = strchr(rts_argv[arg], '{');
                            char *right = strrchr(rts_argv[arg], '}');

                            // curly braces are optional, for
                            // backwards compat.
                            if (left)
                                left = left+1;
                            else
                                left = rts_argv[arg] + 3;

                            if (!right)
                                right = rts_argv[arg] + strlen(rts_argv[arg]);

                            /* truncates the argv string in place so the
                             * selector can be kept by reference */
                            *right = '\0';

                            switch (rts_argv[arg][2]) {
                            case 'c': // cost centre label select
                                RtsFlags.ProfFlags.ccSelector = left;
                                break;
                            case 'C':
                                RtsFlags.ProfFlags.ccsSelector = left;
                                break;
                            case 'M':
                            case 'm': // cost centre module select
                                RtsFlags.ProfFlags.modSelector = left;
                                break;
                            case 'D':
                            case 'd': // closure descr select
                                RtsFlags.ProfFlags.descrSelector = left;
                                break;
                            case 'Y':
                            case 'y': // closure type select
                                RtsFlags.ProfFlags.typeSelector = left;
                                break;
                            case 'R':
                            case 'r': // retainer select
                                RtsFlags.ProfFlags.retainerSelector = left;
                                break;
                            case 'B':
                            case 'b': // biography select
                                RtsFlags.ProfFlags.bioSelector = left;
                                break;
                            }
                        }
                        break;
                    }

                    /* no selector: this option chooses the break-down;
                     * only one heap-profile kind may be requested */
                    if (RtsFlags.ProfFlags.doHeapProfile != 0) {
                        errorBelch("multiple heap profile options");
                        error = rtsTrue;
                        break;
                    }

                    switch (rts_argv[arg][2]) {
                    case '\0':
                    case 'C':
                    case 'c':
                        RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_CCS;
                        break;
                    case 'M':
                    case 'm':
                          RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_MOD;
                          break;
                    case 'D':
                    case 'd':
                          RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_DESCR;
                          break;
                    case 'Y':
                    case 'y':
                          RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_TYPE;
                          break;
                    case 'R':
                    case 'r':
                          RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_RETAINER;
                          break;
                    case 'B':
                    case 'b':
                          RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_LDV;
                          break;
                    }
                    break;

                default:
                    errorBelch("invalid heap profile option: %s",rts_argv[arg]);
                    error = rtsTrue;
                }
                )
#endif /* PROFILING */
                break;

#if defined(PROFILING)
              case 'i': /* heap sample interval */
                if (rts_argv[arg][2] == '\0') {
                  /* use default */
                } else {
                    I_ cst; /* tmp */

                    /* Convert to milliseconds, rounded down to a multiple
                     * of the minimum resolution (but never 0 -> minimum) */
                    cst = (I_) ((atof(rts_argv[arg]+2) * 1000));
                    cst = (cst / CS_MIN_MILLISECS) * CS_MIN_MILLISECS;
                    if (cst != 0 && cst < CS_MIN_MILLISECS)
                        cst = CS_MIN_MILLISECS;

                    RtsFlags.ProfFlags.profileInterval = cst;
                }
                break;
#endif

              /* =========== CONCURRENT ========================= */
              case 'C': /* context switch interval */
                if (rts_argv[arg][2] == '\0')
                    RtsFlags.ConcFlags.ctxtSwitchTime = 0;
                else {
                    I_ cst; /* tmp */

                    /* Convert to milliseconds, same rounding as -i above */
                    cst = (I_) ((atof(rts_argv[arg]+2) * 1000));
                    cst = (cst / CS_MIN_MILLISECS) * CS_MIN_MILLISECS;
                    if (cst != 0 && cst < CS_MIN_MILLISECS)
                        cst = CS_MIN_MILLISECS;

                    RtsFlags.ConcFlags.ctxtSwitchTime = cst;
                }
                break;

#ifdef THREADED_RTS
              case 'N':
                THREADED_BUILD_ONLY(
                if (rts_argv[arg][2] != '\0') {
                    RtsFlags.ParFlags.nNodes
                      = strtol(rts_argv[arg]+2, (char **) NULL, 10);
                    if (RtsFlags.ParFlags.nNodes <= 0) {
                      errorBelch("bad value for -N");
                      error = rtsTrue;
                    }
                }
                ) break;

              case 'q':
                    switch (rts_argv[arg][2]) {
                    case '\0':
                        errorBelch("incomplete RTS option: %s",rts_argv[arg]);
                        error = rtsTrue;
                        break;
                    case 'm':
                        RtsFlags.ParFlags.migrate = rtsFalse;
                        break;
                    case 'w':
                        RtsFlags.ParFlags.wakeupMigrate = rtsTrue;
                        break;
                    default:
                        errorBelch("unknown RTS option: %s",rts_argv[arg]);
                        error = rtsTrue;
                        break;
                    }
                    break;
#endif
              /* =========== PARALLEL =========================== */
              case 'e':
                PAR_OR_THREADED_BUILD_ONLY(
                if (rts_argv[arg][2] != '\0') {
                    RtsFlags.ParFlags.maxLocalSparks
                      = strtol(rts_argv[arg]+2, (char **) NULL, 10);
                    if (RtsFlags.ParFlags.maxLocalSparks <= 0) {
                      errorBelch("bad value for -e");
                      error = rtsTrue;
                    }
                }
                ) break;

#ifdef PAR
              case 'q':
                PAR_BUILD_ONLY(
                process_par_option(arg, rts_argc, rts_argv, &error);
                ) break;
#endif

              /* =========== GRAN =============================== */

              case 'b':
                GRAN_BUILD_ONLY(
                process_gran_option(arg, rts_argc, rts_argv, &error);
                ) break;

              /* =========== TICKY ============================== */

              case 'r': /* Basic profiling stats */
                TICKY_BUILD_ONLY(

                RtsFlags.TickyFlags.showTickyStats = rtsTrue;

                {
                    int r;
                    r = open_stats_file(arg, *argc, argv,
                                        *rts_argc, rts_argv, TICKY_FILENAME_FMT,
                                        &RtsFlags.TickyFlags.tickyFile);
                    if (r == -1) { error = rtsTrue; }
                }
                ) break;

              /* =========== EXTENDED OPTIONS =================== */

              case 'x': /* Extend the argument space */
                switch(rts_argv[arg][2]) {
                  case '\0':
                    errorBelch("incomplete RTS option: %s",rts_argv[arg]);
                    error = rtsTrue;
                    break;

                  case 'c': /* Debugging tool: show current cost centre on an exception */
                    PROFILING_BUILD_ONLY(
                        RtsFlags.ProfFlags.showCCSOnException = rtsTrue;
                        );
                    break;

                case 't':  /* Include memory used by TSOs in a heap profile */
                    PROFILING_BUILD_ONLY(
                        RtsFlags.ProfFlags.includeTSOs = rtsTrue;
                        );
                    break;

                  /* The option prefix '-xx' is reserved for future extension.  KSW 1999-11. */

                  default:
                    errorBelch("unknown RTS option: %s",rts_argv[arg]);
                    error = rtsTrue;
                    break;
                }
                break;  /* defensive programming */

              /* =========== OH DEAR ============================ */
              default:
                errorBelch("unknown RTS option: %s",rts_argv[arg]);
                error = rtsTrue;
                break;
            }
        }
    }
    /* any error: dump the full usage text and bail out */
    if (error) {
        const char **p;

        fflush(stdout);
        for (p = usage_text; *p; p++)
            errorBelch("%s", *p);
        stg_exit(EXIT_FAILURE);
    }
}
+
+#if defined(GRAN)
+
+//@node GranSim specific options, Aux fcts, Command-line option parsing routines
+//@subsection GranSim specific options
+
+static void
+enable_GranSimLight(void) {
+
+ debugBelch("GrAnSim Light enabled (infinite number of processors; 0 communication costs)\n");
+ RtsFlags.GranFlags.Light=rtsTrue;
+ RtsFlags.GranFlags.Costs.latency =
+ RtsFlags.GranFlags.Costs.fetchtime =
+ RtsFlags.GranFlags.Costs.additional_latency =
+ RtsFlags.GranFlags.Costs.gunblocktime =
+ RtsFlags.GranFlags.Costs.lunblocktime =
+ RtsFlags.GranFlags.Costs.threadcreatetime =
+ RtsFlags.GranFlags.Costs.threadqueuetime =
+ RtsFlags.GranFlags.Costs.threadscheduletime =
+ RtsFlags.GranFlags.Costs.threaddescheduletime =
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = 0;
+
+ RtsFlags.GranFlags.Costs.mpacktime =
+ RtsFlags.GranFlags.Costs.munpacktime = 0;
+
+ RtsFlags.GranFlags.DoFairSchedule = rtsTrue;
+ RtsFlags.GranFlags.DoAsyncFetch = rtsFalse;
+ RtsFlags.GranFlags.DoAlwaysCreateThreads = rtsTrue;
+ /* FetchStrategy is irrelevant in GrAnSim-Light */
+
+ /* GrAnSim Light often creates an abundance of parallel threads,
+ each with its own stack etc. Therefore, it's in general a good
+ idea to use small stack chunks (use the -o<size> option to
+ increase it again).
+ */
+ // RtsFlags.ConcFlags.stkChunkSize = 100;
+
+ RtsFlags.GranFlags.proc = 1;
+}
+
+static void
+process_gran_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error)
+{
+ if (rts_argv[arg][1] != 'b') /* All GranSim options start with -b */
+ return;
+
+ /* or a ridiculously idealised simulator */
+ if(strcmp((rts_argv[arg]+2),"oring")==0) {
+ RtsFlags.GranFlags.Costs.latency =
+ RtsFlags.GranFlags.Costs.fetchtime =
+ RtsFlags.GranFlags.Costs.additional_latency =
+ RtsFlags.GranFlags.Costs.gunblocktime =
+ RtsFlags.GranFlags.Costs.lunblocktime =
+ RtsFlags.GranFlags.Costs.threadcreatetime =
+ RtsFlags.GranFlags.Costs.threadqueuetime =
+ RtsFlags.GranFlags.Costs.threadscheduletime =
+ RtsFlags.GranFlags.Costs.threaddescheduletime =
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = 0;
+
+ RtsFlags.GranFlags.Costs.mpacktime =
+ RtsFlags.GranFlags.Costs.munpacktime = 0;
+
+ RtsFlags.GranFlags.Costs.arith_cost =
+ RtsFlags.GranFlags.Costs.float_cost =
+ RtsFlags.GranFlags.Costs.load_cost =
+ RtsFlags.GranFlags.Costs.store_cost =
+ RtsFlags.GranFlags.Costs.branch_cost = 0;
+
+ RtsFlags.GranFlags.Costs.heapalloc_cost = 1;
+
+ /* ++RtsFlags.GranFlags.DoFairSchedule; */
+ RtsFlags.GranFlags.DoStealThreadsFirst = rtsTrue; /* -bZ */
+ RtsFlags.GranFlags.DoThreadMigration = rtsTrue; /* -bM */
+ RtsFlags.GranFlags.GranSimStats.Full = rtsTrue; /* -bP */
+ return;
+ }
+
+ /* or a somewhat idealised simulator */
+ if(strcmp((rts_argv[arg]+2),"onzo")==0) {
+ RtsFlags.GranFlags.Costs.latency =
+ RtsFlags.GranFlags.Costs.fetchtime =
+ RtsFlags.GranFlags.Costs.additional_latency =
+ RtsFlags.GranFlags.Costs.gunblocktime =
+ RtsFlags.GranFlags.Costs.lunblocktime =
+ RtsFlags.GranFlags.Costs.threadcreatetime =
+ RtsFlags.GranFlags.Costs.threadqueuetime =
+ RtsFlags.GranFlags.Costs.threadscheduletime =
+ RtsFlags.GranFlags.Costs.threaddescheduletime =
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = 0;
+
+ RtsFlags.GranFlags.Costs.mpacktime =
+ RtsFlags.GranFlags.Costs.munpacktime = 0;
+
+ RtsFlags.GranFlags.Costs.heapalloc_cost = 1;
+
+ /* RtsFlags.GranFlags.DoFairSchedule = rtsTrue; */ /* -b-R */
+ /* RtsFlags.GranFlags.DoStealThreadsFirst = rtsTrue; */ /* -b-T */
+ RtsFlags.GranFlags.DoAsyncFetch = rtsTrue; /* -bZ */
+ RtsFlags.GranFlags.DoThreadMigration = rtsTrue; /* -bM */
+ RtsFlags.GranFlags.GranSimStats.Full = rtsTrue; /* -bP */
+# if defined(GRAN_CHECK) && defined(GRAN)
+ RtsFlags.GranFlags.Debug.event_stats = rtsTrue; /* print event statistics */
+# endif
+ return;
+ }
+
+ /* Communication and task creation cost parameters */
+ switch(rts_argv[arg][2]) {
+ case '.':
+ IgnoreYields = rtsTrue; // HWL HACK
+ break;
+
+ case ':':
+ enable_GranSimLight(); /* set flags for GrAnSim-Light mode */
+ break;
+
+ case 'l':
+ if (rts_argv[arg][3] != '\0')
+ {
+ RtsFlags.GranFlags.Costs.gunblocktime =
+ RtsFlags.GranFlags.Costs.latency = decode(rts_argv[arg]+3);
+ RtsFlags.GranFlags.Costs.fetchtime = 2*RtsFlags.GranFlags.Costs.latency;
+ }
+ else
+ RtsFlags.GranFlags.Costs.latency = LATENCY;
+ break;
+
+ case 'a':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.additional_latency = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.additional_latency = ADDITIONAL_LATENCY;
+ break;
+
+ case 'm':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.mpacktime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.mpacktime = MSGPACKTIME;
+ break;
+
+ case 'x':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.mtidytime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.mtidytime = 0;
+ break;
+
+ case 'r':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.munpacktime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.munpacktime = MSGUNPACKTIME;
+ break;
+
+ case 'g':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.fetchtime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.fetchtime = FETCHTIME;
+ break;
+
+ case 'n':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.gunblocktime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.gunblocktime = GLOBALUNBLOCKTIME;
+ break;
+
+ case 'u':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.lunblocktime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.lunblocktime = LOCALUNBLOCKTIME;
+ break;
+
+ /* Thread-related metrics */
+ case 't':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.threadcreatetime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.threadcreatetime = THREADCREATETIME;
+ break;
+
+ case 'q':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.threadqueuetime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.threadqueuetime = THREADQUEUETIME;
+ break;
+
+ case 'c':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.threadscheduletime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.threadscheduletime = THREADSCHEDULETIME;
+
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = RtsFlags.GranFlags.Costs.threadscheduletime
+ + RtsFlags.GranFlags.Costs.threaddescheduletime;
+ break;
+
+ case 'd':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.threaddescheduletime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.threaddescheduletime = THREADDESCHEDULETIME;
+
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = RtsFlags.GranFlags.Costs.threadscheduletime
+ + RtsFlags.GranFlags.Costs.threaddescheduletime;
+ break;
+
+ /* Instruction Cost Metrics */
+ case 'A':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.arith_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.arith_cost = ARITH_COST;
+ break;
+
+ case 'F':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.float_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.float_cost = FLOAT_COST;
+ break;
+
+ case 'B':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.branch_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.branch_cost = BRANCH_COST;
+ break;
+
+ case 'L':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.load_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.load_cost = LOAD_COST;
+ break;
+
+ case 'S':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.store_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.store_cost = STORE_COST;
+ break;
+
+ case 'H':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.heapalloc_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.heapalloc_cost = 0;
+ break;
+
+ case 'y':
+ RtsFlags.GranFlags.DoAsyncFetch = rtsTrue;
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.FetchStrategy = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.FetchStrategy = 2;
+ if (RtsFlags.GranFlags.FetchStrategy == 0)
+ RtsFlags.GranFlags.DoAsyncFetch = rtsFalse;
+ break;
+
+ case 'K': /* sort overhead (per elem in spark list) */
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.pri_spark_overhead = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.pri_spark_overhead = PRI_SPARK_OVERHEAD;
+ debugBelch("Overhead for pri spark: %d (per elem).\n",
+ RtsFlags.GranFlags.Costs.pri_spark_overhead);
+ break;
+
+ case 'O': /* sort overhead (per elem in spark list) */
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.pri_sched_overhead = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.pri_sched_overhead = PRI_SCHED_OVERHEAD;
+ debugBelch("Overhead for pri sched: %d (per elem).\n",
+ RtsFlags.GranFlags.Costs.pri_sched_overhead);
+ break;
+
+ /* General Parameters */
+ case 'p':
+ if (rts_argv[arg][3] != '\0')
+ {
+ RtsFlags.GranFlags.proc = decode(rts_argv[arg]+3);
+ if (RtsFlags.GranFlags.proc==0) {
+ enable_GranSimLight(); /* set flags for GrAnSim-Light mode */
+ } else if (RtsFlags.GranFlags.proc > MAX_PROC ||
+ RtsFlags.GranFlags.proc < 1)
+ {
+ debugBelch("setupRtsFlags: no more than %u processors allowed\n",
+ MAX_PROC);
+ *error = rtsTrue;
+ }
+ }
+ else
+ RtsFlags.GranFlags.proc = MAX_PROC;
+ break;
+
+ case 'f':
+ RtsFlags.GranFlags.Fishing = rtsTrue;
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.maxFishes = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.maxFishes = MAX_FISHES;
+ break;
+
+ case 'w':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.time_slice = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.time_slice = GRAN_TIME_SLICE;
+ break;
+
+ case 'C':
+ RtsFlags.GranFlags.DoAlwaysCreateThreads=rtsTrue;
+ RtsFlags.GranFlags.DoThreadMigration=rtsTrue;
+ break;
+
+ case 'G':
+ debugBelch("Bulk fetching enabled.\n");
+ RtsFlags.GranFlags.DoBulkFetching=rtsTrue;
+ break;
+
+ case 'M':
+ debugBelch("Thread migration enabled.\n");
+ RtsFlags.GranFlags.DoThreadMigration=rtsTrue;
+ break;
+
+ case 'R':
+ debugBelch("Fair Scheduling enabled.\n");
+ RtsFlags.GranFlags.DoFairSchedule=rtsTrue;
+ break;
+
+ case 'I':
+ debugBelch("Priority Scheduling enabled.\n");
+ RtsFlags.GranFlags.DoPriorityScheduling=rtsTrue;
+ break;
+
+ case 'T':
+ RtsFlags.GranFlags.DoStealThreadsFirst=rtsTrue;
+ RtsFlags.GranFlags.DoThreadMigration=rtsTrue;
+ break;
+
+ case 'Z':
+ RtsFlags.GranFlags.DoAsyncFetch=rtsTrue;
+ break;
+
+/* case 'z': */
+/* RtsFlags.GranFlags.SimplifiedFetch=rtsTrue; */
+/* break; */
+
+ case 'N':
+ RtsFlags.GranFlags.PreferSparksOfLocalNodes=rtsTrue;
+ break;
+
+ case 'b':
+ RtsFlags.GranFlags.GranSimStats.Binary=rtsTrue;
+ break;
+
+ case 'P':
+ /* format is -bP<c> where <c> is one char describing kind of profile */
+ RtsFlags.GranFlags.GranSimStats.Full = rtsTrue;
+ switch(rts_argv[arg][3]) {
+ case '\0': break; // nothing special, just an ordinary profile
+ case '0': RtsFlags.GranFlags.GranSimStats.Suppressed = rtsTrue;
+ break;
+ case 'b': RtsFlags.GranFlags.GranSimStats.Binary = rtsTrue;
+ break;
+ case 's': RtsFlags.GranFlags.GranSimStats.Sparks = rtsTrue;
+ break;
+ case 'h': RtsFlags.GranFlags.GranSimStats.Heap = rtsTrue;
+ break;
+ case 'n': RtsFlags.GranFlags.GranSimStats.NewLogfile = rtsTrue;
+ break;
+ case 'g': RtsFlags.GranFlags.GranSimStats.Global = rtsTrue;
+ break;
+ default: barf("Unknown option -bP%c", rts_argv[arg][3]);
+ }
+ break;
+
+ case 's':
+ RtsFlags.GranFlags.GranSimStats.Sparks=rtsTrue;
+ break;
+
+ case 'h':
+ RtsFlags.GranFlags.GranSimStats.Heap=rtsTrue;
+ break;
+
+ case 'Y': /* syntax: -bY<n>[,<n>] n ... pos int */
+ if (rts_argv[arg][3] != '\0') {
+ char *arg0, *tmp;
+
+ arg0 = rts_argv[arg]+3;
+ if ((tmp = strstr(arg0,","))==NULL) {
+ RtsFlags.GranFlags.SparkPriority = decode(arg0);
+ debugBelch("SparkPriority: %u.\n",RtsFlags.GranFlags.SparkPriority);
+ } else {
+ *(tmp++) = '\0';
+ RtsFlags.GranFlags.SparkPriority = decode(arg0);
+ RtsFlags.GranFlags.SparkPriority2 = decode(tmp);
+ debugBelch("SparkPriority: %u.\n",
+ RtsFlags.GranFlags.SparkPriority);
+ debugBelch("SparkPriority2:%u.\n",
+ RtsFlags.GranFlags.SparkPriority2);
+ if (RtsFlags.GranFlags.SparkPriority2 <
+ RtsFlags.GranFlags.SparkPriority) {
+ debugBelch("WARNING: 2nd pri < main pri (%u<%u); 2nd pri has no effect\n",
+ RtsFlags.GranFlags.SparkPriority2,
+ RtsFlags.GranFlags.SparkPriority);
+ }
+ }
+ } else {
+ /* plain pri spark is now invoked with -bX
+ RtsFlags.GranFlags.DoPrioritySparking = 1;
+ debugBelch("PrioritySparking.\n");
+ */
+ }
+ break;
+
+ case 'Q':
+ if (rts_argv[arg][3] != '\0') {
+ RtsFlags.GranFlags.ThunksToPack = decode(rts_argv[arg]+3);
+ } else {
+ RtsFlags.GranFlags.ThunksToPack = 1;
+ }
+ debugBelch("Thunks To Pack in one packet: %u.\n",
+ RtsFlags.GranFlags.ThunksToPack);
+ break;
+
+ case 'e':
+ RtsFlags.GranFlags.RandomSteal = rtsFalse;
+ debugBelch("Deterministic mode (no random stealing)\n");
+ break;
+
+ /* The following class of options contains eXperimental */
+ /* features in connection with exploiting granularity */
+ /* information. I.e. if -bY is chosen these options */
+ /* tell the RTS what to do with the supplied info --HWL */
+
+ case 'W':
+ if (rts_argv[arg][3] != '\0') {
+ RtsFlags.GranFlags.packBufferSize_internal = decode(rts_argv[arg]+3);
+ } else {
+ RtsFlags.GranFlags.packBufferSize_internal = GRANSIM_DEFAULT_PACK_BUFFER_SIZE;
+ }
+ debugBelch("Size of GranSim internal pack buffer: %u.\n",
+ RtsFlags.GranFlags.packBufferSize_internal);
+ break;
+
+ case 'X':
+ switch(rts_argv[arg][3]) {
+
+ case '\0':
+ RtsFlags.GranFlags.DoPrioritySparking = 1;
+ debugBelch("Priority Sparking with Normal Priorities.\n");
+ RtsFlags.GranFlags.InversePriorities = rtsFalse;
+ RtsFlags.GranFlags.RandomPriorities = rtsFalse;
+ RtsFlags.GranFlags.IgnorePriorities = rtsFalse;
+ break;
+
+ case 'I':
+ RtsFlags.GranFlags.DoPrioritySparking = 1;
+ debugBelch("Priority Sparking with Inverse Priorities.\n");
+ RtsFlags.GranFlags.InversePriorities++;
+ break;
+
+ case 'R':
+ RtsFlags.GranFlags.DoPrioritySparking = 1;
+ debugBelch("Priority Sparking with Random Priorities.\n");
+ RtsFlags.GranFlags.RandomPriorities++;
+ break;
+
+ case 'N':
+ RtsFlags.GranFlags.DoPrioritySparking = 1;
+ debugBelch("Priority Sparking with No Priorities.\n");
+ RtsFlags.GranFlags.IgnorePriorities++;
+ break;
+
+ default:
+ bad_option( rts_argv[arg] );
+ break;
+ }
+ break;
+
+ case '-':
+ switch(rts_argv[arg][3]) {
+
+ case 'C':
+ RtsFlags.GranFlags.DoAlwaysCreateThreads=rtsFalse;
+ RtsFlags.GranFlags.DoThreadMigration=rtsFalse;
+ break;
+
+ case 'G':
+ RtsFlags.GranFlags.DoBulkFetching=rtsFalse;
+ break;
+
+ case 'M':
+ RtsFlags.GranFlags.DoThreadMigration=rtsFalse;
+ break;
+
+ case 'R':
+ RtsFlags.GranFlags.DoFairSchedule=rtsFalse;
+ break;
+
+ case 'T':
+ RtsFlags.GranFlags.DoStealThreadsFirst=rtsFalse;
+ RtsFlags.GranFlags.DoThreadMigration=rtsFalse;
+ break;
+
+ case 'Z':
+ RtsFlags.GranFlags.DoAsyncFetch=rtsFalse;
+ break;
+
+ case 'N':
+ RtsFlags.GranFlags.PreferSparksOfLocalNodes=rtsFalse;
+ break;
+
+ case 'P':
+ RtsFlags.GranFlags.GranSimStats.Suppressed=rtsTrue;
+ break;
+
+ case 's':
+ RtsFlags.GranFlags.GranSimStats.Sparks=rtsFalse;
+ break;
+
+ case 'h':
+ RtsFlags.GranFlags.GranSimStats.Heap=rtsFalse;
+ break;
+
+ case 'b':
+ RtsFlags.GranFlags.GranSimStats.Binary=rtsFalse;
+ break;
+
+ case 'X':
+ RtsFlags.GranFlags.DoPrioritySparking = rtsFalse;
+ break;
+
+ case 'Y':
+ RtsFlags.GranFlags.DoPrioritySparking = rtsFalse;
+ RtsFlags.GranFlags.SparkPriority = rtsFalse;
+ break;
+
+ case 'I':
+ RtsFlags.GranFlags.DoPriorityScheduling = rtsFalse;
+ break;
+
+ case 'e':
+ RtsFlags.GranFlags.RandomSteal = rtsFalse;
+ break;
+
+ default:
+ bad_option( rts_argv[arg] );
+ break;
+ }
+ break;
+
+# if defined(GRAN_CHECK) && defined(GRAN)
+ case 'D':
+ switch(rts_argv[arg][3]) {
+ case 'Q': /* Set pack buffer size (same as 'Q' in GUM) */
+ if (rts_argv[arg][4] != '\0') {
+ RtsFlags.GranFlags.packBufferSize = decode(rts_argv[arg]+4);
+ debugBelch("Pack buffer size: %d\n",
+ RtsFlags.GranFlags.packBufferSize);
+ } else {
+ debugBelch("setupRtsFlags: missing size of PackBuffer (for -Q)\n");
+ *error = rtsTrue;
+ }
+ break;
+
+ default:
+ if (isdigit(rts_argv[arg][3])) {/* Set all debugging options in one */
+ /* hack warning: interpret the flags as a binary number */
+ nat n = decode(rts_argv[arg]+3);
+ set_GranSim_debug_options(n);
+ } else {
+ nat i;
+ for (i=0; i<=MAX_GRAN_DEBUG_OPTION; i++)
+ if (rts_argv[arg][3] == gran_debug_opts_flags[i])
+ break;
+
+ if (i==MAX_GRAN_DEBUG_OPTION+1) {
+ debugBelch("Valid GranSim debug options are:\n");
+ help_GranSim_debug_options(MAX_GRAN_DEBUG_MASK);
+ bad_option( rts_argv[arg] );
+ } else { // flag found; now set it
+ set_GranSim_debug_options(GRAN_DEBUG_MASK(i)); // 2^i
+ }
+ }
+ break;
+
+#if 0
+ case 'e': /* event trace; also -bD1 */
+ debugBelch("DEBUG: event_trace; printing event trace.\n");
+ RtsFlags.GranFlags.Debug.event_trace = rtsTrue;
+ /* RtsFlags.GranFlags.event_trace=rtsTrue; */
+ break;
+
+ case 'E': /* event statistics; also -bD2 */
+ debugBelch("DEBUG: event_stats; printing event statistics.\n");
+ RtsFlags.GranFlags.Debug.event_stats = rtsTrue;
+ /* RtsFlags.GranFlags.Debug |= 0x20; print event statistics */
+ break;
+
+ case 'f': /* thunkStealing; also -bD4 */
+ debugBelch("DEBUG: thunkStealing; printing forwarding of FETCHNODES.\n");
+ RtsFlags.GranFlags.Debug.thunkStealing = rtsTrue;
+ /* RtsFlags.GranFlags.Debug |= 0x2; print fwd messages */
+ break;
+
+ case 'z': /* blockOnFetch; also -bD8 */
+ debugBelch("DEBUG: blockOnFetch; check for blocked on fetch.\n");
+ RtsFlags.GranFlags.Debug.blockOnFetch = rtsTrue;
+ /* RtsFlags.GranFlags.Debug |= 0x4; debug non-reschedule-on-fetch */
+ break;
+
+ case 't': /* blockOnFetch_sanity; also -bD16 */
+ debugBelch("DEBUG: blockOnFetch_sanity; check for TSO asleep on fetch.\n");
+ RtsFlags.GranFlags.Debug.blockOnFetch_sanity = rtsTrue;
+ /* RtsFlags.GranFlags.Debug |= 0x10; debug TSO asleep for fetch */
+ break;
+
+ case 'S': /* priSpark; also -bD32 */
+ debugBelch("DEBUG: priSpark; priority sparking.\n");
+ RtsFlags.GranFlags.Debug.priSpark = rtsTrue;
+ break;
+
+ case 's': /* priSched; also -bD64 */
+ debugBelch("DEBUG: priSched; priority scheduling.\n");
+ RtsFlags.GranFlags.Debug.priSched = rtsTrue;
+ break;
+
+ case 'F': /* findWork; also -bD128 */
+ debugBelch("DEBUG: findWork; searching spark-pools (local & remote), thread queues for work.\n");
+ RtsFlags.GranFlags.Debug.findWork = rtsTrue;
+ break;
+
+ case 'g': /* globalBlock; also -bD256 */
+ debugBelch("DEBUG: globalBlock; blocking on remote closures (FETCHMEs etc in GUM).\n");
+ RtsFlags.GranFlags.Debug.globalBlock = rtsTrue;
+ break;
+
+ case 'G': /* pack; also -bD512 */
+ debugBelch("DEBUG: pack; routines for (un-)packing graph structures.\n");
+ RtsFlags.GranFlags.Debug.pack = rtsTrue;
+ break;
+
+ case 'P': /* packBuffer; also -bD1024 */
+ debugBelch("DEBUG: packBuffer; routines handling pack buffer (GranSim internal!).\n");
+ RtsFlags.GranFlags.Debug.packBuffer = rtsTrue;
+ break;
+
+ case 'o': /* sortedQ; also -bD2048 */
+ debugBelch("DEBUG: sortedQ; check whether spark/thread queues are sorted.\n");
+ RtsFlags.GranFlags.Debug.sortedQ = rtsTrue;
+ break;
+
+ case 'r': /* randomSteal; also -bD4096 */
+ debugBelch("DEBUG: randomSteal; stealing sparks/threads from random PEs.\n");
+ RtsFlags.GranFlags.Debug.randomSteal = rtsTrue;
+ break;
+
+ case 'q': /* checkSparkQ; also -bD8192 */
+ debugBelch("DEBUG: checkSparkQ; check consistency of the spark queues.\n");
+ RtsFlags.GranFlags.Debug.checkSparkQ = rtsTrue;
+ break;
+
+ case ':': /* checkLight; also -bD16384 */
+ debugBelch("DEBUG: checkLight; check GranSim-Light setup.\n");
+ RtsFlags.GranFlags.Debug.checkLight = rtsTrue;
+ break;
+
+ case 'b': /* bq; also -bD32768 */
+ debugBelch("DEBUG: bq; check blocking queues\n");
+ RtsFlags.GranFlags.Debug.bq = rtsTrue;
+ break;
+
+ case 'd': /* all options turned on */
+ debugBelch("DEBUG: all options turned on.\n");
+ set_GranSim_debug_options(MAX_GRAN_DEBUG_MASK);
+ /* RtsFlags.GranFlags.Debug |= 0x40; */
+ break;
+
+/* case '\0': */
+/* RtsFlags.GranFlags.Debug = 1; */
+/* break; */
+#endif
+
+ }
+ break;
+# endif /* GRAN_CHECK */
+ default:
+ bad_option( rts_argv[arg] );
+ break;
+ }
+}
+
+/*
+  Interpret n as a binary number masking GranSim debug options and set the
+  corresponding option. See gran_debug_opts_strs for explanations of the flags.
+*/
+/*
+  Enable each GranSim debug option whose bit is set in the bitmask n.
+  Bit i corresponds to gran_debug_opts_strs[i]; each enabled option's
+  explanation string is echoed as it is switched on.
+*/
+static void
+set_GranSim_debug_options(nat n) {
+  nat i;
+
+  for (i=0; i<=MAX_GRAN_DEBUG_OPTION; i++)
+    if ((n>>i)&1) {
+      /* use a literal format string: passing the table entry directly as
+         the format is a format-string hazard if it ever contains '%' */
+      errorBelch("%s", gran_debug_opts_strs[i]);
+      switch (i) {
+        case 0: RtsFlags.GranFlags.Debug.event_trace   = rtsTrue;  break;
+        case 1: RtsFlags.GranFlags.Debug.event_stats   = rtsTrue;  break;
+        case 2: RtsFlags.GranFlags.Debug.bq            = rtsTrue;  break;
+        case 3: RtsFlags.GranFlags.Debug.pack          = rtsTrue;  break;
+        case 4: RtsFlags.GranFlags.Debug.checkSparkQ   = rtsTrue;  break;
+        case 5: RtsFlags.GranFlags.Debug.thunkStealing = rtsTrue;  break;
+        case 6: RtsFlags.GranFlags.Debug.randomSteal   = rtsTrue;  break;
+        case 7: RtsFlags.GranFlags.Debug.findWork      = rtsTrue;  break;
+        case 8: RtsFlags.GranFlags.Debug.unused        = rtsTrue;  break;
+        case 9: RtsFlags.GranFlags.Debug.pri           = rtsTrue;  break;
+        case 10: RtsFlags.GranFlags.Debug.checkLight   = rtsTrue;  break;
+        case 11: RtsFlags.GranFlags.Debug.sortedQ      = rtsTrue;  break;
+        case 12: RtsFlags.GranFlags.Debug.blockOnFetch = rtsTrue;  break;
+        case 13: RtsFlags.GranFlags.Debug.packBuffer   = rtsTrue;  break;
+        case 14: RtsFlags.GranFlags.Debug.blockOnFetch_sanity = rtsTrue;  break;
+        /* BUG FIX: the %d previously had no matching argument (undefined
+           behaviour); supply the count, as set_par_debug_options does */
+        default: barf("set_GranSim_debug_options: only %d debug options expected",
+                      MAX_GRAN_DEBUG_OPTION);
+      } /* switch */
+    } /* if */
+}
+
+/*
+ Print one line explanation for each of the GranSim debug options specified
+ in the bitmask n.
+*/
+/*
+  Print a one-line explanation for each GranSim debug option selected in
+  the bitmask n (bit i -> gran_debug_opts_strs[i]).
+*/
+static void
+help_GranSim_debug_options(nat n) {
+  nat i;
+
+  for (i=0; i<=MAX_GRAN_DEBUG_OPTION; i++)
+    if ((n>>i)&1)
+      /* literal format string: never pass table data as the format */
+      debugBelch("%s", gran_debug_opts_strs[i]);
+}
+
+# elif defined(PAR)
+
+/*
+  Parse one GUM RTS option (all of which start with -q) from rts_argv[arg],
+  setting the corresponding fields of RtsFlags.ParFlags.  On a malformed
+  option, *error is set to rtsTrue.  rts_argc is only used for diagnostics.
+*/
+static void
+process_par_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error)
+{
+
+  if (rts_argv[arg][1] != 'q') { /* All GUM options start with -q */
+    errorBelch("Warning: GUM option does not start with -q: %s", rts_argv[arg]);
+    return;
+  }
+
+  /* Communication and task creation cost parameters */
+  switch(rts_argv[arg][2]) {
+  case 'e':  /* -qe<n>  ... allow <n> local sparks */
+    if (rts_argv[arg][3] != '\0') { /* otherwise, stick w/ the default */
+      RtsFlags.ParFlags.maxLocalSparks
+	= strtol(rts_argv[arg]+3, (char **) NULL, 10);
+
+      if (RtsFlags.ParFlags.maxLocalSparks <= 0) {
+	errorBelch("setupRtsFlags: bad value for -e\n");
+	*error = rtsTrue;
+      }
+    }
+    IF_PAR_DEBUG(verbose,
+		 errorBelch("-qe<n>: max %d local sparks",
+			    RtsFlags.ParFlags.maxLocalSparks));
+    break;
+
+  case 't':
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.maxThreads
+	= strtol(rts_argv[arg]+3, (char **) NULL, 10);
+    } else {
+      errorBelch("missing size for -qt\n");
+      *error = rtsTrue;
+    }
+    IF_PAR_DEBUG(verbose,
+		 errorBelch("-qt<n>: max %d threads",
+			    RtsFlags.ParFlags.maxThreads));
+    break;
+
+  case 'f':
+    if (rts_argv[arg][3] != '\0')
+      RtsFlags.ParFlags.maxFishes = decode(rts_argv[arg]+3);
+    else
+      RtsFlags.ParFlags.maxFishes = MAX_FISHES;
+    /* BUG FIX: this debug report previously sat *after* an early break and
+       was unreachable dead code; report before leaving the case */
+    IF_PAR_DEBUG(verbose,
+		 errorBelch("-qf<n>: max %d fishes sent out at one time",
+			    RtsFlags.ParFlags.maxFishes));
+    break;
+
+  case 'F':
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.fishDelay
+	= strtol(rts_argv[arg]+3, (char **) NULL, 10);
+    } else {
+      errorBelch("missing fish delay time for -qF\n");
+      *error = rtsTrue;
+    }
+    IF_PAR_DEBUG(verbose,
+		 errorBelch("-qF<n>: fish delay time %d us",
+			    RtsFlags.ParFlags.fishDelay));
+    break;
+
+  case 'O':
+    RtsFlags.ParFlags.outputDisabled = rtsTrue;
+    IF_PAR_DEBUG(verbose,
+		 errorBelch("-qO: output disabled"));
+    break;
+
+  case 'g': /* -qg<n> ... globalisation scheme */
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.globalising = decode(rts_argv[arg]+3);
+    } else {
+      errorBelch("missing identifier for globalisation scheme (for -qg)\n");
+      *error = rtsTrue;
+    }
+    IF_PAR_DEBUG(verbose,
+		 debugBelch("-qg<n>: globalisation scheme set to %d",
+			    RtsFlags.ParFlags.globalising));
+    break;
+
+  case 'h': /* -qh<n> ... max number of thunks (except root) in packet */
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.thunksToPack = decode(rts_argv[arg]+3);
+    } else {
+      errorBelch("missing number of thunks per packet (for -qh)\n");
+      *error = rtsTrue;
+    }
+    IF_PAR_DEBUG(verbose,
+		 debugBelch("-qh<n>: thunks per packet set to %d",
+			    RtsFlags.ParFlags.thunksToPack));
+    break;
+
+  case 'P': /* -qP for writing a log file */
+    //RtsFlags.ParFlags.ParStats.Full = rtsFalse;
+    /* same encoding as in GranSim after -bP */
+    switch(rts_argv[arg][3]) {
+    case '\0': RtsFlags.ParFlags.ParStats.Full = rtsTrue;
+      break; // nothing special, just an ordinary profile
+    case '0': RtsFlags.ParFlags.ParStats.Suppressed = rtsTrue;
+	RtsFlags.ParFlags.ParStats.Full = rtsFalse;
+      break;
+    case 'b': RtsFlags.ParFlags.ParStats.Binary = rtsTrue;
+      break;
+    case 's': RtsFlags.ParFlags.ParStats.Sparks = rtsTrue;
+      break;
+    //case 'h': RtsFlags.parFlags.ParStats.Heap = rtsTrue;
+    //  break;
+    case 'n': RtsFlags.ParFlags.ParStats.NewLogfile = rtsTrue;
+      break;
+    case 'g':
+# if defined(PAR_TICKY)
+      RtsFlags.ParFlags.ParStats.Global = rtsTrue;
+# else
+      errorBelch("-qPg is only possible for a PAR_TICKY RTS, which this is not");
+      stg_exit(EXIT_FAILURE);
+# endif
+      break;
+    /* BUG FIX: report the sub-option character actually being switched on
+       ([3]); [2] would always print the 'P' itself */
+    default: barf("Unknown option -qP%c", rts_argv[arg][3]);
+    }
+    IF_PAR_DEBUG(verbose,
+		 debugBelch("(-qP) writing to log-file (RtsFlags.ParFlags.ParStats.Full=%s)",
+			    (RtsFlags.ParFlags.ParStats.Full ? "rtsTrue" : "rtsFalse")));
+    break;
+
+  case 'Q': /* -qQ<n> ... set pack buffer size to <n> */
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.packBufferSize = decode(rts_argv[arg]+3);
+    } else {
+      errorBelch("missing size of PackBuffer (for -qQ)\n");
+      *error = rtsTrue;
+    }
+    IF_PAR_DEBUG(verbose,
+		 debugBelch("-qQ<n>: pack buffer size set to %d",
+			    RtsFlags.ParFlags.packBufferSize));
+    break;
+
+  case 'R':
+    RtsFlags.ParFlags.doFairScheduling = rtsTrue;
+    IF_PAR_DEBUG(verbose,
+		 debugBelch("-qR: fair-ish scheduling"));
+    break;
+
+# if defined(DEBUG)
+  case 'w':
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.wait
+	= strtol(rts_argv[arg]+3, (char **) NULL, 10);
+    } else {
+      RtsFlags.ParFlags.wait = 1000;
+    }
+    IF_PAR_DEBUG(verbose,
+		 debugBelch("-qw<n>: length of wait loop after synchr before reduction: %d",
+			    RtsFlags.ParFlags.wait));
+    break;
+
+  case 'D':  /* -qD ... all the debugging options */
+    if (isdigit(rts_argv[arg][3])) {/* Set all debugging options in one */
+      /* hack warning: interpret the flags as a binary number */
+      nat n = decode(rts_argv[arg]+3);
+      set_par_debug_options(n);
+    } else {
+      nat i;
+      for (i=0; i<=MAX_PAR_DEBUG_OPTION; i++)
+	if (rts_argv[arg][3] == par_debug_opts_flags[i])
+	  break;
+
+      if (i==MAX_PAR_DEBUG_OPTION+1) {
+	errorBelch("Valid GUM debug options are:\n");
+	help_par_debug_options(MAX_PAR_DEBUG_MASK);
+	bad_option( rts_argv[arg] );
+      } else { // flag found; now set it
+	set_par_debug_options(PAR_DEBUG_MASK(i));  // 2^i
+      }
+    }
+    break;
+# endif
+  default:
+    errorBelch("Unknown option -q%c (%d opts in total)",
+	       rts_argv[arg][2], *rts_argc);
+    break;
+  } /* switch */
+}
+
+/*
+  Interpret n as a binary number masking Par debug options and set the
+  corresponding option. See par_debug_opts_strs for explanations of the flags.
+*/
+/*
+  Enable each GUM (Par) debug option whose bit is set in the bitmask n.
+  Bit i corresponds to par_debug_opts_strs[i]; each enabled option's
+  explanation string is echoed as it is switched on.
+*/
+static void
+set_par_debug_options(nat n) {
+  nat i;
+
+  for (i=0; i<=MAX_PAR_DEBUG_OPTION; i++)
+    if ((n>>i)&1) {
+      /* use a literal format string: passing the table entry directly as
+         the format is a format-string hazard if it ever contains '%' */
+      debugBelch("%s", par_debug_opts_strs[i]);
+      switch (i) {
+        case 0: RtsFlags.ParFlags.Debug.verbose       = rtsTrue;  break;
+        case 1: RtsFlags.ParFlags.Debug.bq            = rtsTrue;  break;
+        case 2: RtsFlags.ParFlags.Debug.schedule      = rtsTrue;  break;
+        case 3: RtsFlags.ParFlags.Debug.free          = rtsTrue;  break;
+        case 4: RtsFlags.ParFlags.Debug.resume        = rtsTrue;  break;
+        case 5: RtsFlags.ParFlags.Debug.weight        = rtsTrue;  break;
+        case 6: RtsFlags.ParFlags.Debug.fetch         = rtsTrue;  break;
+        //case 7: RtsFlags.ParFlags.Debug.ack           = rtsTrue;  break;
+        case 7: RtsFlags.ParFlags.Debug.fish          = rtsTrue;  break;
+        case 8: RtsFlags.ParFlags.Debug.tables        = rtsTrue;  break;
+        case 9: RtsFlags.ParFlags.Debug.packet        = rtsTrue;  break;
+        case 10: RtsFlags.ParFlags.Debug.pack         = rtsTrue;  break;
+        case 11: RtsFlags.ParFlags.Debug.paranoia     = rtsTrue;  break;
+        default: barf("set_par_debug_options: only %d debug options expected",
+		      MAX_PAR_DEBUG_OPTION);
+      } /* switch */
+    } /* if */
+}
+
+/*
+  Print one line explanation for each of the GUM (Par) debug options
+  specified in the bitmask n.
+*/
+/*
+  Print a one-line explanation for each GUM debug option selected in the
+  bitmask n (bit i -> par_debug_opts_strs[i]).
+*/
+static void
+help_par_debug_options(nat n) {
+  nat i;
+
+  for (i=0; i<=MAX_PAR_DEBUG_OPTION; i++)
+    if ((n>>i)&1)
+      /* literal format string: never pass table data as the format */
+      debugBelch("%s", par_debug_opts_strs[i]);
+}
+
+#endif /* PAR */
+
+//@node Aux fcts, , GranSim specific options
+//@subsection Aux fcts
+
+/* printf-style output for the statistics machinery: writes to the stats
+   file f, or routes through the debug channel (vdebugBelch) when f is
+   NULL, which open_stats_file uses to mean "stderr". */
+static void
+stats_fprintf(FILE *f, char *s, ...)
+{
+    va_list ap;
+    va_start(ap,s);
+    if (f == NULL) {
+	vdebugBelch(s, ap);
+    } else {
+	vfprintf(f, s, ap);
+    }
+    va_end(ap);
+}
+
+/* Open the stats file named by rts_argv[arg] (text after the 2-char flag),
+   or a default name built from FILENAME_FMT and argv[0] when none is given.
+   The literal name "stderr" selects the debug channel, represented by a
+   NULL FILE* (see stats_fprintf).  On success the stream is stored in
+   *file_ret, the program and RTS command lines are echoed into it, and 0
+   is returned; on fopen failure -1 is returned. */
+static int /* return -1 on error */
+open_stats_file (
+    I_ arg,
+    int argc, char *argv[],
+    int rts_argc, char *rts_argv[],
+    const char *FILENAME_FMT,
+    FILE **file_ret)
+{
+    FILE *f = NULL;
+
+    if (strequal(rts_argv[arg]+2, "stderr")) { /* use debugBelch */
+	f = NULL; /* NULL means use debugBelch */
+    } else {
+	if (rts_argv[arg][2] != '\0') {  /* stats file specified */
+	    f = fopen(rts_argv[arg]+2,"w");
+	    if (f == NULL) {
+		errorBelch("Can't open stats file %s\n", rts_argv[arg]+2);
+		return -1;
+	    }
+	} else {
+	    char stats_filename[STATS_FILENAME_MAXLEN]; /* default <program>.<ext> */
+	    /* bounded formatting: argv[0] is externally controlled and an
+	       unbounded sprintf could overflow the fixed buffer */
+	    snprintf(stats_filename, STATS_FILENAME_MAXLEN, FILENAME_FMT, argv[0]);
+	    f = fopen(stats_filename,"w");
+	    if (f == NULL) {
+		/* BUG FIX: report the default filename that failed, not the
+		   (empty) option text rts_argv[arg]+2 */
+		errorBelch("Can't open stats file %s\n", stats_filename);
+		return -1;
+	    }
+	}
+    }
+    *file_ret = f;
+
+    {
+	/* Write argv and rtsv into start of stats file */
+	int count;
+	for(count = 0; count < argc; count++) {
+	    stats_fprintf(f, "%s ", argv[count]);
+	}
+	stats_fprintf(f, "+RTS ");
+	for(count = 0; count < rts_argc; count++)
+	    stats_fprintf(f, "%s ", rts_argv[count]);
+	stats_fprintf(f, "\n");
+    }
+    return 0;
+}
+
+
+
+/* Parse a numeric RTS option value: a decimal number optionally followed
+   by a multiplier suffix -- k/K = 1000, m/M = 10^6, g/G = 10^9 (decimal
+   powers, not 1024, to avoid pathological direct-mapped-cache effects),
+   or w/W = the word size in bytes.  Returns 0 for an empty string.
+   NOTE(review): the 'g' multiplier is unchecked and can overflow I_ on
+   32-bit platforms, as the original comment warns. */
+static I_
+decode(const char *s)
+{
+    I_ c;
+    StgDouble m;
+
+    if (!*s)
+	return 0;
+
+    m = atof(s);
+    c = s[strlen(s)-1];
+
+    if (c == 'g' || c == 'G')
+	m *= 1000*1000*1000;	/* UNchecked! */
+    else if (c == 'm' || c == 'M')
+	m *= 1000*1000;			/* We do not use powers of 2 (1024) */
+    else if (c == 'k' || c == 'K')	/* to avoid possible bad effects on */
+	m *= 1000;			/* a direct-mapped cache.   	    */
+    else if (c == 'w' || c == 'W')
+	m *= sizeof(W_);
+
+    return (I_)m;
+}
+
+/* Report an unrecognised or malformed RTS option and terminate the
+   program via stg_exit(EXIT_FAILURE).  Does not return. */
+static void
+bad_option(const char *s)
+{
+  errorBelch("bad RTS option: %s", s);
+  stg_exit(EXIT_FAILURE);
+}
+
+/* -----------------------------------------------------------------------------
+ Getting/Setting the program's arguments.
+
+ These are used by System.Environment, and parts of the RTS.
+ -------------------------------------------------------------------------- */
+
+/* Set the global prog_name to the basename of argv[0], stripping any
+   leading directory components so default output files land in the
+   current directory.  On Windows both '/' and '\\' count as separators;
+   elsewhere only the last '/' is considered. */
+void
+setProgName(char *argv[])
+{
+    /* Remove directory from argv[0] -- default files in current directory */
+#if !defined(mingw32_HOST_OS)
+    char *last_slash;
+    if ( (last_slash = (char *) strrchr(argv[0], '/')) != NULL ) {
+	prog_name = last_slash+1;
+    } else {
+	prog_name = argv[0];
+    }
+#else
+    /* scan backwards from the end; stop at the first separator found */
+    char* last_slash = argv[0] + (strlen(argv[0]) - 1);
+    while ( last_slash > argv[0] ) {
+	if ( *last_slash == '/' || *last_slash == '\\' ) {
+	    prog_name = last_slash+1;
+	    return;
+	}
+	last_slash--;
+    }
+    /* no separator at all: the whole argv[0] is the program name */
+    prog_name = argv[0];
+#endif
+}
+
+/* Return the saved program arguments (as recorded by setProgArgv /
+   startup) through the out-parameters; either may be NULL to skip it.
+   Used by System.Environment. */
+void
+getProgArgv(int *argc, char **argv[])
+{
+    if (argc) { *argc = prog_argc; }
+    if (argv) { *argv = prog_argv; }
+}
+
+/* Record the program's argc/argv in the RTS globals and derive prog_name
+   from argv[0].  NOTE: the argv array is stored, not copied -- the caller
+   must keep it alive for the lifetime of the program. */
+void
+setProgArgv(int argc, char *argv[])
+{
+   /* Usually this is done by startupHaskell, so we don't need to call this.
+      However, sometimes Hugs wants to change the arguments which Haskell
+      getArgs >>= ... will be fed.  So you can do that by calling here
+      _after_ calling startupHaskell.
+   */
+   prog_argc = argc;
+   prog_argv = argv;
+   setProgName(prog_argv);
+}
diff --git a/rts/RtsMessages.c b/rts/RtsMessages.c
new file mode 100644
index 0000000000..1242d886eb
--- /dev/null
+++ b/rts/RtsMessages.c
@@ -0,0 +1,201 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * General utility functions used in the RTS.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+
+#include <stdio.h>
+
+#ifdef HAVE_WINDOWS_H
+#include <windows.h>
+#endif
+
+/* -----------------------------------------------------------------------------
+ General message generation functions
+
+ All messages should go through here. We can't guarantee that
+ stdout/stderr will be available - e.g. in a Windows program there
+ is no console for generating messages, so they have to either go to
+ to the debug console, or pop up message boxes.
+ -------------------------------------------------------------------------- */
+
+// Default to the stdio implementation of these hooks.
+RtsMsgFunction *fatalInternalErrorFn = rtsFatalInternalErrorFn;
+RtsMsgFunction *debugMsgFn = rtsDebugMsgFn;
+RtsMsgFunction *errorMsgFn = rtsErrorMsgFn;
+
+/* Report a fatal internal RTS error through the pluggable
+   fatalInternalErrorFn hook and terminate.  Does not return in practice;
+   the va_end after stg_exit is unreachable cleanup kept for symmetry. */
+void
+barf(char *s, ...)
+{
+  va_list ap;
+  va_start(ap,s);
+  (*fatalInternalErrorFn)(s,ap);
+  stg_exit(EXIT_INTERNAL_ERROR); // just in case fatalInternalErrorFn() returns
+  va_end(ap);
+}
+
+/* va_list variant of barf(): report a fatal internal error through the
+   hook and terminate.  Does not return in practice. */
+void
+vbarf(char *s, va_list ap)
+{
+  (*fatalInternalErrorFn)(s,ap);
+  stg_exit(EXIT_INTERNAL_ERROR); // just in case fatalInternalErrorFn() returns
+}
+
+/* Target of the RTS ASSERT macro: report the failing file/line as a
+   fatal internal error.  Does not return. */
+void
+_assertFail(char *filename, unsigned int linenum)
+{
+  barf("ASSERTION FAILED: file %s, line %u\n", filename, linenum);
+}
+
+/* Emit a non-fatal error message through the pluggable errorMsgFn hook
+   (stdio implementation: stderr, prefixed with the program name). */
+void
+errorBelch(char *s, ...)
+{
+  va_list ap;
+  va_start(ap,s);
+  (*errorMsgFn)(s,ap);
+  va_end(ap);
+}
+
+/* va_list variant of errorBelch(). */
+void
+verrorBelch(char *s, va_list ap)
+{
+  (*errorMsgFn)(s,ap);
+}
+
+/* Emit a debug/trace message through the pluggable debugMsgFn hook
+   (stdio implementation: stderr, unprefixed). */
+void
+debugBelch(char *s, ...)
+{
+  va_list ap;
+  va_start(ap,s);
+  (*debugMsgFn)(s,ap);
+  va_end(ap);
+}
+
+/* va_list variant of debugBelch(). */
+void
+vdebugBelch(char *s, va_list ap)
+{
+  (*debugMsgFn)(s,ap);
+}
+
+/* -----------------------------------------------------------------------------
+ stdio versions of the message functions
+ -------------------------------------------------------------------------- */
+
+#define BUFSIZE 512
+
+#if defined(cygwin32_TARGET_OS) || defined (mingw32_TARGET_OS)
+/* Windows only: return non-zero if this executable was linked as a GUI
+   subsystem application (i.e. it has no console to print to), by walking
+   from the module's DOS header to the PE optional header and inspecting
+   the Subsystem field.  Returns 0 if the headers don't validate. */
+static int
+isGUIApp()
+{
+  PIMAGE_DOS_HEADER pDOSHeader;
+  PIMAGE_NT_HEADERS pPEHeader;
+
+  pDOSHeader = (PIMAGE_DOS_HEADER) GetModuleHandleA(NULL);
+  if (pDOSHeader->e_magic != IMAGE_DOS_SIGNATURE)
+    return 0;
+
+  pPEHeader = (PIMAGE_NT_HEADERS) ((char *)pDOSHeader + pDOSHeader->e_lfanew);
+  if (pPEHeader->Signature != IMAGE_NT_SIGNATURE)
+    return 0;
+
+  return (pPEHeader->OptionalHeader.Subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI);
+}
+#endif
+
+#define xstr(s) str(s)
+#define str(s) #s
+
+/* Default (stdio) implementation of the fatalInternalErrorFn hook: format
+   the message, append GHC version info and a bug-report pointer, and
+   abort().  On Windows GUI apps, where there is no console, a message box
+   is shown instead. */
+void
+rtsFatalInternalErrorFn(char *s, va_list ap)
+{
+#if defined(cygwin32_TARGET_OS) || defined (mingw32_TARGET_OS)
+   if (isGUIApp())
+   {
+	char title[BUFSIZE], message[BUFSIZE];
+
+	snprintf(title,   BUFSIZE, "%s: internal error", prog_name);
+	vsnprintf(message, BUFSIZE, s, ap);
+
+	MessageBox(NULL /* hWnd */,
+	           message,
+	           title,
+	           MB_OK | MB_ICONERROR | MB_TASKMODAL
+	           );
+   }
+   else
+#endif
+   {
+      /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+      if (prog_argv != NULL && prog_name != NULL) {
+	 fprintf(stderr, "%s: internal error: ", prog_name);
+      } else {
+	 fprintf(stderr, "internal error: ");
+      }
+      vfprintf(stderr, s, ap);
+      fprintf(stderr, "\n");
+      fprintf(stderr, "    (GHC version %s for %s)\n", ProjectVersion, xstr(HostPlatform_TYPE));
+      fprintf(stderr, "    Please report this as a GHC bug:  http://www.haskell.org/ghc/reportabug\n");
+      fflush(stderr);
+   }
+
+   /* abort() rather than exit() so a debugger/core dump captures the state */
+   abort();
+   // stg_exit(EXIT_INTERNAL_ERROR);
+}
+
+/* Default (stdio) implementation of the errorMsgFn hook: print the
+   message to stderr prefixed with the program name.  On Windows GUI apps
+   a message box is used; there the message is silently dropped if it
+   doesn't fit in BUFSIZE. */
+void
+rtsErrorMsgFn(char *s, va_list ap)
+{
+#if defined(cygwin32_TARGET_OS) || defined (mingw32_TARGET_OS)
+   if (isGUIApp())
+   {
+	char buf[BUFSIZE];
+	int r;
+
+	r = vsnprintf(buf, BUFSIZE, s, ap);
+	if (r > 0 && r < BUFSIZE) {
+	   MessageBox(NULL /* hWnd */,
+	              buf,
+	              prog_name,
+	              MB_OK | MB_ICONERROR | MB_TASKMODAL
+	              );
+	}
+   }
+   else
+#endif
+   {
+      /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+      if (prog_argv != NULL && prog_name != NULL) {
+	 fprintf(stderr, "%s: ", prog_name);
+      }
+      vfprintf(stderr, s, ap);
+      fprintf(stderr, "\n");
+   }
+}
+
+/* Default (stdio) implementation of the debugMsgFn hook: print the
+   message to stderr and flush.  On Windows GUI apps the message goes to
+   the debugger via OutputDebugString; there it is silently dropped if it
+   doesn't fit in BUFSIZE. */
+void
+rtsDebugMsgFn(char *s, va_list ap)
+{
+#if defined(cygwin32_TARGET_OS) || defined (mingw32_TARGET_OS)
+   if (isGUIApp())
+   {
+	char buf[BUFSIZE];
+	int r;
+
+	r = vsnprintf(buf, BUFSIZE, s, ap);
+	if (r > 0 && r < BUFSIZE) {
+	    OutputDebugString(buf);
+	}
+   }
+   else
+#endif
+   {
+      /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+      vfprintf(stderr, s, ap);
+      fflush(stderr);
+   }
+}
diff --git a/rts/RtsSignals.h b/rts/RtsSignals.h
new file mode 100644
index 0000000000..eafeeaaf55
--- /dev/null
+++ b/rts/RtsSignals.h
@@ -0,0 +1,78 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Signal processing / handling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef RTS_SIGNALS_H
+#define RTS_SIGNALS_H
+
+#if !defined(PAR) && !defined(mingw32_HOST_OS)
+
+#include "posix/Signals.h"
+
+#elif defined(mingw32_HOST_OS)
+
+#include "win32/ConsoleHandler.h"
+
+#else /* PAR */
+
+#define signals_pending() (rtsFalse)
+
+#endif /* PAR */
+
+
+#if RTS_USER_SIGNALS
+
+/*
+ * Function: initUserSignals()
+ *
+ * Initialize the console handling substrate.
+ */
+extern void initUserSignals(void);
+
+/*
+ * Function: initDefaultHandlers()
+ *
+ * Install any default signal/console handlers. Currently we install a
+ * Ctrl+C handler that shuts down the RTS in an orderly manner.
+ */
+extern void initDefaultHandlers(void);
+
+/*
+ * Function: blockUserSignals()
+ *
+ * Temporarily block the delivery of further console events. Needed to
+ * avoid race conditions when GCing the queue of outstanding handlers or
+ * when emptying the queue by running the handlers.
+ *
+ */
+extern void blockUserSignals(void);
+
+/*
+ * Function: unblockUserSignals()
+ *
+ * The inverse of blockUserSignals(); re-enable the delivery of console events.
+ */
+
+/*
+ * Function: awaitUserSignals()
+ *
+ * Wait for the next console event. Currently a NOP (returns immediately.)
+ */
+extern void awaitUserSignals(void);
+
+/*
+ * Function: markSignalHandlers()
+ *
+ * Evacuate the handler queue. _Assumes_ that console event delivery
+ * has already been blocked.
+ */
+extern void markSignalHandlers (evac_fn evac);
+
+#endif /* RTS_USER_SIGNALS */
+
+#endif /* RTS_SIGNALS_H */
diff --git a/rts/RtsStartup.c b/rts/RtsStartup.c
new file mode 100644
index 0000000000..147de7b857
--- /dev/null
+++ b/rts/RtsStartup.c
@@ -0,0 +1,457 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2002
+ *
+ * Main function for a standalone Haskell program.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsAPI.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "OSThreads.h"
+#include "Storage.h" /* initStorage, exitStorage */
+#include "Schedule.h" /* initScheduler */
+#include "Stats.h" /* initStats */
+#include "STM.h" /* initSTM */
+#include "Signals.h"
+#include "RtsSignals.h"
+#include "Timer.h" /* startTimer, stopTimer */
+#include "Weak.h"
+#include "Ticky.h"
+#include "StgRun.h"
+#include "Prelude.h" /* fixupRTStoPreludeRefs */
+#include "HsFFI.h"
+#include "Linker.h"
+#include "ThreadLabels.h"
+#include "BlockAlloc.h"
+
+#if defined(RTS_GTK_FRONTPANEL)
+#include "FrontPanel.h"
+#endif
+
+#if defined(PROFILING) || defined(DEBUG)
+# include "Profiling.h"
+# include "ProfHeap.h"
+# include "RetainerProfile.h"
+#endif
+
+#if defined(GRAN)
+# include "GranSimRts.h"
+#endif
+
+#if defined(GRAN) || defined(PAR)
+# include "ParallelRts.h"
+#endif
+
+#if defined(PAR)
+# include "Parallel.h"
+# include "LLC.h"
+#endif
+
+#if defined(mingw32_HOST_OS)
+#include "win32/AsyncIO.h"
+#endif
+
+#include <stdlib.h>
+
+#ifdef HAVE_TERMIOS_H
+#include <termios.h>
+#endif
+#ifdef HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+
+// Count of how many outstanding hs_init()s there have been.
+static int hs_init_count = 0;
+
+// Here we save the terminal settings on the standard file
+// descriptors, if we need to change them (eg. to support NoBuffering
+// input).
+static void *saved_termios[3] = {NULL,NULL,NULL};
+
+/* Return the termios settings previously saved for file descriptor fd,
+ * or NULL if fd is out of range or nothing has been saved for it. */
+void*
+__hscore_get_saved_termios(int fd)
+{
+  int n_fds = (int)(sizeof(saved_termios) / sizeof(*saved_termios));
+  if (fd < 0 || fd >= n_fds) {
+    return NULL;
+  }
+  return saved_termios[fd];
+}
+
+/* Record the termios settings ts for file descriptor fd; requests for
+ * out-of-range descriptors are silently ignored. */
+void
+__hscore_set_saved_termios(int fd, void* ts)
+{
+  int n_fds = (int)(sizeof(saved_termios) / sizeof(*saved_termios));
+  if (fd < 0 || fd >= n_fds) {
+    return;
+  }
+  saved_termios[fd] = ts;
+}
+
+/* -----------------------------------------------------------------------------
+ Initialise floating point unit on x86 (currently disabled. why?)
+ (see comment in ghc/compiler/nativeGen/MachInstrs.lhs).
+ -------------------------------------------------------------------------- */
+
+#define X86_INIT_FPU 0
+
+#if X86_INIT_FPU
+/* Switch the x87 FPU from its default 80-bit extended precision to
+ * 64-bit (double) precision by rewriting the precision-control bits
+ * (8-9) of the FPU control word.  Only compiled when X86_INIT_FPU is
+ * non-zero, which it currently is not (see comment above). */
+static void
+x86_init_fpu ( void )
+{
+  __volatile unsigned short int fpu_cw;
+
+  // Grab the control word
+  __asm __volatile ("fnstcw %0" : "=m" (fpu_cw));
+
+#if 0
+  printf("fpu_cw: %x\n", fpu_cw);
+#endif
+
+  // Set bits 8-9 to 10 (64-bit precision).
+  fpu_cw = (fpu_cw & 0xfcff) | 0x0200;
+
+  // Store the new control word back
+  __asm __volatile ("fldcw %0" : : "m" (fpu_cw));
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ Starting up the RTS
+ -------------------------------------------------------------------------- */
+
+/*
+ * hs_init: bring the runtime system up.
+ *
+ * Only the first call does real work (flag parsing, scheduler, storage
+ * manager, timer, stats, signal handlers); later calls just increment
+ * hs_init_count so that a matching number of hs_exit() calls is needed
+ * before the RTS is actually torn down.  argc/argv may be NULL, in
+ * which case no RTS-flag processing is performed.
+ */
+void
+hs_init(int *argc, char **argv[])
+{
+    hs_init_count++;
+    if (hs_init_count > 1) {
+        // second and subsequent inits are ignored
+        return;
+    }
+
+    /* The very first thing we do is grab the start time...just in case we're
+     * collecting timing statistics.
+     */
+    stat_startInit();
+
+#ifdef PAR
+    /*
+     * The parallel system needs to be initialised and synchronised before
+     * the program is run.
+     */
+    startupParallelSystem(argv);
+
+    if (*argv[0] == '-') { /* Strip off mainPE flag argument */
+        argv++;
+        argc--;
+    }
+
+    argv[1] = argv[0];   /* ignore the nPEs argument */
+    argv++; argc--;
+#endif
+
+    /* Set the RTS flags to default values. */
+    initRtsFlagsDefaults();
+
+    /* Call the user hook to reset defaults, if present */
+    defaultsHook();
+
+    /* Parse the flags, separating the RTS flags from the programs args */
+    if (argc != NULL && argv != NULL) {
+        setupRtsFlags(argc, *argv, &rts_argc, rts_argv);
+        setProgArgv(*argc,*argv);
+    }
+
+#if defined(PAR)
+    /* NB: this really must be done after processing the RTS flags */
+    IF_PAR_DEBUG(verbose,
+                 debugBelch("==== Synchronising system (%d PEs)\n", nPEs));
+    synchroniseSystem();             // calls initParallelSystem etc
+#endif  /* PAR */
+
+    /* Perform initialisation of adjustor thunk layer. */
+    initAdjustor();
+
+    /* initialise scheduler data structures (needs to be done before
+     * initStorage()).
+     */
+    initScheduler();
+
+#if defined(GRAN)
+    /* And start GranSim profiling if required: */
+    if (RtsFlags.GranFlags.GranSimStats.Full)
+      init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv);
+#elif defined(PAR)
+    /* And start GUM profiling if required: */
+    if (RtsFlags.ParFlags.ParStats.Full)
+      init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv);
+#endif  /* PAR || GRAN */
+
+    /* initialize the storage manager */
+    initStorage();
+
+    /* initialise the stable pointer table */
+    initStablePtrTable();
+
+#if defined(DEBUG)
+    /* initialise thread label table (tso->char*) */
+    initThreadLabelTable();
+#endif
+
+#if defined(PROFILING) || defined(DEBUG)
+    initProfiling1();
+#endif
+
+    /* start the virtual timer 'subsystem'. */
+    startTimer(TICK_MILLISECS);
+
+    /* Initialise the stats department */
+    initStats();
+
+#if defined(RTS_USER_SIGNALS)
+    /* Initialise the user signal handler set */
+    initUserSignals();
+    /* Set up handler to run on SIGINT, etc. */
+    initDefaultHandlers();
+#endif
+
+#if defined(mingw32_HOST_OS)
+    startupAsyncIO();
+#endif
+
+#ifdef RTS_GTK_FRONTPANEL
+    if (RtsFlags.GcFlags.frontpanel) {
+        initFrontPanel();
+    }
+#endif
+
+#if X86_INIT_FPU
+    x86_init_fpu();
+#endif
+
+    /* Record initialization times */
+    stat_endInit();
+}
+
+// Compatibility interface: old-style entry point that initialises the
+// RTS and, when given, registers init_root as a module-initialisation
+// root via hs_add_root().
+void
+startupHaskell(int argc, char *argv[], void (*init_root)(void))
+{
+    hs_init(&argc, &argv);
+    if (init_root != NULL) {
+        hs_add_root(init_root);
+    }
+}
+
+
+/* -----------------------------------------------------------------------------
+ Per-module initialisation
+
+ This process traverses all the compiled modules in the program
+ starting with "Main", and performing per-module initialisation for
+ each one.
+
+ So far, two things happen at initialisation time:
+
+ - we register stable names for each foreign-exported function
+ in that module. This prevents foreign-exported entities, and
+ things they depend on, from being garbage collected.
+
+ - we supply a unique integer to each statically declared cost
+ centre and cost centre stack in the program.
+
+ The code generator inserts a small function "__stginit_<module>" in each
+ module and calls the registration functions in each of the modules it
+ imports.
+
+ The init* functions are compiled in the same way as STG code,
+ i.e. without normal C call/return conventions. Hence we must use
+ StgRun to call this stuff.
+ -------------------------------------------------------------------------- */
+
+/* The init functions use an explicit stack...
+ */
+#define INIT_STACK_BLOCKS 4
+static F_ *init_stack = NULL;
+
+/*
+ * Run the per-module initialisation code reachable from init_root.
+ * The __stginit_* functions are compiled as STG code (no C calling
+ * convention), so they are executed via StgRun on a temporary,
+ * explicitly-allocated downward-growing stack; stg_init_finish is
+ * pushed first so control returns here when the traversal is done.
+ */
+void
+hs_add_root(void (*init_root)(void))
+{
+    bdescr *bd;
+    nat init_sp;
+    Capability *cap = &MainCapability;
+
+    if (hs_init_count <= 0) {
+        barf("hs_add_root() must be called after hs_init()");
+    }
+
+    /* The initialisation stack grows downward, with sp pointing
+       to the last occupied word */
+    init_sp = INIT_STACK_BLOCKS*BLOCK_SIZE_W;
+    bd = allocGroup_lock(INIT_STACK_BLOCKS);
+    init_stack = (F_ *)bd->start;
+    init_stack[--init_sp] = (F_)stg_init_finish;
+    if (init_root != NULL) {
+        init_stack[--init_sp] = (F_)init_root;
+    }
+
+    cap->r.rSp = (P_)(init_stack + init_sp);
+    StgRun((StgFunPtr)stg_init, &cap->r);
+
+    freeGroup_lock(bd);
+
+#if defined(PROFILING) || defined(DEBUG)
+    // This must be done after module initialisation.
+    // ToDo: make this work in the presence of multiple hs_add_root()s.
+    initProfiling2();
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Shutting down the RTS
+ -------------------------------------------------------------------------- */
+
+/*
+ * hs_exit: shut the runtime system down.
+ *
+ * Calls are counted against hs_init(): only the call that balances the
+ * first hs_init() performs the real shutdown (scheduler, timer,
+ * terminal state, storage manager, stats).  Calling it more times than
+ * hs_init() was called produces a warning and does nothing.
+ */
+void
+hs_exit(void)
+{
+    if (hs_init_count <= 0) {
+        errorBelch("warning: too many hs_exit()s");
+        return;
+    }
+    hs_init_count--;
+    if (hs_init_count > 0) {
+        // ignore until it's the last one
+        return;
+    }
+
+    /* start timing the shutdown */
+    stat_startExit();
+
+    /* stop all running tasks */
+    exitScheduler();
+
+#if defined(GRAN)
+    /* end_gr_simulation prints global stats if requested -- HWL */
+    if (!RtsFlags.GranFlags.GranSimStats.Suppressed)
+        end_gr_simulation();
+#endif
+
+    /* stop the ticker */
+    stopTimer();
+
+    /* reset the standard file descriptors to blocking mode */
+    resetNonBlockingFd(0);
+    resetNonBlockingFd(1);
+    resetNonBlockingFd(2);
+
+#if HAVE_TERMIOS_H
+    // Reset the terminal settings on the standard file descriptors,
+    // if we changed them.  See System.Posix.Internals.tcSetAttr for
+    // more details, including the reason we temporarily disable
+    // SIGTTOU here.
+    {
+        int fd;
+        sigset_t sigset, old_sigset;
+        sigemptyset(&sigset);
+        sigaddset(&sigset, SIGTTOU);
+        sigprocmask(SIG_BLOCK, &sigset, &old_sigset);
+        for (fd = 0; fd <= 2; fd++) {
+            struct termios* ts = (struct termios*)__hscore_get_saved_termios(fd);
+            if (ts != NULL) {
+                tcsetattr(fd,TCSANOW,ts);
+            }
+        }
+        sigprocmask(SIG_SETMASK, &old_sigset, NULL);
+    }
+#endif
+
+#if defined(PAR)
+    /* controlled exit; good thread! */
+    shutdownParallelSystem(0);
+
+    /* global statistics in parallel system */
+    PAR_TICKY_PAR_END();
+#endif
+
+    /* stop timing the shutdown, we're about to print stats */
+    stat_endExit();
+
+    // clean up things from the storage manager's point of view.
+    // also outputs the stats (+RTS -s) info.
+    exitStorage();
+
+#ifdef RTS_GTK_FRONTPANEL
+    if (RtsFlags.GcFlags.frontpanel) {
+        stopFrontPanel();
+    }
+#endif
+
+#if defined(PROFILING)
+    reportCCSProfiling();
+#endif
+
+#if defined(PROFILING) || defined(DEBUG)
+    endProfiling();
+#endif
+
+#ifdef PROFILING
+    // Originally, this was in report_ccs_profiling().  Now, retainer
+    // profiling might tack some extra stuff on to the end of this file
+    // during endProfiling().
+    fclose(prof_file);
+#endif
+
+#if defined(TICKY_TICKY)
+    if (RtsFlags.TickyFlags.showTickyStats) PrintTickyInfo();
+#endif
+
+#if defined(mingw32_HOST_OS)
+    shutdownAsyncIO();
+#endif
+
+    // Finally, free all our storage.
+    freeStorage();
+}
+
+// Compatibility interfaces
+/* Compatibility wrapper: old name for hs_exit(). */
+void
+shutdownHaskell(void)
+{
+    hs_exit();
+}
+
+/*
+ * Compatibility wrapper: run OnExitHook, shut the RTS down and
+ * terminate the process with code n.  Only acts when this is the
+ * outermost initialisation (hs_init_count == 1), so nested inits are
+ * not torn down prematurely.
+ */
+void
+shutdownHaskellAndExit(int n)
+{
+    if (hs_init_count == 1) {
+        OnExitHook();
+        hs_exit();
+#if defined(PAR)
+        /* really exit (stg_exit() would call shutdownParallelSystem() again) */
+        exit(n);
+#else
+        stg_exit(n);
+#endif
+    }
+}
+
+/*
+ * called from STG-land to exit the program
+ */
+
+#ifdef PAR
+static int exit_started=rtsFalse;
+#endif
+
+/*
+ * Terminate the process with exit code n.  Called from STG-land; in
+ * the PAR build the parallel system is shut down exactly once first
+ * (exit_started guards against recursive shutdown on error paths).
+ */
+void
+stg_exit(int n)
+{
+#ifdef PAR
+  /* HACK: avoid a loop when exiting due to a stupid error */
+  if (exit_started)
+    return;
+  exit_started=rtsTrue;
+
+  IF_PAR_DEBUG(verbose, debugBelch("==-- stg_exit %d on [%x]...", n, mytid));
+  shutdownParallelSystem(n);
+#endif
+  exit(n);
+}
diff --git a/rts/RtsUtils.c b/rts/RtsUtils.c
new file mode 100644
index 0000000000..3e7e225dda
--- /dev/null
+++ b/rts/RtsUtils.c
@@ -0,0 +1,367 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * General utility functions used in the RTS.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/* gettimeofday isn't POSIX */
+/* #include "PosixSource.h" */
+
+#include "Rts.h"
+#include "RtsAPI.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Ticky.h"
+
+#ifdef HAVE_TIME_H
+#include <time.h>
+#endif
+
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+
+#ifdef HAVE_GETTIMEOFDAY
+#include <sys/time.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#ifdef HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+
+#if defined(THREADED_RTS) && defined(openbsd_HOST_OS) && defined(HAVE_PTHREAD_H)
+#include <pthread.h>
+#endif
+
+#if defined(openbsd_HOST_OS) || defined(linux_HOST_OS) || defined(darwin_HOST_OS)
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+
+/* no C99 header stdint.h on OpenBSD? */
+#if defined(openbsd_HOST_OS)
+typedef unsigned long my_uintptr_t;
+#else
+#include <stdint.h>
+typedef uintptr_t my_uintptr_t;
+#endif
+#endif
+
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+/* -----------------------------------------------------------------------------
+ Result-checking malloc wrappers.
+ -------------------------------------------------------------------------- */
+
+/* Allocate n bytes with malloc().  On failure, report the request via
+ * MallocFailHook (msg identifies the caller) and abort the RTS, so a
+ * successful return is never NULL. */
+void *
+stgMallocBytes (int n, char *msg)
+{
+  void *ptr = malloc((size_t) n);
+
+  if (ptr == NULL) {
+      /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+      MallocFailHook((W_) n, msg); /*msg*/
+      stg_exit(EXIT_INTERNAL_ERROR);
+  }
+  return ptr;
+}
+
+/* Resize the allocation at p to n bytes with realloc().  On failure,
+ * report via MallocFailHook (msg identifies the caller) and abort the
+ * RTS, so a successful return is never NULL. */
+void *
+stgReallocBytes (void *p, int n, char *msg)
+{
+  void *ptr = realloc(p, (size_t) n);
+
+  if (ptr == NULL) {
+      /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+      MallocFailHook((W_) n, msg); /*msg*/
+      stg_exit(EXIT_INTERNAL_ERROR);
+  }
+  return ptr;
+}
+
+/* Allocate a zero-initialised array of n elements of m bytes each with
+ * calloc().  On failure, report via MallocFailHook (msg identifies the
+ * caller) and abort the RTS, so a successful return is never NULL. */
+void *
+stgCallocBytes (int n, int m, char *msg)
+{
+  void *ptr = calloc((size_t) n, (size_t) m);
+
+  if (ptr == NULL) {
+      /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+      MallocFailHook((W_) n*m, msg); /*msg*/
+      stg_exit(EXIT_INTERNAL_ERROR);
+  }
+  return ptr;
+}
+
+/* To simplify changing the underlying allocator used
+ * by stgMallocBytes(), provide stgFree() as well.
+ */
+/* Release memory obtained from stgMallocBytes() and friends.  Kept as
+ * a wrapper so the underlying allocator can be swapped out in one
+ * place; free(NULL) is a no-op, so p may be NULL. */
+void
+stgFree(void* p)
+{
+    free(p);
+}
+
+/* -----------------------------------------------------------------------------
+ Stack overflow
+
+ Not sure if this belongs here.
+ -------------------------------------------------------------------------- */
+
+/* Report a stack overflow via StackOverflowHook, passing the configured
+ * maximum stack size in bytes.  Does not itself terminate the program. */
+void
+stackOverflow(void)
+{
+  StackOverflowHook(RtsFlags.GcFlags.maxStkSize * sizeof(W_));
+
+#if defined(TICKY_TICKY)
+  if (RtsFlags.TickyFlags.showTickyStats) PrintTickyInfo();
+#endif
+}
+
+/* Report heap exhaustion via OutOfHeapHook (request size unknown here)
+ * and terminate the RTS with EXIT_HEAPOVERFLOW. */
+void
+heapOverflow(void)
+{
+    /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+    OutOfHeapHook(0/*unknown request size*/,
+                  RtsFlags.GcFlags.maxHeapSize * BLOCK_SIZE);
+
+#if defined(TICKY_TICKY)
+    if (RtsFlags.TickyFlags.showTickyStats) PrintTickyInfo();
+#endif
+
+    stg_exit(EXIT_HEAPOVERFLOW);
+}
+
+/* -----------------------------------------------------------------------------
+ Out-of-line strlen.
+
+ Used in addr2Integer because the C compiler on x86 chokes on
+ strlen, trying to inline it with not enough registers available.
+ -------------------------------------------------------------------------- */
+
+/* Out-of-line strlen: return the length of the NUL-terminated string s.
+ * Deliberately kept out of line (see the comment block above) to avoid
+ * an x86 register-pressure problem when the C compiler inlines strlen. */
+nat stg_strlen(char *s)
+{
+   nat len = 0;
+
+   while (s[len] != '\0') {
+       len++;
+   }
+   return len;
+}
+
+
+/* -----------------------------------------------------------------------------
+ genSym stuff, used by GHC itself for its splitting unique supply.
+
+ ToDo: put this somewhere sensible.
+ ------------------------------------------------------------------------- */
+
+// Monotonic counter backing GHC's splitting unique supply.  Plain
+// non-atomic state: assumes callers serialise access.
+static I_ __GenSymCounter = 0;
+
+/* Return the next value of the unique-supply counter. */
+I_
+genSymZh(void)
+{
+    return(__GenSymCounter++);
+}
+
+/* Reset the unique-supply counter to zero and return the new value.
+ * Dangerous if uniques from before the reset are still live. */
+I_
+resetGenSymZh(void) /* it's your funeral */
+{
+    __GenSymCounter=0;
+    return(__GenSymCounter);
+}
+
+/* -----------------------------------------------------------------------------
+ Get the current time as a string. Used in profiling reports.
+ -------------------------------------------------------------------------- */
+
+#if defined(PROFILING) || defined(DEBUG) || defined(PAR) || defined(GRAN)
+/* Return the time of the first call as a static string (ctime format
+ * with the seconds field spliced out, no trailing newline).  The time
+ * is sampled once and cached; the returned buffer is static, so it
+ * must not be freed and is shared by all callers. */
+char *
+time_str(void)
+{
+    static time_t now = 0;
+    static char nowstr[26];
+
+    if (now == 0) {
+        time(&now);
+#if HAVE_CTIME_R
+        ctime_r(&now, nowstr);
+#else
+        strcpy(nowstr, ctime(&now));
+#endif
+        memmove(nowstr+16,nowstr+19,7);  // splice out the ":ss" seconds field
+        nowstr[21] = '\0';  // removes the \n
+    }
+    return nowstr;
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Reset a file handle to blocking mode. We do this for the standard
+ * file descriptors before exiting, because the shell doesn't always
+ * clean up for us.
+ * -------------------------------------------------------------------------- */
+
+#if !defined(mingw32_HOST_OS)
+/* Put fd back into blocking mode if it is currently non-blocking.
+ * Used on the standard descriptors at shutdown because the shell does
+ * not always clean up after us. */
+void
+resetNonBlockingFd(int fd)
+{
+  long flags = fcntl(fd, F_GETFL);
+
+  if ((flags & O_NONBLOCK) != 0) {
+    fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
+  }
+}
+
+/* Put fd into non-blocking mode if it is not already. */
+void
+setNonBlockingFd(int fd)
+{
+  long fd_flags;
+
+  /* set the non-blocking flag on this file descriptor */
+  fd_flags = fcntl(fd, F_GETFL);
+  if (!(fd_flags & O_NONBLOCK)) {
+    fcntl(fd, F_SETFL, fd_flags | O_NONBLOCK);
+  }
+}
+#else
+/* Stub defns -- async / non-blocking IO is not done
+ * via O_NONBLOCK and select() under Win32.
+ */
+void resetNonBlockingFd(int fd STG_UNUSED) {}
+void setNonBlockingFd(int fd STG_UNUSED) {}
+#endif
+
+#ifdef PAR
+static ullong startTime = 0;
+
+/* Wall-clock time in milliseconds relative to startTime, for the
+ * parallel (PAR) RTS.  Uses getclock() or gettimeofday() where
+ * available, falling back to time() (only 1-second resolution);
+ * aborts the RTS if the chosen clock source fails. */
+ullong
+msTime(void)
+{
+# if defined(HAVE_GETCLOCK) && !defined(alpha_HOST_ARCH) && !defined(hppa1_1_HOST_ARCH)
+    struct timespec tv;
+
+    if (getclock(TIMEOFDAY, &tv) != 0) {
+        fflush(stdout);
+        fprintf(stderr, "Clock failed\n");
+        stg_exit(EXIT_FAILURE);
+    }
+    return tv.tv_sec * LL(1000) + tv.tv_nsec / LL(1000000) - startTime;
+# elif HAVE_GETTIMEOFDAY && !defined(alpha_HOST_ARCH)
+    struct timeval tv;
+
+    if (gettimeofday(&tv, NULL) != 0) {
+        fflush(stdout);
+        fprintf(stderr, "Clock failed\n");
+        stg_exit(EXIT_FAILURE);
+    }
+    return tv.tv_sec * LL(1000) + tv.tv_usec / LL(1000) - startTime;
+# else
+    time_t t;
+    if ((t = time(NULL)) == (time_t) -1) {
+        fflush(stdout);
+        fprintf(stderr, "Clock failed\n");
+        stg_exit(EXIT_FAILURE);
+    }
+    return t * LL(1000) - startTime;
+# endif
+}
+#endif /* PAR */
+
+/* -----------------------------------------------------------------------------
+ Print large numbers, with punctuation.
+ -------------------------------------------------------------------------- */
+
+/* Render x in decimal into the caller-supplied buffer s, inserting
+ * commas between three-digit groups when with_commas is true.  The
+ * buffer must be large enough for the formatted number.  Returns s. */
+char *
+ullong_format_string(ullong x, char *s, rtsBool with_commas)
+{
+    if (x < (ullong)1000)
+        sprintf(s, "%lu", (lnat)x);
+    else if (x < (ullong)1000000)
+        sprintf(s, (with_commas) ? "%lu,%3.3lu" : "%lu%3.3lu",
+                (lnat)((x)/(ullong)1000),
+                (lnat)((x)%(ullong)1000));
+    else if (x < (ullong)1000000000)
+        sprintf(s, (with_commas) ? "%lu,%3.3lu,%3.3lu" :  "%lu%3.3lu%3.3lu",
+                (lnat)((x)/(ullong)1000000),
+                (lnat)((x)/(ullong)1000%(ullong)1000),
+                (lnat)((x)%(ullong)1000));
+    else
+        sprintf(s, (with_commas) ? "%lu,%3.3lu,%3.3lu,%3.3lu" : "%lu%3.3lu%3.3lu%3.3lu",
+                (lnat)((x)/(ullong)1000000000),
+                (lnat)((x)/(ullong)1000000%(ullong)1000),
+                (lnat)((x)/(ullong)1000%(ullong)1000),
+                (lnat)((x)%(ullong)1000));
+    return s;
+}
+
+
+// Can be used as a breakpoint to set on every heap check failure.
+#ifdef DEBUG
+/* Empty on purpose: a convenient symbol to set a debugger breakpoint
+ * on for every heap-check failure (see the comment above). */
+void
+heapCheckFail( void )
+{
+}
+#endif
+
+/*
+ * It seems that pthreads and signals interact oddly in OpenBSD & FreeBSD
+ * pthreads (and possibly others). When linking with -lpthreads, we
+ * have to use pthread_kill to send blockable signals. So use that
+ * when we have a threaded rts. So System.Posix.Signals will call
+ * genericRaise(), rather than raise(3).
+ */
+/* Raise signal sig in the current thread.  Under the threaded RTS on
+ * OpenBSD/FreeBSD pthreads this must go via pthread_kill (see comment
+ * above); elsewhere plain raise(3) suffices.  Returns 0 on success,
+ * non-zero on failure, like raise(). */
+int genericRaise(int sig) {
+#if defined(THREADED_RTS) && (defined(openbsd_HOST_OS) || defined(freebsd_HOST_OS))
+        return pthread_kill(pthread_self(), sig);
+#else
+        return raise(sig);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Allocating executable memory
+ -------------------------------------------------------------------------- */
+
+/* Heavily arch-specific, I'm afraid.. */
+
+/*
+ * Allocate len bytes which are readable, writable, and executable.
+ *
+ * ToDo: If this turns out to be a performance bottleneck, one could
+ * e.g. cache the last VirtualProtect/mprotect-ed region and do
+ * nothing in case of a cache hit.
+ */
+/* Return len bytes of malloc'd memory made readable, writable and
+ * executable (used for dynamically generated code, e.g. adjustors).
+ * On Win32 the region is VirtualProtect-ed RWX; on OpenBSD/Linux/
+ * Darwin the containing pages are mprotect-ed RWX -- note this rounds
+ * out to page boundaries, so neighbouring heap data on those pages
+ * also becomes executable.  Aborts via barf() if protection fails. */
+void*
+stgMallocBytesRWX(int len)
+{
+  void *addr = stgMallocBytes(len, "mallocBytesRWX");
+#if defined(i386_HOST_ARCH) && defined(_WIN32)
+  /* This could be necessary for processors which distinguish between READ and
+     EXECUTE memory accesses, e.g. Itaniums. */
+  DWORD dwOldProtect = 0;
+  if (VirtualProtect (addr, len, PAGE_EXECUTE_READWRITE, &dwOldProtect) == 0) {
+    barf("mallocBytesRWX: failed to protect 0x%p; error=%lu; old protection: %lu\n",
+         addr, (unsigned long)GetLastError(), (unsigned long)dwOldProtect);
+  }
+#elif defined(openbsd_HOST_OS) || defined(linux_HOST_OS) || defined(darwin_HOST_OS)
+  /* malloced memory isn't executable by default on OpenBSD */
+  my_uintptr_t pageSize         = sysconf(_SC_PAGESIZE);
+  my_uintptr_t mask             = ~(pageSize - 1);
+  my_uintptr_t startOfFirstPage = ((my_uintptr_t)addr          ) & mask;
+  my_uintptr_t startOfLastPage  = ((my_uintptr_t)addr + len - 1) & mask;
+  my_uintptr_t size             = startOfLastPage - startOfFirstPage + pageSize;
+  if (mprotect((void*)startOfFirstPage, (size_t)size, PROT_EXEC | PROT_READ | PROT_WRITE) != 0) {
+    barf("mallocBytesRWX: failed to protect 0x%p\n", addr);
+  }
+#endif
+  return addr;
+}
diff --git a/rts/RtsUtils.h b/rts/RtsUtils.h
new file mode 100644
index 0000000000..96a5f0d82f
--- /dev/null
+++ b/rts/RtsUtils.h
@@ -0,0 +1,54 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * General utility functions used in the RTS.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef RTSUTILS_H
+#define RTSUTILS_H
+
+/* -----------------------------------------------------------------------------
+ * (Checked) dynamic allocation
+ * -------------------------------------------------------------------------- */
+
+extern void *stgMallocBytes(int n, char *msg)
+ GNUC3_ATTRIBUTE(__malloc__);
+
+extern void* stgMallocBytesRWX(int len)
+ GNUC3_ATTRIBUTE(__malloc__);
+
+extern void *stgReallocBytes(void *p, int n, char *msg);
+
+extern void *stgCallocBytes(int n, int m, char *msg)
+ GNUC3_ATTRIBUTE(__malloc__);
+
+extern void stgFree(void* p);
+
+/* -----------------------------------------------------------------------------
+ * Misc other utilities
+ * -------------------------------------------------------------------------- */
+
+extern void heapOverflow(void);
+
+extern void setNonBlockingFd(int fd);
+extern void resetNonBlockingFd(int fd);
+
+extern nat stg_strlen(char *str);
+
+extern char *time_str(void);
+extern char *ullong_format_string(ullong, char *, rtsBool);
+
+#ifdef PAR
+extern ullong msTime(void);
+#endif
+
+#ifdef DEBUG
+extern void heapCheckFail( void );
+#endif
+
+extern void* __hscore_get_saved_termios(int fd);
+extern void __hscore_set_saved_termios(int fd, void* ts);
+
+#endif /* RTSUTILS_H */
diff --git a/rts/STM.c b/rts/STM.c
new file mode 100644
index 0000000000..d3283a92f0
--- /dev/null
+++ b/rts/STM.c
@@ -0,0 +1,1261 @@
+/* -----------------------------------------------------------------------------
+ * (c) The GHC Team 1998-2005
+ *
+ * STM implementation.
+ *
+ * Overview
+ * --------
+ *
+ * See the PPoPP 2005 paper "Composable memory transactions". In summary,
+ * each transaction has a TRec (transaction record) holding entries for each of the
+ * TVars (transactional variables) that it has accessed. Each entry records
+ * (a) the TVar, (b) the expected value seen in the TVar, (c) the new value that
+ * the transaction wants to write to the TVar, (d) during commit, the identity of
+ * the TRec that wrote the expected value.
+ *
+ * Separate TRecs are used for each level in a nest of transactions. This allows
+ * a nested transaction to be aborted without condemning its enclosing transactions.
+ * This is needed in the implementation of catchRetry. Note that the "expected value"
+ * in a nested transaction's TRec is the value expected to be *held in memory* if
+ * the transaction commits -- not the "new value" stored in one of the enclosing
+ * transactions. This means that validation can be done without searching through
+ * a nest of TRecs.
+ *
+ * Concurrency control
+ * -------------------
+ *
+ * Three different concurrency control schemes can be built according to the settings
+ * in STM.h:
+ *
+ * STM_UNIPROC assumes that the caller serialises invocations on the STM interface.
+ * In the Haskell RTS this means it is suitable only for non-THREADED_RTS builds.
+ *
+ * STM_CG_LOCK uses coarse-grained locking -- a single 'stm lock' is acquired during
+ * an invocation on the STM interface. Note that this does not mean that
+ * transactions are simply serialized -- the lock is only held *within* the
+ * implementation of stmCommitTransaction, stmWait etc.
+ *
+ * STM_FG_LOCKS uses fine-grained locking -- locking is done on a per-TVar basis
+ * and, when committing a transaction, no locks are acquired for TVars that have
+ * been read but not updated.
+ *
+ * Concurrency control is implemented in the functions:
+ *
+ * lock_stm
+ * unlock_stm
+ * lock_tvar / cond_lock_tvar
+ * unlock_tvar
+ *
+ * The choice between STM_UNIPROC / STM_CG_LOCK / STM_FG_LOCKS affects the
+ * implementation of these functions.
+ *
+ * lock_stm & unlock_stm are straightforward : they acquire a simple spin-lock
+ * using STM_CG_LOCK, and otherwise they are no-ops.
+ *
+ * lock_tvar / cond_lock_tvar and unlock_tvar are more complex because they
+ * have other effects (present in STM_UNIPROC and STM_CG_LOCK builds) as well
+ * as the actual business of manipulating a lock (present only in STM_FG_LOCKS
+ * builds). This is because locking a TVar is implemented by writing the lock
+ * holder's TRec into the TVar's current_value field:
+ *
+ * lock_tvar - lock a specified TVar (STM_FG_LOCKS only), returning the value
+ * it contained.
+ *
+ * cond_lock_tvar - lock a specified TVar (STM_FG_LOCKS only) if it
+ * contains a specified value. Return TRUE if this succeeds,
+ * FALSE otherwise.
+ *
+ * unlock_tvar - release the lock on a specified TVar (STM_FG_LOCKS only),
+ * storing a specified value in place of the lock entry.
+ *
+ * Using these operations, the typical pattern of a commit/validate/wait operation
+ * is to (a) lock the STM, (b) lock all the TVars being updated, (c) check that
+ * the TVars that were only read from still contain their expected values,
+ * (d) release the locks on the TVars, writing updates to them in the case of a
+ * commit, (e) unlock the STM.
+ *
+ * Queues of waiting threads hang off the first_wait_queue_entry field of each
+ * TVar. This may only be manipulated when holding that TVar's lock. In
+ * particular, when a thread is putting itself to sleep, it mustn't release
+ * the TVar's lock until it has added itself to the wait queue and marked its
+ * TSO as BlockedOnSTM -- this makes sure that other threads will know to wake it.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Schedule.h"
+#include "SMP.h"
+#include "STM.h"
+#include "Storage.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define TRUE 1
+#define FALSE 0
+
+// ACQ_ASSERT is used for assertions which are only required for
+// THREADED_RTS builds with fine-grained locking.
+
+#if defined(STM_FG_LOCKS)
+#define ACQ_ASSERT(_X) ASSERT(_X)
+#define NACQ_ASSERT(_X) /*Nothing*/
+#else
+#define ACQ_ASSERT(_X) /*Nothing*/
+#define NACQ_ASSERT(_X) ASSERT(_X)
+#endif
+
+/*......................................................................*/
+
+// If SHAKE is defined then validation will sometimes spuriously fail.  This
+// helps test unusual code paths when genuine contention is rare
+
+#if defined(DEBUG)
+#define SHAKE
+#if defined(THREADED_RTS)
+#define TRACE(_x...) IF_DEBUG(stm, debugBelch("STM (task %p): ", (void *)(unsigned long)(unsigned int)osThreadId()); debugBelch ( _x ))
+#else
+#define TRACE(_x...) IF_DEBUG(stm, debugBelch ( _x ))
+#endif
+#else
+#define TRACE(_x...) /*Nothing*/
+#endif
+
+#ifdef SHAKE
+static const int do_shake = TRUE;
+#else
+static const int do_shake = FALSE;
+#endif
+static int shake_ctr = 0;
+static int shake_lim = 1;
+
+/* Decide whether to inject a spurious validation failure (SHAKE builds
+ * only).  Fires with decreasing frequency: every hit resets the counter
+ * and widens the interval.  Always FALSE when SHAKE is not defined. */
+static int shake(void) {
+  if (!do_shake) {
+    return FALSE;
+  }
+  if ((shake_ctr++ % shake_lim) != 0) {
+    return FALSE;
+  }
+  shake_ctr = 1;
+  shake_lim++;
+  return TRUE;
+}
+
+/*......................................................................*/
+
+// Helper macros for iterating over entries within a transaction
+// record
+
+#define FOR_EACH_ENTRY(_t,_x,CODE) do { \
+ StgTRecHeader *__t = (_t); \
+ StgTRecChunk *__c = __t -> current_chunk; \
+ StgWord __limit = __c -> next_entry_idx; \
+ TRACE("%p : FOR_EACH_ENTRY, current_chunk=%p limit=%ld\n", __t, __c, __limit); \
+ while (__c != END_STM_CHUNK_LIST) { \
+ StgWord __i; \
+ for (__i = 0; __i < __limit; __i ++) { \
+ TRecEntry *_x = &(__c -> entries[__i]); \
+ do { CODE } while (0); \
+ } \
+ __c = __c -> prev_chunk; \
+ __limit = TREC_CHUNK_NUM_ENTRIES; \
+ } \
+ exit_for_each: \
+ if (FALSE) goto exit_for_each; \
+} while (0)
+
+#define BREAK_FOR_EACH goto exit_for_each
+
+/*......................................................................*/
+
+// if REUSE_MEMORY is defined then attempt to re-use descriptors, log chunks,
+// and wait queue entries without GC
+
+#define REUSE_MEMORY
+
+/*......................................................................*/
+
+#define IF_STM_UNIPROC(__X) do { } while (0)
+#define IF_STM_CG_LOCK(__X) do { } while (0)
+#define IF_STM_FG_LOCKS(__X) do { } while (0)
+
+#if defined(STM_UNIPROC)
+#undef IF_STM_UNIPROC
+#define IF_STM_UNIPROC(__X) do { __X } while (0)
+static const StgBool use_read_phase = FALSE;
+
+/* No-op: the uniproc build assumes callers serialise STM entry. */
+static void lock_stm(StgTRecHeader *trec STG_UNUSED) {
+  TRACE("%p : lock_stm()\n", trec);
+}
+
+/* No-op counterpart of lock_stm() in the uniproc build. */
+static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
+  TRACE("%p : unlock_stm()\n", trec);
+}
+
+/* "Lock" a TVar: no real locking is needed in the uniproc build, so
+ * just return the TVar's current value. */
+static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED,
+                             StgTVar *s STG_UNUSED) {
+  StgClosure *result;
+  TRACE("%p : lock_tvar(%p)\n", trec, s);
+  result = s -> current_value;
+  return result;
+}
+
+/* "Unlock" a TVar: no lock is held in the uniproc build, so just store
+ * the new value c when force_update is set (i.e. when committing). */
+static void unlock_tvar(StgTRecHeader *trec STG_UNUSED,
+                        StgTVar *s STG_UNUSED,
+                        StgClosure *c,
+                        StgBool force_update) {
+  TRACE("%p : unlock_tvar(%p)\n", trec, s);
+  if (force_update) {
+    s -> current_value = c;
+  }
+}
+
+/* "Conditionally lock" a TVar: succeeds iff it still holds the expected
+ * value (no real locking in the uniproc build). */
+static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED,
+                              StgTVar *s STG_UNUSED,
+                              StgClosure *expected) {
+  StgClosure *result;
+  TRACE("%p : cond_lock_tvar(%p, %p)\n", trec, s, expected);
+  result = s -> current_value;
+  TRACE("%p : %s\n", trec, (result == expected) ? "success" : "failure");
+  return (result == expected);
+}
+#endif
+
+#if defined(STM_CG_LOCK) /*........................................*/
+
+#undef IF_STM_CG_LOCK
+#define IF_STM_CG_LOCK(__X) do { __X } while (0)
+static const StgBool use_read_phase = FALSE;
+static volatile StgTRecHeader *smp_locked = NULL;
+
+/* Acquire the single coarse-grained STM lock by spinning until we can
+ * CAS our TRec into smp_locked. */
+static void lock_stm(StgTRecHeader *trec) {
+  while (cas(&smp_locked, NULL, trec) != NULL) { }
+  TRACE("%p : lock_stm()\n", trec);
+}
+
+/* Release the coarse-grained STM lock; the caller must be the holder. */
+static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
+  TRACE("%p : unlock_stm()\n", trec);
+  ASSERT (smp_locked == trec);
+  smp_locked = 0;
+}
+
+/* Read a TVar's value.  The coarse-grained STM lock (which we must
+ * already hold) excludes concurrent writers, so no per-TVar lock is
+ * taken. */
+static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED,
+                             StgTVar *s STG_UNUSED) {
+  StgClosure *result;
+  TRACE("%p : lock_tvar(%p)\n", trec, s);
+  ASSERT (smp_locked == trec);
+  result = s -> current_value;
+  return result;
+}
+
+/* "Unlock" a TVar under coarse-grained locking: the STM lock (which we
+ * must hold) is the only real lock, so just store the new value c when
+ * force_update is set (i.e. when committing).
+ *
+ * Fix: this variant was declared "void *" but returned nothing --
+ * falling off the end of a non-void function -- and disagreed with the
+ * "void" STM_UNIPROC and STM_FG_LOCKS variants; it is now void. */
+static void unlock_tvar(StgTRecHeader *trec STG_UNUSED,
+                        StgTVar *s STG_UNUSED,
+                        StgClosure *c,
+                        StgBool force_update) {
+  TRACE("%p : unlock_tvar(%p, %p)\n", trec, s, c);
+  ASSERT (smp_locked == trec);
+  if (force_update) {
+    s -> current_value = c;
+  }
+}
+
+/* "Conditionally lock" a TVar under coarse-grained locking: succeeds
+ * iff the TVar still holds the expected value (the STM lock, which we
+ * must hold, excludes concurrent writers).
+ *
+ * Fix: the result TRACE used format "%p : %d" with a single string
+ * argument and no trec; it now matches the other variants
+ * ("%p : %s", trec, ...) and reports success iff the value matched. */
+static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED,
+                              StgTVar *s STG_UNUSED,
+                              StgClosure *expected) {
+  StgClosure *result;
+  TRACE("%p : cond_lock_tvar(%p, %p)\n", trec, s, expected);
+  ASSERT (smp_locked == trec);
+  result = s -> current_value;
+  TRACE("%p : %s\n", trec, (result == expected) ? "success" : "failure");
+  return (result == expected);
+}
+#endif
+
+#if defined(STM_FG_LOCKS) /*...................................*/
+
+// Fine-grained locking: individual TVars are locked by CASing the trec
+// header into their current_value field; commits use a read phase.
+#undef IF_STM_FG_LOCKS
+#define IF_STM_FG_LOCKS(__X) do { __X } while (0)
+static const StgBool use_read_phase = TRUE;
+
+// Fine-grained build: there is no global STM lock, so this is a no-op.
+static void lock_stm(StgTRecHeader *trec STG_UNUSED) {
+  TRACE("%p : lock_stm()\n", trec);
+}
+
+// Fine-grained build: no global STM lock to release; trace only.
+static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
+  TRACE("%p : unlock_stm()\n", trec);
+}
+
+// Lock a TVar by CASing our trec header into its current_value field,
+// returning the value it held.  Spins while another trec holds the lock
+// (detected because the stored value is then a TREC_HEADER closure).
+static StgClosure *lock_tvar(StgTRecHeader *trec,
+                             StgTVar *s STG_UNUSED) {
+  StgClosure *result;
+  TRACE("%p : lock_tvar(%p)\n", trec, s);
+  do {
+    do {
+      result = s -> current_value;
+    } while (GET_INFO(result) == &stg_TREC_HEADER_info);
+  } while (cas(&(s -> current_value), result, trec) != result);
+  return result;
+}
+
+// Release a TVar lock by storing c back into current_value; the caller
+// must currently hold the lock (current_value == trec).
+static void unlock_tvar(StgTRecHeader *trec STG_UNUSED,
+                        StgTVar *s,
+                        StgClosure *c,
+                        StgBool force_update STG_UNUSED) {
+  TRACE("%p : unlock_tvar(%p, %p)\n", trec, s, c);
+  ASSERT(s -> current_value == trec);
+  s -> current_value = c;
+}
+
+// Attempt to lock the TVar only if it still holds "expected"; a single
+// CAS does the check and the acquisition atomically.
+static StgBool cond_lock_tvar(StgTRecHeader *trec,
+                              StgTVar *s,
+                              StgClosure *expected) {
+  StgClosure *result;
+  TRACE("%p : cond_lock_tvar(%p, %p)\n", trec, s, expected);
+  result = cas(&(s -> current_value), expected, trec);
+  // NOTE(review): the trace tests result's truthiness, while success is
+  // really (result == expected) as returned below — confirm intended.
+  TRACE("%p : %s\n", trec, result ? "success" : "failure");
+  return (result == expected);
+}
+#endif
+
+/*......................................................................*/
+
+// Helper functions for thread blocking and unblocking
+
+// Mark a (currently runnable) TSO as blocked on STM so the scheduler will
+// not run it until unpark_tso wakes it.
+static void park_tso(StgTSO *tso) {
+  ASSERT(tso -> why_blocked == NotBlocked);
+  tso -> why_blocked = BlockedOnSTM;
+  tso -> block_info.closure = (StgClosure *) END_TSO_QUEUE;
+  TRACE("park_tso on tso=%p\n", tso);
+}
+
+// Wake a TSO parked by park_tso; a spurious call on a TSO that is no
+// longer BlockedOnSTM is harmless and only traced.
+static void unpark_tso(Capability *cap, StgTSO *tso) {
+  // We will continue unparking threads while they remain on one of the wait
+  // queues: it's up to the thread itself to remove it from the wait queues
+  // if it decides to do so when it is scheduled.
+  if (tso -> why_blocked == BlockedOnSTM) {
+    TRACE("unpark_tso on tso=%p\n", tso);
+    unblockOne(cap,tso);
+  } else {
+    TRACE("spurious unpark_tso on tso=%p\n", tso);
+  }
+}
+
+// Wake every TSO on the given TVar's wait queue (called after the TVar
+// has been updated by a committing transaction).
+static void unpark_waiters_on(Capability *cap, StgTVar *s) {
+  StgTVarWaitQueue *q;
+  TRACE("unpark_waiters_on tvar=%p\n", s);
+  for (q = s -> first_wait_queue_entry;
+       q != END_STM_WAIT_QUEUE;
+       q = q -> next_queue_entry) {
+    unpark_tso(cap, q -> waiting_tso);
+  }
+}
+
+/*......................................................................*/
+
+// Helper functions for downstream allocation and initialization
+
+// Allocate a fresh wait-queue entry on cap's heap for waiting_tso.
+// Queue links are left for the caller to fill in.
+static StgTVarWaitQueue *new_stg_tvar_wait_queue(Capability *cap,
+                                                 StgTSO *waiting_tso) {
+  StgTVarWaitQueue *result;
+  result = (StgTVarWaitQueue *)allocateLocal(cap, sizeofW(StgTVarWaitQueue));
+  SET_HDR (result, &stg_TVAR_WAIT_QUEUE_info, CCS_SYSTEM);
+  result -> waiting_tso = waiting_tso;
+  return result;
+}
+
+// Allocate an empty TRec entry chunk (no predecessor, zero entries used).
+static StgTRecChunk *new_stg_trec_chunk(Capability *cap) {
+  StgTRecChunk *result;
+  result = (StgTRecChunk *)allocateLocal(cap, sizeofW(StgTRecChunk));
+  SET_HDR (result, &stg_TREC_CHUNK_info, CCS_SYSTEM);
+  result -> prev_chunk = END_STM_CHUNK_LIST;
+  result -> next_entry_idx = 0;
+  return result;
+}
+
+// Allocate a fresh transaction record nested inside enclosing_trec
+// (NO_TREC for a top-level transaction).  A nested trec inherits the
+// enclosing state so a condemned parent condemns its children.
+static StgTRecHeader *new_stg_trec_header(Capability *cap,
+                                          StgTRecHeader *enclosing_trec) {
+  StgTRecHeader *result;
+  result = (StgTRecHeader *) allocateLocal(cap, sizeofW(StgTRecHeader));
+  SET_HDR (result, &stg_TREC_HEADER_info, CCS_SYSTEM);
+
+  result -> enclosing_trec = enclosing_trec;
+  result -> current_chunk = new_stg_trec_chunk(cap);
+
+  if (enclosing_trec == NO_TREC) {
+    result -> state = TREC_ACTIVE;
+  } else {
+    ASSERT(enclosing_trec -> state == TREC_ACTIVE ||
+           enclosing_trec -> state == TREC_CONDEMNED);
+    result -> state = enclosing_trec -> state;
+  }
+
+  return result;
+}
+
+/*......................................................................*/
+
+// Allocation / deallocation functions that retain per-capability lists
+// of closures that can be re-used
+
+// Get a wait-queue entry, preferring cap's free list over heap allocation.
+static StgTVarWaitQueue *alloc_stg_tvar_wait_queue(Capability *cap,
+                                                   StgTSO *waiting_tso) {
+  StgTVarWaitQueue *result = NULL;
+  if (cap -> free_tvar_wait_queues == END_STM_WAIT_QUEUE) {
+    result = new_stg_tvar_wait_queue(cap, waiting_tso);
+  } else {
+    result = cap -> free_tvar_wait_queues;
+    result -> waiting_tso = waiting_tso;
+    cap -> free_tvar_wait_queues = result -> next_queue_entry;
+  }
+  return result;
+}
+
+// Return a wait-queue entry to cap's free list (or simply drop it for the
+// GC to reclaim when REUSE_MEMORY is off).
+static void free_stg_tvar_wait_queue(Capability *cap,
+                                     StgTVarWaitQueue *wq) {
+#if defined(REUSE_MEMORY)
+  wq -> next_queue_entry = cap -> free_tvar_wait_queues;
+  cap -> free_tvar_wait_queues = wq;
+#endif
+}
+
+// Get a TRec chunk, reusing one from cap's free list when possible;
+// a reused chunk is reset to empty before being handed out.
+static StgTRecChunk *alloc_stg_trec_chunk(Capability *cap) {
+  StgTRecChunk *result = NULL;
+  if (cap -> free_trec_chunks == END_STM_CHUNK_LIST) {
+    result = new_stg_trec_chunk(cap);
+  } else {
+    result = cap -> free_trec_chunks;
+    cap -> free_trec_chunks = result -> prev_chunk;
+    result -> prev_chunk = END_STM_CHUNK_LIST;
+    result -> next_entry_idx = 0;
+  }
+  return result;
+}
+
+// Return a TRec chunk to cap's free list (linked via prev_chunk).
+static void free_stg_trec_chunk(Capability *cap,
+                                StgTRecChunk *c) {
+#if defined(REUSE_MEMORY)
+  c -> prev_chunk = cap -> free_trec_chunks;
+  cap -> free_trec_chunks = c;
+#endif
+}
+
+// Get a transaction record, reusing one from cap's free list when
+// possible.  A reused header keeps its (already-emptied) current chunk
+// and is re-initialized exactly as new_stg_trec_header would.
+static StgTRecHeader *alloc_stg_trec_header(Capability *cap,
+                                            StgTRecHeader *enclosing_trec) {
+  StgTRecHeader *result = NULL;
+  if (cap -> free_trec_headers == NO_TREC) {
+    result = new_stg_trec_header(cap, enclosing_trec);
+  } else {
+    result = cap -> free_trec_headers;
+    cap -> free_trec_headers = result -> enclosing_trec;
+    result -> enclosing_trec = enclosing_trec;
+    result -> current_chunk -> next_entry_idx = 0;
+    if (enclosing_trec == NO_TREC) {
+      result -> state = TREC_ACTIVE;
+    } else {
+      ASSERT(enclosing_trec -> state == TREC_ACTIVE ||
+             enclosing_trec -> state == TREC_CONDEMNED);
+      result -> state = enclosing_trec -> state;
+    }
+  }
+  return result;
+}
+
+// Return a trec to cap's free list: all chunks except the current one go
+// to the chunk free list, and the header itself is linked via its
+// (now unused) enclosing_trec field.
+static void free_stg_trec_header(Capability *cap,
+                                 StgTRecHeader *trec) {
+#if defined(REUSE_MEMORY)
+  StgTRecChunk *chunk = trec -> current_chunk -> prev_chunk;
+  while (chunk != END_STM_CHUNK_LIST) {
+    StgTRecChunk *prev_chunk = chunk -> prev_chunk;
+    free_stg_trec_chunk(cap, chunk);
+    chunk = prev_chunk;
+  }
+  trec -> current_chunk -> prev_chunk = END_STM_CHUNK_LIST;
+  trec -> enclosing_trec = cap -> free_trec_headers;
+  cap -> free_trec_headers = trec;
+#endif
+}
+
+/*......................................................................*/
+
+// Helper functions for managing waiting lists
+
+// Add tso to the head of the wait queue of every TVar read by trec,
+// stashing each queue entry in the TRec entry's new_value field so
+// remove_wait_queue_entries_for_trec can find it again.
+static void build_wait_queue_entries_for_trec(Capability *cap,
+                                              StgTSO *tso,
+                                              StgTRecHeader *trec) {
+  ASSERT(trec != NO_TREC);
+  ASSERT(trec -> enclosing_trec == NO_TREC);
+  ASSERT(trec -> state == TREC_ACTIVE);
+
+  TRACE("%p : build_wait_queue_entries_for_trec()\n", trec);
+
+  FOR_EACH_ENTRY(trec, e, {
+    StgTVar *s;
+    StgTVarWaitQueue *q;
+    StgTVarWaitQueue *fq;
+    s = e -> tvar;
+    TRACE("%p : adding tso=%p to wait queue for tvar=%p\n", trec, tso, s);
+    ACQ_ASSERT(s -> current_value == trec);
+    NACQ_ASSERT(s -> current_value == e -> expected_value);
+    fq = s -> first_wait_queue_entry;
+    q = alloc_stg_tvar_wait_queue(cap, tso);
+    // Link the new entry at the head of the doubly-linked wait queue.
+    q -> next_queue_entry = fq;
+    q -> prev_queue_entry = END_STM_WAIT_QUEUE;
+    if (fq != END_STM_WAIT_QUEUE) {
+      fq -> prev_queue_entry = q;
+    }
+    s -> first_wait_queue_entry = q;
+    e -> new_value = (StgClosure *) q;
+  });
+}
+
+// Undo build_wait_queue_entries_for_trec: unlink and free the wait-queue
+// entry (recovered from each TRec entry's new_value field) from every
+// TVar's queue, locking each TVar around the unlink.
+static void remove_wait_queue_entries_for_trec(Capability *cap,
+                                               StgTRecHeader *trec) {
+  ASSERT(trec != NO_TREC);
+  ASSERT(trec -> enclosing_trec == NO_TREC);
+  ASSERT(trec -> state == TREC_WAITING ||
+         trec -> state == TREC_CONDEMNED);
+
+  TRACE("%p : remove_wait_queue_entries_for_trec()\n", trec);
+
+  FOR_EACH_ENTRY(trec, e, {
+    StgTVar *s;
+    StgTVarWaitQueue *pq;
+    StgTVarWaitQueue *nq;
+    StgTVarWaitQueue *q;
+    s = e -> tvar;
+    StgClosure *saw = lock_tvar(trec, s);
+    q = (StgTVarWaitQueue *) (e -> new_value);
+    TRACE("%p : removing tso=%p from wait queue for tvar=%p\n", trec, q -> waiting_tso, s);
+    ACQ_ASSERT(s -> current_value == trec);
+    nq = q -> next_queue_entry;
+    pq = q -> prev_queue_entry;
+    // Splice q out of the doubly-linked queue, updating the head pointer
+    // when q was the first entry.
+    if (nq != END_STM_WAIT_QUEUE) {
+      nq -> prev_queue_entry = pq;
+    }
+    if (pq != END_STM_WAIT_QUEUE) {
+      pq -> next_queue_entry = nq;
+    } else {
+      ASSERT (s -> first_wait_queue_entry == q);
+      s -> first_wait_queue_entry = nq;
+    }
+    free_stg_tvar_wait_queue(cap, q);
+    unlock_tvar(trec, s, saw, FALSE);
+  });
+}
+
+/*......................................................................*/
+
+// Reserve the next free TRecEntry slot in t's current chunk, chaining on
+// a fresh chunk when the current one is full.
+static TRecEntry *get_new_entry(Capability *cap,
+                                StgTRecHeader *t) {
+  TRecEntry *result;
+  StgTRecChunk *c;
+  int i;
+
+  c = t -> current_chunk;
+  i = c -> next_entry_idx;
+  ASSERT(c != END_STM_CHUNK_LIST);
+
+  if (i < TREC_CHUNK_NUM_ENTRIES) {
+    // Continue to use current chunk
+    result = &(c -> entries[i]);
+    c -> next_entry_idx ++;
+  } else {
+    // Current chunk is full: allocate a fresh one
+    StgTRecChunk *nc;
+    nc = alloc_stg_trec_chunk(cap);
+    nc -> prev_chunk = c;
+    nc -> next_entry_idx = 1;
+    t -> current_chunk = nc;
+    result = &(nc -> entries[0]);
+  }
+
+  return result;
+}
+
+/*......................................................................*/
+
+// Merge an (expected_value -> new_value) update for tvar into trec t,
+// used when committing a nested transaction into its parent.  If t
+// already has an entry for tvar whose expected value disagrees, the two
+// transactions saw inconsistent snapshots and t is condemned.
+static void merge_update_into(Capability *cap,
+                              StgTRecHeader *t,
+                              StgTVar *tvar,
+                              StgClosure *expected_value,
+                              StgClosure *new_value) {
+  int found;
+
+  // Look for an entry in this trec
+  found = FALSE;
+  FOR_EACH_ENTRY(t, e, {
+    StgTVar *s;
+    s = e -> tvar;
+    if (s == tvar) {
+      found = TRUE;
+      if (e -> expected_value != expected_value) {
+        // Must abort if the two entries start from different values
+        TRACE("%p : entries inconsistent at %p (%p vs %p)\n",
+              t, tvar, e -> expected_value, expected_value);
+        t -> state = TREC_CONDEMNED;
+      }
+      e -> new_value = new_value;
+      BREAK_FOR_EACH;
+    }
+  });
+
+  if (!found) {
+    // No entry so far in this trec
+    TRecEntry *ne;
+    ne = get_new_entry(cap, t);
+    ne -> tvar = tvar;
+    ne -> expected_value = expected_value;
+    ne -> new_value = new_value;
+  }
+}
+
+/*......................................................................*/
+
+// An entry is an update iff the value to be written back differs from
+// the value that was originally read.
+static StgBool entry_is_update(TRecEntry *e) {
+  return (e -> expected_value != e -> new_value);
+}
+
+#if defined(STM_FG_LOCKS)
+// A read-only entry leaves the TVar's value unchanged.
+static StgBool entry_is_read_only(TRecEntry *e) {
+  return (e -> expected_value == e -> new_value);
+}
+
+// A TVar is locked by trec h when its current_value field holds the trec
+// header itself in place of a real value.
+static StgBool tvar_is_locked(StgTVar *s, StgTRecHeader *h) {
+  return (s -> current_value == (StgClosure *) h);
+}
+#endif
+
+// revert_ownership : release a lock on a TVar, storing back
+// the value that it held when the lock was acquired. "revert_all"
+// is set in stmWait and stmReWait when we acquired locks on all of
+// the TVars involved. "revert_all" is not set in commit operations
+// where we don't lock TVars that have been read from but not updated.
+
+static void revert_ownership(StgTRecHeader *trec STG_UNUSED,
+                             StgBool revert_all STG_UNUSED) {
+#if defined(STM_FG_LOCKS)
+  FOR_EACH_ENTRY(trec, e, {
+    // Only update entries hold locks unless revert_all says every entry
+    // was locked (the stmWait / stmReWait case).
+    if (revert_all || entry_is_update(e)) {
+      StgTVar *s;
+      s = e -> tvar;
+      if (tvar_is_locked(s, trec)) {
+        // Restore the pre-lock value, releasing the lock.
+        unlock_tvar(trec, s, e -> expected_value, TRUE);
+      }
+    }
+  });
+#endif
+}
+
+/*......................................................................*/
+
+// validate_and_acquire_ownership : this performs the twin functions
+// of checking that the TVars referred to by entries in trec hold the
+// expected values and:
+//
+// - locking the TVar (on updated TVars during commit, or all TVars
+// during wait)
+//
+// - recording the identity of the TRec who wrote the value seen in the
+// TVar (on non-updated TVars during commit). These values are
+// stashed in the TRec entries and are then checked in check_read_only
+// to ensure that an atomic snapshot of all of these locations has been
+// seen.
+
+static StgBool validate_and_acquire_ownership (StgTRecHeader *trec,
+                                               int acquire_all,
+                                               int retain_ownership) {
+  StgBool result;
+
+  if (shake()) {
+    TRACE("%p : shake, pretending trec is invalid when it may not be\n", trec);
+    return FALSE;
+  }
+
+  ASSERT ((trec -> state == TREC_ACTIVE) ||
+          (trec -> state == TREC_WAITING) ||
+          (trec -> state == TREC_CONDEMNED));
+  // A condemned trec is invalid by definition.
+  result = !((trec -> state) == TREC_CONDEMNED);
+  if (result) {
+    FOR_EACH_ENTRY(trec, e, {
+      StgTVar *s;
+      s = e -> tvar;
+      if (acquire_all || entry_is_update(e)) {
+        // Updated TVars (or all TVars, during wait) must be locked.
+        TRACE("%p : trying to acquire %p\n", trec, s);
+        if (!cond_lock_tvar(trec, s, e -> expected_value)) {
+          TRACE("%p : failed to acquire %p\n", trec, s);
+          result = FALSE;
+          BREAK_FOR_EACH;
+        }
+      } else {
+        // Read-only entry: record the TVar's version for the later
+        // read-phase check instead of locking it.
+        ASSERT(use_read_phase);
+        IF_STM_FG_LOCKS({
+          TRACE("%p : will need to check %p\n", trec, s);
+          if (s -> current_value != e -> expected_value) {
+            TRACE("%p : doesn't match\n", trec);
+            result = FALSE;
+            BREAK_FOR_EACH;
+          }
+          e -> num_updates = s -> num_updates;
+          // Re-check the value after reading num_updates to detect a
+          // racing commit between the two reads.
+          if (s -> current_value != e -> expected_value) {
+            TRACE("%p : doesn't match (race)\n", trec);
+            result = FALSE;
+            BREAK_FOR_EACH;
+          } else {
+            TRACE("%p : need to check version %d\n", trec, e -> num_updates);
+          }
+        });
+      }
+    });
+  }
+
+  if ((!result) || (!retain_ownership)) {
+    revert_ownership(trec, acquire_all);
+  }
+
+  return result;
+}
+
+// check_read_only : check that we've seen an atomic snapshot of the
+// non-updated TVars accessed by a trec. This checks that the last TRec to
+// commit an update to the TVar is unchanged since the value was stashed in
+// validate_and_acquire_ownership. If no update is seen to any TVar then
+// all of them contained their expected values at the start of the call to
+// check_read_only.
+//
+// The paper "Concurrent programming without locks" (under submission), or
+// Keir Fraser's PhD dissertation "Practical lock-free programming" discuss
+// this kind of algorithm.
+
+static StgBool check_read_only(StgTRecHeader *trec STG_UNUSED) {
+  StgBool result = TRUE;
+
+  ASSERT (use_read_phase);
+  IF_STM_FG_LOCKS({
+    FOR_EACH_ENTRY(trec, e, {
+      StgTVar *s;
+      s = e -> tvar;
+      if (entry_is_read_only(e)) {
+        TRACE("%p : check_read_only for TVar %p, saw %d\n", trec, s, e -> num_updates);
+        // The TVar is valid iff its version is unchanged since we stashed
+        // it in validate_and_acquire_ownership.
+        if (s -> num_updates != e -> num_updates) {
+          // ||s -> current_value != e -> expected_value) {
+          TRACE("%p : mismatch\n", trec);
+          result = FALSE;
+          BREAK_FOR_EACH;
+        }
+      }
+    });
+  });
+
+  return result;
+}
+
+
+/************************************************************************/
+
+// Called before GC: drop every capability's STM free lists so the
+// recycled closures they hold can be collected.
+void stmPreGCHook() {
+  nat i;
+
+  lock_stm(NO_TREC);
+  TRACE("stmPreGCHook\n");
+  for (i = 0; i < n_capabilities; i ++) {
+    Capability *cap = &capabilities[i];
+    cap -> free_tvar_wait_queues = END_STM_WAIT_QUEUE;
+    cap -> free_trec_chunks = END_STM_CHUNK_LIST;
+    cap -> free_trec_headers = NO_TREC;
+  }
+  unlock_stm(NO_TREC);
+}
+
+/************************************************************************/
+
+// check_read_only relies on version numbers held in TVars' "num_updates"
+// fields not wrapping around while a transaction is committed. The version
+// number is incremented each time an update is committed to the TVar
+// This is unlikely to wrap around when 32-bit integers are used for the counts,
+// but to ensure correctness we maintain a shared count on the maximum
+// number of commit operations that may occur and check that this has
+// not increased by more than 2^32 during a commit.
+
+// Tokens are handed out to capabilities in batches to keep contention on
+// max_commits low.
+#define TOKEN_BATCH_SIZE 1024
+
+// Upper bound on the number of commits that may have started (see above).
+static volatile StgInt64 max_commits = 0;
+
+// Spin lock guarding max_commits updates in getTokenBatch.
+static volatile StgBool token_locked = FALSE;
+
+#if defined(THREADED_RTS)
+// Refill cap's token supply, bumping the shared max_commits bound under
+// the token spin lock.
+static void getTokenBatch(Capability *cap) {
+  while (cas(&token_locked, FALSE, TRUE) == TRUE) { /* nothing */ }
+  max_commits += TOKEN_BATCH_SIZE;
+  cap -> transaction_tokens = TOKEN_BATCH_SIZE;
+  token_locked = FALSE;
+}
+
+// Consume one commit token from cap, fetching a new batch when empty.
+static void getToken(Capability *cap) {
+  if (cap -> transaction_tokens == 0) {
+    getTokenBatch(cap);
+  }
+  cap -> transaction_tokens --;
+}
+#else
+// Non-threaded build: commit counting is unnecessary, so this is a no-op.
+static void getToken(Capability *cap STG_UNUSED) {
+  // Nothing
+}
+#endif
+
+/*......................................................................*/
+
+// Begin a transaction nested inside "outer" (NO_TREC for top level),
+// returning its freshly-allocated (or recycled) transaction record.
+StgTRecHeader *stmStartTransaction(Capability *cap,
+                                   StgTRecHeader *outer) {
+  StgTRecHeader *t;
+  TRACE("%p : stmStartTransaction with %d tokens\n",
+        outer,
+        cap -> transaction_tokens);
+
+  getToken(cap);
+
+  t = alloc_stg_trec_header(cap, outer);
+  TRACE("%p : stmStartTransaction()=%p\n", outer, t);
+  return t;
+}
+
+/*......................................................................*/
+
+// Abandon a transaction: remove any wait-queue entries it installed while
+// waiting, mark it aborted, and recycle its record.
+void stmAbortTransaction(Capability *cap,
+                         StgTRecHeader *trec) {
+  TRACE("%p : stmAbortTransaction\n", trec);
+  ASSERT (trec != NO_TREC);
+  ASSERT ((trec -> state == TREC_ACTIVE) ||
+          (trec -> state == TREC_WAITING) ||
+          (trec -> state == TREC_CONDEMNED));
+
+  lock_stm(trec);
+  if (trec -> state == TREC_WAITING) {
+    ASSERT (trec -> enclosing_trec == NO_TREC);
+    TRACE("%p : stmAbortTransaction aborting waiting transaction\n", trec);
+    remove_wait_queue_entries_for_trec(cap, trec);
+  }
+  trec -> state = TREC_ABORTED;
+  unlock_stm(trec);
+
+  free_stg_trec_header(cap, trec);
+
+  TRACE("%p : stmAbortTransaction done\n", trec);
+}
+
+/*......................................................................*/
+
+// Mark a transaction as condemned (it will fail validation and be
+// retried); unlike stmAbortTransaction, the record is NOT freed because
+// the transaction is still running.
+void stmCondemnTransaction(Capability *cap,
+                           StgTRecHeader *trec) {
+  TRACE("%p : stmCondemnTransaction\n", trec);
+  ASSERT (trec != NO_TREC);
+  ASSERT ((trec -> state == TREC_ACTIVE) ||
+          (trec -> state == TREC_WAITING) ||
+          (trec -> state == TREC_CONDEMNED));
+
+  lock_stm(trec);
+  if (trec -> state == TREC_WAITING) {
+    ASSERT (trec -> enclosing_trec == NO_TREC);
+    TRACE("%p : stmCondemnTransaction condemning waiting transaction\n", trec);
+    remove_wait_queue_entries_for_trec(cap, trec);
+  }
+  trec -> state = TREC_CONDEMNED;
+  unlock_stm(trec);
+
+  TRACE("%p : stmCondemnTransaction done\n", trec);
+}
+
+/*......................................................................*/
+
+// Return the transaction record enclosing trec (NO_TREC at top level).
+StgTRecHeader *stmGetEnclosingTRec(StgTRecHeader *trec) {
+  StgTRecHeader *outer;
+  TRACE("%p : stmGetEnclosingTRec\n", trec);
+  outer = trec -> enclosing_trec;
+  TRACE("%p : stmGetEnclosingTRec()=%p\n", trec, outer);
+  return outer;
+}
+
+/*......................................................................*/
+
+// Validate trec and every transaction enclosing it, condemning trec when
+// validation fails (unless it is waiting, in which case the caller
+// handles removal from the wait queues).
+StgBool stmValidateNestOfTransactions(StgTRecHeader *trec) {
+  StgTRecHeader *t;
+  StgBool result;
+
+  TRACE("%p : stmValidateNestOfTransactions\n", trec);
+  ASSERT(trec != NO_TREC);
+  ASSERT((trec -> state == TREC_ACTIVE) ||
+         (trec -> state == TREC_WAITING) ||
+         (trec -> state == TREC_CONDEMNED));
+
+  lock_stm(trec);
+
+  t = trec;
+  result = TRUE;
+  while (t != NO_TREC) {
+    // Ownership is not retained: this is only a validity probe.
+    result &= validate_and_acquire_ownership(t, TRUE, FALSE);
+    t = t -> enclosing_trec;
+  }
+
+  if (!result && trec -> state != TREC_WAITING) {
+    trec -> state = TREC_CONDEMNED;
+  }
+
+  unlock_stm(trec);
+
+  TRACE("%p : stmValidateNestOfTransactions()=%d\n", trec, result);
+  return result;
+}
+
+/*......................................................................*/
+
+// Attempt to commit a top-level transaction: acquire/validate the TVars,
+// run the read-phase check (fine-grained builds), then write back the
+// updates and wake waiters.  Returns FALSE when the caller must retry.
+StgBool stmCommitTransaction(Capability *cap, StgTRecHeader *trec) {
+  int result;
+  StgInt64 max_commits_at_start = max_commits;
+
+  TRACE("%p : stmCommitTransaction()\n", trec);
+  ASSERT (trec != NO_TREC);
+
+  lock_stm(trec);
+
+  ASSERT (trec -> enclosing_trec == NO_TREC);
+  ASSERT ((trec -> state == TREC_ACTIVE) ||
+          (trec -> state == TREC_CONDEMNED));
+
+  result = validate_and_acquire_ownership(trec, (!use_read_phase), TRUE);
+  if (result) {
+    // We now know that all the updated locations hold their expected values.
+    ASSERT (trec -> state == TREC_ACTIVE);
+
+    if (use_read_phase) {
+      TRACE("%p : doing read check\n", trec);
+      result = check_read_only(trec);
+      TRACE("%p : read-check %s\n", trec, result ? "succeeded" : "failed");
+
+      // Guard against num_updates wrap-around: if more than 2^32 commits
+      // could have happened during our read phase, fail conservatively.
+      StgInt64 max_commits_at_end = max_commits;
+      StgInt64 max_concurrent_commits;
+      max_concurrent_commits = ((max_commits_at_end - max_commits_at_start) +
+                                (n_capabilities * TOKEN_BATCH_SIZE));
+      if (((max_concurrent_commits >> 32) > 0) || shake()) {
+        result = FALSE;
+      }
+    }
+
+    if (result) {
+      // We now know that all of the read-only locations held their expected values
+      // at the end of the call to validate_and_acquire_ownership.  This forms the
+      // linearization point of the commit.
+
+      FOR_EACH_ENTRY(trec, e, {
+        StgTVar *s;
+        s = e -> tvar;
+        if (e -> new_value != e -> expected_value) {
+          // Entry is an update: write the value back to the TVar, unlocking it if
+          // necessary.
+
+          ACQ_ASSERT(tvar_is_locked(s, trec));
+          TRACE("%p : writing %p to %p, waking waiters\n", trec, e -> new_value, s);
+          unpark_waiters_on(cap,s);
+          IF_STM_FG_LOCKS({
+            s -> num_updates ++;
+          });
+          unlock_tvar(trec, s, e -> new_value, TRUE);
+        }
+        ACQ_ASSERT(!tvar_is_locked(s, trec));
+      });
+    } else {
+      revert_ownership(trec, FALSE);
+    }
+  }
+
+  unlock_stm(trec);
+
+  free_stg_trec_header(cap, trec);
+
+  TRACE("%p : stmCommitTransaction()=%d\n", trec, result);
+
+  return result;
+}
+
+/*......................................................................*/
+
+// Attempt to commit a nested transaction into its enclosing transaction:
+// validate/acquire the TVars it updated, run the read-phase check, then
+// merge every entry into the parent trec, releasing any locks held.
+// Returns FALSE when the caller must retry.
+// Fix: the original nested a second "if (result)" inside the first, which
+// made the "else { revert_ownership(trec, FALSE); }" branch unreachable —
+// a failed read check left updated TVars locked.  The structure now
+// mirrors stmCommitTransaction, so ownership is reverted on failure.
+StgBool stmCommitNestedTransaction(Capability *cap, StgTRecHeader *trec) {
+  StgTRecHeader *et;
+  int result;
+  ASSERT (trec != NO_TREC && trec -> enclosing_trec != NO_TREC);
+  TRACE("%p : stmCommitNestedTransaction() into %p\n", trec, trec -> enclosing_trec);
+  ASSERT ((trec -> state == TREC_ACTIVE) || (trec -> state == TREC_CONDEMNED));
+
+  lock_stm(trec);
+
+  et = trec -> enclosing_trec;
+  result = validate_and_acquire_ownership(trec, (!use_read_phase), TRUE);
+  if (result) {
+    // We now know that all the updated locations hold their expected values.
+
+    if (use_read_phase) {
+      TRACE("%p : doing read check\n", trec);
+      result = check_read_only(trec);
+    }
+    if (result) {
+      // We now know that all of the read-only locations held their expected
+      // values at the end of the call to validate_and_acquire_ownership.
+      // This forms the linearization point of the commit.
+
+      TRACE("%p : read-check succeeded\n", trec);
+      FOR_EACH_ENTRY(trec, e, {
+        // Merge each entry into the enclosing transaction record, release all
+        // locks.
+
+        StgTVar *s;
+        s = e -> tvar;
+        if (entry_is_update(e)) {
+          unlock_tvar(trec, s, e -> expected_value, FALSE);
+        }
+        merge_update_into(cap, et, s, e -> expected_value, e -> new_value);
+        ACQ_ASSERT(s -> current_value != trec);
+      });
+    } else {
+      // Read check failed: release the locks we acquired above.
+      revert_ownership(trec, FALSE);
+    }
+  }
+
+  unlock_stm(trec);
+
+  free_stg_trec_header(cap, trec);
+
+  TRACE("%p : stmCommitNestedTransaction()=%d\n", trec, result);
+
+  return result;
+}
+
+/*......................................................................*/
+
+// Prepare a TSO to block in "retry": validate the transaction and, if
+// valid, enqueue the TSO on every TVar's wait queue and park it.  Locks
+// on the TVars are retained until stmWaitUnlock releases them.
+StgBool stmWait(Capability *cap, StgTSO *tso, StgTRecHeader *trec) {
+  int result;
+  TRACE("%p : stmWait(%p)\n", trec, tso);
+  ASSERT (trec != NO_TREC);
+  ASSERT (trec -> enclosing_trec == NO_TREC);
+  ASSERT ((trec -> state == TREC_ACTIVE) ||
+          (trec -> state == TREC_CONDEMNED));
+
+  lock_stm(trec);
+  result = validate_and_acquire_ownership(trec, TRUE, TRUE);
+  if (result) {
+    // The transaction is valid so far so we can actually start waiting.
+    // (Otherwise the transaction was not valid and the thread will have to
+    // retry it).
+
+    // Put ourselves to sleep.  We retain locks on all the TVars involved
+    // until we are sound asleep : (a) on the wait queues, (b) BlockedOnSTM
+    // in the TSO, (c) TREC_WAITING in the Trec.
+    build_wait_queue_entries_for_trec(cap, tso, trec);
+    park_tso(tso);
+    trec -> state = TREC_WAITING;
+
+    // We haven't released ownership of the transaction yet.  The TSO
+    // has been put on the wait queue for the TVars it is waiting for,
+    // but we haven't yet tidied up the TSO's stack and made it safe
+    // to wake up the TSO.  Therefore, we must wait until the TSO is
+    // safe to wake up before we release ownership - when all is well,
+    // the runtime will call stmWaitUnlock() below, with the same
+    // TRec.
+
+  } else {
+    unlock_stm(trec);
+    free_stg_trec_header(cap, trec);
+  }
+
+  TRACE("%p : stmWait(%p)=%d\n", trec, tso, result);
+  return result;
+}
+
+
+// Second half of stmWait: called once the TSO is safe to wake, releasing
+// the TVar locks retained by stmWait and the STM lock.
+void
+stmWaitUnlock(Capability *cap STG_UNUSED, StgTRecHeader *trec) {
+  revert_ownership(trec, TRUE);
+  unlock_stm(trec);
+}
+
+/*......................................................................*/
+
+// Called when a waiting TSO has been woken: re-validate its transaction.
+// If still valid it goes back to sleep (it is already on the wait
+// queues); otherwise it is removed from them and must retry.
+StgBool stmReWait(Capability *cap, StgTSO *tso) {
+  int result;
+  StgTRecHeader *trec = tso->trec;
+
+  TRACE("%p : stmReWait\n", trec);
+  ASSERT (trec != NO_TREC);
+  ASSERT (trec -> enclosing_trec == NO_TREC);
+  ASSERT ((trec -> state == TREC_WAITING) ||
+          (trec -> state == TREC_CONDEMNED));
+
+  lock_stm(trec);
+  result = validate_and_acquire_ownership(trec, TRUE, TRUE);
+  TRACE("%p : validation %s\n", trec, result ? "succeeded" : "failed");
+  if (result) {
+    // The transaction remains valid -- do nothing because it is already on
+    // the wait queues
+    ASSERT (trec -> state == TREC_WAITING);
+    park_tso(tso);
+    revert_ownership(trec, TRUE);
+  } else {
+    // The transaction has become invalid.  We can now remove it from the wait
+    // queues.
+    if (trec -> state != TREC_CONDEMNED) {
+      remove_wait_queue_entries_for_trec (cap, trec);
+    }
+    free_stg_trec_header(cap, trec);
+  }
+  unlock_stm(trec);
+
+  TRACE("%p : stmReWait()=%d\n", trec, result);
+  return result;
+}
+
+/*......................................................................*/
+
+// Search trec and then its enclosing transactions for an entry on tvar.
+// Returns the entry (or NULL) and, via *in when non-NULL, the trec in
+// which the entry was found.
+static TRecEntry *get_entry_for(StgTRecHeader *trec, StgTVar *tvar, StgTRecHeader **in) {
+  TRecEntry *result = NULL;
+
+  TRACE("%p : get_entry_for TVar %p\n", trec, tvar);
+  ASSERT(trec != NO_TREC);
+
+  do {
+    FOR_EACH_ENTRY(trec, e, {
+      if (e -> tvar == tvar) {
+        result = e;
+        if (in != NULL) {
+          *in = trec;
+        }
+        BREAK_FOR_EACH;
+      }
+    });
+    trec = trec -> enclosing_trec;
+  } while (result == NULL && trec != NO_TREC);
+
+  return result;
+}
+
+// Read a TVar's current value.  Under fine-grained locking the field may
+// transiently hold a locking trec's header; spin until a real value is
+// seen.
+static StgClosure *read_current_value(StgTRecHeader *trec STG_UNUSED, StgTVar *tvar) {
+  StgClosure *result;
+  result = tvar -> current_value;
+
+#if defined(STM_FG_LOCKS)
+  while (GET_INFO(result) == &stg_TREC_HEADER_info) {
+    TRACE("%p : read_current_value(%p) saw %p\n", trec, tvar, result);
+    result = tvar -> current_value;
+  }
+#endif
+
+  TRACE("%p : read_current_value(%p)=%p\n", trec, tvar, result);
+  return result;
+}
+
+/*......................................................................*/
+
+// Read tvar within trec, recording the access so it can be validated at
+// commit time.  Reuses an existing entry in this trec, copies one found
+// in an enclosing trec, or records the TVar's current value afresh.
+StgClosure *stmReadTVar(Capability *cap,
+                        StgTRecHeader *trec,
+                        StgTVar *tvar) {
+  StgTRecHeader *entry_in;
+  StgClosure *result = NULL;
+  TRecEntry *entry = NULL;
+  TRACE("%p : stmReadTVar(%p)\n", trec, tvar);
+  ASSERT (trec != NO_TREC);
+  ASSERT (trec -> state == TREC_ACTIVE ||
+          trec -> state == TREC_CONDEMNED);
+
+  entry = get_entry_for(trec, tvar, &entry_in);
+
+  if (entry != NULL) {
+    if (entry_in == trec) {
+      // Entry found in our trec
+      result = entry -> new_value;
+    } else {
+      // Entry found in another trec
+      TRecEntry *new_entry = get_new_entry(cap, trec);
+      new_entry -> tvar = tvar;
+      new_entry -> expected_value = entry -> expected_value;
+      new_entry -> new_value = entry -> new_value;
+      result = new_entry -> new_value;
+    }
+  } else {
+    // No entry found
+    StgClosure *current_value = read_current_value(trec, tvar);
+    TRecEntry *new_entry = get_new_entry(cap, trec);
+    new_entry -> tvar = tvar;
+    new_entry -> expected_value = current_value;
+    new_entry -> new_value = current_value;
+    result = current_value;
+  }
+
+  TRACE("%p : stmReadTVar(%p)=%p\n", trec, tvar, result);
+  return result;
+}
+
+/*......................................................................*/
+
+// Record a write of new_value to tvar within trec.  Mirrors stmReadTVar's
+// lookup: update an existing entry in this trec, copy one found in an
+// enclosing trec, or create a fresh entry capturing the current value.
+void stmWriteTVar(Capability *cap,
+                  StgTRecHeader *trec,
+                  StgTVar *tvar,
+                  StgClosure *new_value) {
+
+  StgTRecHeader *entry_in;
+  TRecEntry *entry = NULL;
+  TRACE("%p : stmWriteTVar(%p, %p)\n", trec, tvar, new_value);
+  ASSERT (trec != NO_TREC);
+  ASSERT (trec -> state == TREC_ACTIVE ||
+          trec -> state == TREC_CONDEMNED);
+
+  entry = get_entry_for(trec, tvar, &entry_in);
+
+  if (entry != NULL) {
+    if (entry_in == trec) {
+      // Entry found in our trec
+      entry -> new_value = new_value;
+    } else {
+      // Entry found in another trec
+      TRecEntry *new_entry = get_new_entry(cap, trec);
+      new_entry -> tvar = tvar;
+      new_entry -> expected_value = entry -> expected_value;
+      new_entry -> new_value = new_value;
+    }
+  } else {
+    // No entry found
+    StgClosure *current_value = read_current_value(trec, tvar);
+    TRecEntry *new_entry = get_new_entry(cap, trec);
+    new_entry -> tvar = tvar;
+    new_entry -> expected_value = current_value;
+    new_entry -> new_value = new_value;
+  }
+
+  TRACE("%p : stmWriteTVar done\n", trec);
+}
+
+/*......................................................................*/
+
+// Allocate and initialize a new TVar holding new_value, with an empty
+// wait queue (and a zero version counter in threaded builds).
+StgTVar *stmNewTVar(Capability *cap,
+                    StgClosure *new_value) {
+  StgTVar *result;
+  result = (StgTVar *)allocateLocal(cap, sizeofW(StgTVar));
+  SET_HDR (result, &stg_TVAR_info, CCS_SYSTEM);
+  result -> current_value = new_value;
+  result -> first_wait_queue_entry = END_STM_WAIT_QUEUE;
+#if defined(THREADED_RTS)
+  result -> num_updates = 0;
+#endif
+  return result;
+}
+
+/*......................................................................*/
diff --git a/rts/Sanity.c b/rts/Sanity.c
new file mode 100644
index 0000000000..0e68a86ba7
--- /dev/null
+++ b/rts/Sanity.c
@@ -0,0 +1,948 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2006
+ *
+ * Sanity checking code for the heap and stack.
+ *
+ * Used when debugging: check that everything is reasonable.
+ *
+ * - All things that are supposed to be pointers look like pointers.
+ *
+ * - Objects in text space are marked as static closures, those
+ * in the heap are dynamic.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+
+#ifdef DEBUG /* whole file */
+
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "BlockAlloc.h"
+#include "Sanity.h"
+#include "MBlock.h"
+#include "Storage.h"
+#include "Schedule.h"
+#include "Apply.h"
+
+/* -----------------------------------------------------------------------------
+ Forward decls.
+ -------------------------------------------------------------------------- */
+
+static void checkSmallBitmap ( StgPtr payload, StgWord bitmap, nat );
+static void checkLargeBitmap ( StgPtr payload, StgLargeBitmap*, nat );
+static void checkClosureShallow ( StgClosure * );
+
+/* -----------------------------------------------------------------------------
+ Check stack sanity
+ -------------------------------------------------------------------------- */
+
+/*
+ * Check 'size' words at 'payload' against a small (single-word) liveness
+ * bitmap.  A CLEAR bit marks a pointer word: each such word must look
+ * like a closure pointer (checked shallowly, without recursing).
+ * (The dead local 'p', assigned but never read, has been removed.)
+ */
+static void
+checkSmallBitmap( StgPtr payload, StgWord bitmap, nat size )
+{
+    nat i;
+
+    for(i = 0; i < size; i++, bitmap >>= 1 ) {
+	if ((bitmap & 1) == 0) {
+	    checkClosureShallow((StgClosure *)payload[i]);
+	}
+    }
+}
+
+// As checkSmallBitmap, but for bitmaps wider than one word: walk the
+// bitmap word-by-word (outer loop) and bit-by-bit (inner loop), checking
+// each word whose liveness bit is CLEAR as a shallow closure pointer.
+static void
+checkLargeBitmap( StgPtr payload, StgLargeBitmap* large_bitmap, nat size )
+{
+    StgWord bmp;
+    nat i, j;
+
+    i = 0;
+    for (bmp=0; i < size; bmp++) {
+	StgWord bitmap = large_bitmap->bitmap[bmp];
+	j = 0;
+	for(; i < size && j < BITS_IN(W_); j++, i++, bitmap >>= 1 ) {
+	    if ((bitmap & 1) == 0) {
+		checkClosureShallow((StgClosure *)payload[i]);
+	    }
+	}
+    }
+}
+
+/*
+ * check that it looks like a valid closure - without checking its payload
+ * used to avoid recursion between checking PAPs and checking stack
+ * chunks.
+ */
+
+// Shallow validity check: the pointer must look like a closure pointer,
+// and its static/dynamic flag must agree with where it lives (static
+// closures outside the heap, dynamic ones inside).  Payload is NOT
+// examined -- this is what breaks the PAP <-> stack-chunk recursion.
+static void
+checkClosureShallow( StgClosure* p )
+{
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+
+    /* Is it a static closure? */
+    if (!HEAP_ALLOCED(p)) {
+	ASSERT(closure_STATIC(p));
+    } else {
+	ASSERT(!closure_STATIC(p));
+    }
+}
+
+// check an individual stack object
+//
+// Validates one activation record at 'c' using its return-info-table
+// layout, and returns the frame's size in words so the caller can step
+// to the next frame.
+StgOffset 
+checkStackFrame( StgPtr c )
+{
+    nat size;
+    const StgRetInfoTable* info;
+
+    info = get_ret_itbl((StgClosure *)c);
+
+    /* All activation records have 'bitmap' style layout info. */
+    switch (info->i.type) {
+    case RET_DYN: /* Dynamic bitmap: the mask is stored on the stack */
+    {
+	StgWord dyn;
+	StgPtr p;
+	StgRetDyn* r;
+	
+	r = (StgRetDyn *)c;
+	dyn = r->liveness;
+	
+	p = (P_)(r->payload);
+	// First the register-save area, governed by the liveness mask ...
+	checkSmallBitmap(p,RET_DYN_LIVENESS(r->liveness),RET_DYN_BITMAP_SIZE);
+	p += RET_DYN_BITMAP_SIZE + RET_DYN_NONPTR_REGS_SIZE;
+
+	// skip over the non-pointers
+	p += RET_DYN_NONPTRS(dyn);
+	
+	// follow the ptr words
+	for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
+	    checkClosureShallow((StgClosure *)*p);
+	    p++;
+	}
+	
+	return sizeofW(StgRetDyn) + RET_DYN_BITMAP_SIZE +
+	    RET_DYN_NONPTR_REGS_SIZE +
+	    RET_DYN_NONPTRS(dyn) + RET_DYN_PTRS(dyn);
+    }
+
+    case UPDATE_FRAME:
+      ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgUpdateFrame*)c)->updatee));
+      // fall through: apart from the updatee check above, an update
+      // frame is laid out like the small-bitmap frames below.
+    case ATOMICALLY_FRAME:
+    case CATCH_RETRY_FRAME:
+    case CATCH_STM_FRAME:
+    case CATCH_FRAME:
+      // small bitmap cases (<= 32 entries)
+    case STOP_FRAME:
+    case RET_SMALL:
+    case RET_VEC_SMALL:
+	size = BITMAP_SIZE(info->i.layout.bitmap);
+	checkSmallBitmap((StgPtr)c + 1, 
+			 BITMAP_BITS(info->i.layout.bitmap), size);
+	return 1 + size;
+
+    case RET_BCO: {
+	StgBCO *bco;
+	nat size;
+	// word after the header points at the BCO carrying the bitmap
+	bco = (StgBCO *)*(c+1);
+	size = BCO_BITMAP_SIZE(bco);
+	checkLargeBitmap((StgPtr)c + 2, BCO_BITMAP(bco), size);
+	return 2 + size;
+    }
+
+    case RET_BIG: // large bitmap (> 32 entries)
+    case RET_VEC_BIG:
+	size = GET_LARGE_BITMAP(&info->i)->size;
+	checkLargeBitmap((StgPtr)c + 1, GET_LARGE_BITMAP(&info->i), size);
+	return 1 + size;
+
+    case RET_FUN:
+    {
+	StgFunInfoTable *fun_info;
+	StgRetFun *ret_fun;
+
+	ret_fun = (StgRetFun *)c;
+	fun_info = get_fun_itbl(ret_fun->fun);
+	size = ret_fun->size;
+	// argument layout comes from the function's own info table
+	switch (fun_info->f.fun_type) {
+	case ARG_GEN:
+	    checkSmallBitmap((StgPtr)ret_fun->payload, 
+			     BITMAP_BITS(fun_info->f.b.bitmap), size);
+	    break;
+	case ARG_GEN_BIG:
+	    checkLargeBitmap((StgPtr)ret_fun->payload,
+			     GET_FUN_LARGE_BITMAP(fun_info), size);
+	    break;
+	default:
+	    checkSmallBitmap((StgPtr)ret_fun->payload,
+			     BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]),
+			     size);
+	    break;
+	}
+	return sizeofW(StgRetFun) + size;
+    }
+
+    default:
+	barf("checkStackFrame: weird activation record found on stack (%p %d).",c,info->i.type);
+    }
+}
+
+// check sections of stack between update frames
+//
+// Walks frames from sp towards stack_end, advancing by each frame's
+// reported size; checkStackFrame barfs on malformed frames.
+void
+checkStackChunk( StgPtr sp, StgPtr stack_end )
+{
+    StgPtr p;
+
+    p = sp;
+    while (p < stack_end) {
+	p += checkStackFrame( p );
+    }
+    // ASSERT( p == stack_end ); -- HWL
+}
+
+/*
+ * Check the argument section of a PAP or AP closure: 'payload' holds
+ * n_args words laid out according to fun's argument bitmap; every
+ * pointer word must look like a closure pointer.
+ * (The dead local 'p', assigned but never read, has been removed.)
+ */
+static void
+checkPAP (StgClosure *fun, StgClosure** payload, StgWord n_args)
+{ 
+    StgFunInfoTable *fun_info;
+
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(fun));
+    fun_info = get_fun_itbl(fun);
+
+    switch (fun_info->f.fun_type) {
+    case ARG_GEN:
+	checkSmallBitmap( (StgPtr)payload, 
+			  BITMAP_BITS(fun_info->f.b.bitmap), n_args );
+	break;
+    case ARG_GEN_BIG:
+	checkLargeBitmap( (StgPtr)payload, 
+			  GET_FUN_LARGE_BITMAP(fun_info), 
+			  n_args );
+	break;
+    case ARG_BCO:
+	checkLargeBitmap( (StgPtr)payload, 
+			  BCO_BITMAP(fun), 
+			  n_args );
+	break;
+    default:
+	checkSmallBitmap( (StgPtr)payload, 
+			  BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]),
+			  n_args );
+	break;
+    }
+}
+
+
+/*
+ * Sanity-check a single heap closure: verify the static/dynamic flag
+ * matches its address, then dispatch on the info-table type and check
+ * every pointer field.  Returns the closure's size in words so callers
+ * (e.g. checkHeap) can step to the next object.  Barfs on stack-frame
+ * types and EVACUATED closures, which must not appear in a walked heap.
+ */
+StgOffset 
+checkClosure( StgClosure* p )
+{
+    const StgInfoTable *info;
+
+    ASSERT(LOOKS_LIKE_INFO_PTR(p->header.info));
+
+    /* Is it a static closure (i.e. in the data segment)? */
+    if (!HEAP_ALLOCED(p)) {
+	ASSERT(closure_STATIC(p));
+    } else {
+	ASSERT(!closure_STATIC(p));
+    }
+
+    info = get_itbl(p);
+    switch (info->type) {
+
+    case MVAR:
+      { 
+	StgMVar *mvar = (StgMVar *)p;
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(mvar->head));
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(mvar->tail));
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(mvar->value));
+#if 0
+#if defined(PAR)
+	checkBQ((StgBlockingQueueElement *)mvar->head, p);
+#else
+	checkBQ(mvar->head, p);
+#endif
+#endif
+	return sizeofW(StgMVar);
+      }
+
+    case THUNK:
+    case THUNK_1_0:
+    case THUNK_0_1:
+    case THUNK_1_1:
+    case THUNK_0_2:
+    case THUNK_2_0:
+      {
+	nat i;
+	for (i = 0; i < info->layout.payload.ptrs; i++) {
+	  ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgThunk *)p)->payload[i]));
+	}
+	return thunk_sizeW_fromITBL(info);
+      }
+
+    case FUN:
+    case FUN_1_0:
+    case FUN_0_1:
+    case FUN_1_1:
+    case FUN_0_2:
+    case FUN_2_0:
+    case CONSTR:
+    case CONSTR_1_0:
+    case CONSTR_0_1:
+    case CONSTR_1_1:
+    case CONSTR_0_2:
+    case CONSTR_2_0:
+    case IND_PERM:
+    case IND_OLDGEN:
+    case IND_OLDGEN_PERM:
+#ifdef TICKY_TICKY
+    case SE_BLACKHOLE:
+    case SE_CAF_BLACKHOLE:
+#endif
+    case BLACKHOLE:
+    case CAF_BLACKHOLE:
+    case STABLE_NAME:
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY:
+    case CONSTR_INTLIKE:
+    case CONSTR_CHARLIKE:
+    case CONSTR_STATIC:
+    case CONSTR_NOCAF_STATIC:
+    case THUNK_STATIC:
+    case FUN_STATIC:
+	{
+	    // generic case: the info table tells us how many pointer
+	    // words the payload begins with; check each of them.
+	    nat i;
+	    for (i = 0; i < info->layout.payload.ptrs; i++) {
+		ASSERT(LOOKS_LIKE_CLOSURE_PTR(p->payload[i]));
+	    }
+	    return sizeW_fromITBL(info);
+	}
+
+    case BCO: {
+	StgBCO *bco = (StgBCO *)p;
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(bco->instrs));
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(bco->literals));
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(bco->ptrs));
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(bco->itbls));
+	return bco_sizeW(bco);
+    }
+
+    case IND_STATIC: /* (1, 0) closure */
+      ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgIndStatic*)p)->indirectee));
+      return sizeW_fromITBL(info);
+
+    case WEAK:
+      /* deal with these specially - the info table isn't
+       * representative of the actual layout.
+       */
+      { StgWeak *w = (StgWeak *)p;
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(w->key));
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(w->value));
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(w->finalizer));
+	if (w->link) {
+	  ASSERT(LOOKS_LIKE_CLOSURE_PTR(w->link));
+	}
+	return sizeW_fromITBL(info);
+      }
+
+    case THUNK_SELECTOR:
+	    ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgSelector *)p)->selectee));
+	    return THUNK_SELECTOR_sizeW();
+
+    case IND:
+	{ 
+  	    /* we don't expect to see any of these after GC
+	     * but they might appear during execution
+	     */
+	    StgInd *ind = (StgInd *)p;
+	    ASSERT(LOOKS_LIKE_CLOSURE_PTR(ind->indirectee));
+	    return sizeofW(StgInd);
+	}
+
+    case RET_BCO:
+    case RET_SMALL:
+    case RET_VEC_SMALL:
+    case RET_BIG:
+    case RET_VEC_BIG:
+    case RET_DYN:
+    case UPDATE_FRAME:
+    case STOP_FRAME:
+    case CATCH_FRAME:
+    case ATOMICALLY_FRAME:
+    case CATCH_RETRY_FRAME:
+    case CATCH_STM_FRAME:
+	    // stack frames are checked by checkStackFrame, never here
+	    barf("checkClosure: stack frame");
+
+    case AP:
+    {
+	StgAP* ap = (StgAP *)p;
+	checkPAP (ap->fun, ap->payload, ap->n_args);
+	return ap_sizeW(ap);
+    }
+
+    case PAP:
+    {
+	StgPAP* pap = (StgPAP *)p;
+	checkPAP (pap->fun, pap->payload, pap->n_args);
+	return pap_sizeW(pap);
+    }
+
+    case AP_STACK:
+    { 
+	StgAP_STACK *ap = (StgAP_STACK *)p;
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(ap->fun));
+	checkStackChunk((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size);
+	return ap_stack_sizeW(ap);
+    }
+
+    case ARR_WORDS:
+	    return arr_words_sizeW((StgArrWords *)p);
+
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    case MUT_ARR_PTRS_FROZEN:
+    case MUT_ARR_PTRS_FROZEN0:
+	{
+	    StgMutArrPtrs* a = (StgMutArrPtrs *)p;
+	    nat i;
+	    for (i = 0; i < a->ptrs; i++) {
+		ASSERT(LOOKS_LIKE_CLOSURE_PTR(a->payload[i]));
+	    }
+	    return mut_arr_ptrs_sizeW(a);
+	}
+
+    case TSO:
+        checkTSO((StgTSO *)p);
+        return tso_sizeW((StgTSO *)p);
+
+#if defined(PAR)
+
+    case BLOCKED_FETCH:
+      ASSERT(LOOKS_LIKE_GA(&(((StgBlockedFetch *)p)->ga)));
+      ASSERT(LOOKS_LIKE_CLOSURE_PTR((((StgBlockedFetch *)p)->node)));
+      return sizeofW(StgBlockedFetch);  // see size used in evacuate()
+
+#ifdef DIST
+    case REMOTE_REF:
+      return sizeofW(StgFetchMe); 
+#endif /*DIST */
+      
+    case FETCH_ME:
+      ASSERT(LOOKS_LIKE_GA(((StgFetchMe *)p)->ga));
+      return sizeofW(StgFetchMe);  // see size used in evacuate()
+
+    case FETCH_ME_BQ:
+      checkBQ(((StgFetchMeBlockingQueue *)p)->blocking_queue, (StgClosure *)p);
+      return sizeofW(StgFetchMeBlockingQueue); // see size used in evacuate()
+
+    case RBH:
+      /* In an RBH the BQ may be empty (ie END_BQ_QUEUE) but not NULL */
+      ASSERT(((StgRBH *)p)->blocking_queue!=NULL);
+      if (((StgRBH *)p)->blocking_queue!=END_BQ_QUEUE)
+	checkBQ(((StgRBH *)p)->blocking_queue, p);
+      ASSERT(LOOKS_LIKE_INFO_PTR(REVERT_INFOPTR(get_itbl((StgClosure *)p))));
+      return BLACKHOLE_sizeW();   // see size used in evacuate()
+      // sizeW_fromITBL(REVERT_INFOPTR(get_itbl((StgClosure *)p)));
+
+#endif
+
+    case TVAR_WAIT_QUEUE:
+      {
+        StgTVarWaitQueue *wq = (StgTVarWaitQueue *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(wq->next_queue_entry));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(wq->prev_queue_entry));
+        return sizeofW(StgTVarWaitQueue);
+      }
+
+    case TVAR:
+      {
+        StgTVar *tv = (StgTVar *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(tv->current_value));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(tv->first_wait_queue_entry));
+        return sizeofW(StgTVar);
+      }
+
+    case TREC_CHUNK:
+      {
+        nat i;
+        StgTRecChunk *tc = (StgTRecChunk *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(tc->prev_chunk));
+        for (i = 0; i < tc -> next_entry_idx; i ++) {
+          ASSERT(LOOKS_LIKE_CLOSURE_PTR(tc->entries[i].tvar));
+          ASSERT(LOOKS_LIKE_CLOSURE_PTR(tc->entries[i].expected_value));
+          ASSERT(LOOKS_LIKE_CLOSURE_PTR(tc->entries[i].new_value));
+        }
+        return sizeofW(StgTRecChunk);
+      }
+
+    case TREC_HEADER:
+      {
+        StgTRecHeader *trec = (StgTRecHeader *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(trec -> enclosing_trec));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(trec -> current_chunk));
+        return sizeofW(StgTRecHeader);
+      }
+      
+      
+    case EVACUATED:
+	    barf("checkClosure: found EVACUATED closure %d",
+		 info->type);
+    default:
+	    barf("checkClosure (closure type %d)", info->type);
+    }
+}
+
+#if defined(PAR)
+
+#define PVM_PE_MASK 0xfffc0000
+#define MAX_PVM_PES MAX_PES
+#define MAX_PVM_TIDS MAX_PES
+#define MAX_SLOTS 100000
+
+// PAR only: heuristic validity check for a PVM task id -- the PE number
+// lives in the top bits (above bit 18), the local id in the rest.
+rtsBool
+looks_like_tid(StgInt tid)
+{
+  StgInt hi = (tid & PVM_PE_MASK) >> 18;
+  StgInt lo = (tid & ~PVM_PE_MASK);
+  rtsBool ok = (hi != 0) && (lo < MAX_PVM_TIDS) && (hi < MAX_PVM_TIDS);
+  return ok;
+}
+
+// PAR only: heuristic validity check for a GA slot number.
+rtsBool
+looks_like_slot(StgInt slot)
+{
+  /* if tid is known better use looks_like_ga!! */
+  rtsBool ok = slot<MAX_SLOTS;
+  // This refers only to the no. of slots on the current PE
+  // rtsBool ok = slot<=highest_slot();
+  return ok; 
+}
+
+// PAR only: validity check for a whole global address: plausible tid,
+// and a slot bound that is tighter when the GA is local (we know our
+// own highest slot) than when it is remote.
+rtsBool
+looks_like_ga(globalAddr *ga)
+{
+  rtsBool is_tid = looks_like_tid((ga)->payload.gc.gtid);
+  rtsBool is_slot = ((ga)->payload.gc.gtid==mytid) ? 
+             (ga)->payload.gc.slot<=highest_slot() : 
+             (ga)->payload.gc.slot<MAX_SLOTS;
+  rtsBool ok = is_tid && is_slot;
+  return ok;
+}
+
+#endif
+
+
+/* -----------------------------------------------------------------------------
+ Check Heap Sanity
+
+ After garbage collection, the live heap is in a state where we can
+ run through and check that all the pointers point to the right
+ place. This function starts at a given position and sanity-checks
+ all the objects in the remainder of the chain.
+ -------------------------------------------------------------------------- */
+
+// Walk a chain of blocks and sanity-check every closure in each block,
+// stepping by the size checkClosure reports and skipping slop words
+// (words that don't look like info pointers) between objects.
+void
+checkHeap(bdescr *bd)
+{
+    StgPtr p;
+
+#if defined(THREADED_RTS)
+    // heap sanity checking doesn't work with SMP, because we can't
+    // zero the slop (see Updates.h).
+    return;
+#endif
+
+    for (; bd != NULL; bd = bd->link) {
+	p = bd->start;
+	while (p < bd->free) {
+	    nat size = checkClosure((StgClosure *)p);
+	    /* This is the smallest size of closure that can live in the heap */
+	    ASSERT( size >= MIN_PAYLOAD_SIZE + sizeofW(StgHeader) );
+	    p += size;
+	    
+	    /* skip over slop */
+	    while (p < bd->free &&
+		   (*p < 0x1000 || !LOOKS_LIKE_INFO_PTR((void*)*p))) { p++; }
+	}
+    }
+}
+
+#if defined(PAR)
+/*
+ Check heap between start and end. Used after unpacking graphs.
+*/
+// PAR variant: checks heap between start and end after unpacking a
+// graph, special-casing freshly-unpacked FETCH_MEs and INDs whose slop
+// carries sentinel words (0x0000eeee / 0x0000ee11).
+void 
+checkHeapChunk(StgPtr start, StgPtr end)
+{
+  extern globalAddr *LAGAlookup(StgClosure *addr);
+  StgPtr p;
+  nat size;
+
+  for (p=start; p<end; p+=size) {
+    ASSERT(LOOKS_LIKE_INFO_PTR((void*)*p));
+    if (get_itbl((StgClosure*)p)->type == FETCH_ME &&
+	*(p+1) == 0x0000eeee /* ie. unpack garbage (see SetGAandCommonUp) */) {
+      /* if it's a FM created during unpack and commoned up, it's not global */
+      ASSERT(LAGAlookup((StgClosure*)p)==NULL);
+      size = sizeofW(StgFetchMe);
+    } else if (get_itbl((StgClosure*)p)->type == IND) {
+      *(p+2) = 0x0000ee11; /* mark slop in IND as garbage */
+      size = sizeofW(StgInd);
+    } else {
+      size = checkClosure((StgClosure *)p);
+      /* This is the smallest size of closure that can live in the heap. */
+      ASSERT( size >= MIN_PAYLOAD_SIZE + sizeofW(StgHeader) );
+    }
+  }
+}
+#else /* !PAR */
+// Check all closures in the half-open heap region [start, end),
+// stepping by each closure's reported size.  Unlike checkHeap this
+// assumes no slop between objects.
+void 
+checkHeapChunk(StgPtr start, StgPtr end)
+{
+  StgPtr p;
+  nat size;
+
+  for (p=start; p<end; p+=size) {
+    ASSERT(LOOKS_LIKE_INFO_PTR((void*)*p));
+    size = checkClosure((StgClosure *)p);
+    /* This is the smallest size of closure that can live in the heap. */
+    ASSERT( size >= MIN_PAYLOAD_SIZE + sizeofW(StgHeader) );
+  }
+}
+#endif
+
+// Walk a chain of block descriptors, sanity-checking the closure at
+// the start of each block.
+void
+checkChain(bdescr *bd)
+{
+    for (; bd != NULL; bd = bd->link) {
+	checkClosure((StgClosure *)bd->start);
+    }
+}
+
+// Sanity-check one thread: its stack pointer must lie within its stack,
+// and the whole stack chunk must consist of well-formed frames.
+// Relocated threads are chased through their link; killed threads are
+// skipped (GC doesn't follow their pointers, so neither do we).
+void
+checkTSO(StgTSO *tso)
+{
+    StgPtr sp = tso->sp;
+    StgPtr stack = tso->stack;
+    StgOffset stack_size = tso->stack_size;
+    StgPtr stack_end = stack + stack_size;
+
+    if (tso->what_next == ThreadRelocated) {
+      checkTSO(tso->link);
+      return;
+    }
+
+    if (tso->what_next == ThreadKilled) {
+      /* The garbage collector doesn't bother following any pointers
+       * from dead threads, so don't check sanity here. 
+       */
+      return;
+    }
+
+    ASSERT(stack <= sp && sp < stack_end);
+
+#if defined(PAR)
+    ASSERT(tso->par.magic==TSO_MAGIC);
+
+    switch (tso->why_blocked) {
+    case BlockedOnGA: 
+      checkClosureShallow(tso->block_info.closure);
+      ASSERT(/* Can't be a FETCH_ME because *this* closure is on its BQ */
+	     get_itbl(tso->block_info.closure)->type==FETCH_ME_BQ);
+      break;
+    case BlockedOnGA_NoSend: 
+      checkClosureShallow(tso->block_info.closure);
+      ASSERT(get_itbl(tso->block_info.closure)->type==FETCH_ME_BQ);
+      break;
+    case BlockedOnBlackHole: 
+      checkClosureShallow(tso->block_info.closure);
+      ASSERT(get_itbl(tso->block_info.closure)->type==BLACKHOLE ||
+	     get_itbl(tso->block_info.closure)->type==RBH);
+      break;
+    case BlockedOnRead:
+    case BlockedOnWrite:
+    case BlockedOnDelay:
+#if defined(mingw32_HOST_OS)
+    case BlockedOnDoProc:
+#endif
+      /* isOnBQ(blocked_queue) */
+      break;
+    case BlockedOnException:
+      /* isOnSomeBQ(tso) */
+      ASSERT(get_itbl(tso->block_info.tso)->type==TSO);
+      break;
+    case BlockedOnMVar:
+      ASSERT(get_itbl(tso->block_info.closure)->type==MVAR);
+      break;
+    case BlockedOnSTM:
+      ASSERT(tso->block_info.closure == END_TSO_QUEUE);
+      break;
+    default:
+      /* 
+	 Could check other values of why_blocked but I am more 
+	 lazy than paranoid (bad combination) -- HWL 
+      */
+    }
+
+    /* if the link field is non-nil it must point to one of these
+       three closure types */
+    ASSERT(tso->link == END_TSO_QUEUE ||
+	   get_itbl(tso->link)->type == TSO ||
+	   get_itbl(tso->link)->type == BLOCKED_FETCH ||
+	   get_itbl(tso->link)->type == CONSTR);
+#endif
+
+    checkStackChunk(sp, stack_end);
+}
+
+#if defined(GRAN)
+// GRAN only: run checkTSO over every TSO on every PE's run queue,
+// logging progress via debugBelch.
+void
+checkTSOsSanity(void) {
+  nat i, tsos;
+  StgTSO *tso;
+  
+  debugBelch("Checking sanity of all runnable TSOs:");
+  
+  for (i=0, tsos=0; i<RtsFlags.GranFlags.proc; i++) {
+    for (tso=run_queue_hds[i]; tso!=END_TSO_QUEUE; tso=tso->link) {
+      debugBelch("TSO %p on PE %d ...", tso, i);
+      checkTSO(tso); 
+      debugBelch("OK, ");
+      tsos++;
+    }
+  }
+  
+  debugBelch(" checked %d TSOs on %d PEs; ok\n", tsos, RtsFlags.GranFlags.proc);
+}
+
+
+// still GRAN only
+
+// GRAN only: check the integrity of one PE's run queue -- non-NULL
+// head/tail, head/tail NIL-consistency, and forward-link soundness --
+// optionally checking each TSO as well.  Returns rtsTrue when every
+// check passes (any failure aborts via ASSERT).  Previously the
+// function was declared rtsBool but fell off the end without a return
+// statement, which is undefined behaviour if the result is used.
+rtsBool
+checkThreadQSanity (PEs proc, rtsBool check_TSO_too)
+{
+  StgTSO *tso, *prev;
+
+  /* the NIL value for TSOs is END_TSO_QUEUE; thus, finding NULL is an error */
+  ASSERT(run_queue_hds[proc]!=NULL);
+  ASSERT(run_queue_tls[proc]!=NULL);
+  /* if either head or tail is NIL then the other one must be NIL, too */
+  ASSERT(run_queue_hds[proc]!=END_TSO_QUEUE || run_queue_tls[proc]==END_TSO_QUEUE);
+  ASSERT(run_queue_tls[proc]!=END_TSO_QUEUE || run_queue_hds[proc]==END_TSO_QUEUE);
+  for (tso=run_queue_hds[proc], prev=END_TSO_QUEUE; 
+       tso!=END_TSO_QUEUE;
+       prev=tso, tso=tso->link) {
+    ASSERT((prev!=END_TSO_QUEUE || tso==run_queue_hds[proc]) &&
+	   (prev==END_TSO_QUEUE || prev->link==tso));
+    if (check_TSO_too)
+      checkTSO(tso);
+  }
+  ASSERT(prev==run_queue_tls[proc]);
+  return rtsTrue;  // reaching here means all checks passed
+}
+
+// GRAN only: checkThreadQSanity for every PE.  Returns rtsTrue when all
+// queues pass (failures abort via ASSERT); previously the function was
+// declared rtsBool but had no return statement.
+rtsBool
+checkThreadQsSanity (rtsBool check_TSO_too)
+{
+  PEs p;
+  
+  for (p=0; p<RtsFlags.GranFlags.proc; p++)
+    checkThreadQSanity(p, check_TSO_too);
+  return rtsTrue;
+}
+#endif /* GRAN */
+
+/*
+ Check that all TSOs have been evacuated.
+ Optionally also check the sanity of the TSOs.
+*/
+// Walk the global list of all threads (via global_link), asserting each
+// entry looks like a TSO closure; optionally deep-check each TSO too.
+void
+checkGlobalTSOList (rtsBool checkTSOs)
+{
+  extern  StgTSO *all_threads;
+  StgTSO *tso;
+  for (tso=all_threads; tso != END_TSO_QUEUE; tso = tso->global_link) {
+      ASSERT(LOOKS_LIKE_CLOSURE_PTR(tso));
+      ASSERT(get_itbl(tso)->type == TSO);
+      if (checkTSOs)
+	  checkTSO(tso);
+  }
+}
+
+/* -----------------------------------------------------------------------------
+ Check mutable list sanity.
+ -------------------------------------------------------------------------- */
+
+// Check a generation's mutable list: every listed closure must either
+// be static or live in a block belonging to generation 'gen'.
+void
+checkMutableList( bdescr *mut_bd, nat gen )
+{
+    bdescr *bd;
+    StgPtr q;
+    StgClosure *p;
+
+    for (bd = mut_bd; bd != NULL; bd = bd->link) {
+	for (q = bd->start; q < bd->free; q++) {
+	    p = (StgClosure *)*q;
+	    ASSERT(!HEAP_ALLOCED(p) || Bdescr((P_)p)->gen_no == gen);
+	}
+    }
+}
+
+/*
+ Check the static objects list.
+*/
+/*
+ * Check the static objects list: sanity-check each closure, then follow
+ * the per-type static link field to the next entry.  Barfs on any
+ * closure type that cannot legally appear on the static list.
+ * (Fixed the typo "checkStaticObjetcs" in the barf message.)
+ */
+void
+checkStaticObjects ( StgClosure* static_objects )
+{
+  StgClosure *p = static_objects;
+  StgInfoTable *info;
+
+  while (p != END_OF_STATIC_LIST) {
+    checkClosure(p);
+    info = get_itbl(p);
+    switch (info->type) {
+    case IND_STATIC:
+      { 
+	StgClosure *indirectee = ((StgIndStatic *)p)->indirectee;
+
+	ASSERT(LOOKS_LIKE_CLOSURE_PTR(indirectee));
+	ASSERT(LOOKS_LIKE_INFO_PTR(indirectee->header.info));
+	p = *IND_STATIC_LINK((StgClosure *)p);
+	break;
+      }
+
+    case THUNK_STATIC:
+      p = *THUNK_STATIC_LINK((StgClosure *)p);
+      break;
+
+    case FUN_STATIC:
+      p = *FUN_STATIC_LINK((StgClosure *)p);
+      break;
+
+    case CONSTR_STATIC:
+      p = *STATIC_LINK(info,(StgClosure *)p);
+      break;
+
+    default:
+      barf("checkStaticObjects: strange closure %p (%s)", 
+	   p, info_type(p));
+    }
+  }
+}
+
+/*
+ Check the sanity of a blocking queue starting at bqe with closure being
+ the closure holding the blocking queue.
+ Note that in GUM we can have several different closure types in a
+ blocking queue
+*/
+#if defined(PAR)
+// PAR variant: check a blocking queue hanging off 'closure'.  In GUM a
+// BQ may contain TSOs, BLOCKED_FETCHes, and a trailing CONSTR (which
+// terminates the queue).
+void
+checkBQ (StgBlockingQueueElement *bqe, StgClosure *closure) 
+{
+  rtsBool end = rtsFalse;
+  StgInfoTable *info = get_itbl(closure);
+
+  ASSERT(info->type == MVAR || info->type == FETCH_ME_BQ || info->type == RBH);
+
+  do {
+    switch (get_itbl(bqe)->type) {
+    case BLOCKED_FETCH:
+    case TSO:
+      checkClosure((StgClosure *)bqe);
+      bqe = bqe->link;
+      end = (bqe==END_BQ_QUEUE);
+      break;
+    
+    case CONSTR:
+      checkClosure((StgClosure *)bqe);
+      end = rtsTrue;
+      break;
+
+    default:
+      barf("checkBQ: strange closure %d in blocking queue for closure %p (%s)\n", 
+	   get_itbl(bqe)->type, closure, info_type(closure));
+    }
+  } while (!end);
+}
+#elif defined(GRAN)
+// GRAN variant: check an MVar's blocking queue of TSOs.
+// NOTE(review): the BLOCKED_FETCH case label looks copied from the PAR
+// variant -- confirm it can actually occur under GRAN.
+void
+checkBQ (StgTSO *bqe, StgClosure *closure) 
+{  
+  rtsBool end = rtsFalse;
+  StgInfoTable *info = get_itbl(closure);
+
+  ASSERT(info->type == MVAR);
+
+  do {
+    switch (get_itbl(bqe)->type) {
+    case BLOCKED_FETCH:
+    case TSO:
+      checkClosure((StgClosure *)bqe);
+      bqe = bqe->link;
+      end = (bqe==END_BQ_QUEUE);
+      break;
+
+    default:
+      barf("checkBQ: strange closure %d in blocking queue for closure %p (%s)\n", 
+	   get_itbl(bqe)->type, closure, info_type(closure));
+    }
+  } while (!end);
+}
+#endif
+
+
+
+/*
+ This routine checks the sanity of the LAGA and GALA tables. They are
+ implemented as lists through one hash table, LAtoGALAtable, because entries
+ in both tables have the same structure:
+ - the LAGA table maps local addresses to global addresses; it starts
+ with liveIndirections
+ - the GALA table maps global addresses to local addresses; it starts
+ with liveRemoteGAs
+*/
+
+#if defined(PAR)
+#include "Hash.h"
+
+/* hidden in parallel/Global.c; only accessed for testing here */
+extern GALA *liveIndirections;
+extern GALA *liveRemoteGAs;
+extern HashTable *LAtoGALAtable;
+
+// PAR only: sanity-check the LAGA and GALA lists against the shared
+// LAtoGALAtable hash table -- preferred entries must match the table,
+// local addresses must carry valid info pointers, and no entry may link
+// directly to itself.
+void
+checkLAGAtable(rtsBool check_closures)
+{
+  GALA *gala, *gala0;
+  nat n=0, m=0; // debugging
+
+  for (gala = liveIndirections; gala != NULL; gala = gala->next) {
+    n++;
+    gala0 = lookupHashTable(LAtoGALAtable, (StgWord) gala->la);
+    ASSERT(!gala->preferred || gala == gala0);
+    ASSERT(LOOKS_LIKE_INFO_PTR(((StgClosure *)gala->la)->header.info));
+    ASSERT(gala->next!=gala); // detect direct loops
+    if ( check_closures ) {
+      checkClosure((StgClosure *)gala->la);
+    }
+  }
+
+  for (gala = liveRemoteGAs; gala != NULL; gala = gala->next) {
+    m++;
+    gala0 = lookupHashTable(LAtoGALAtable, (StgWord) gala->la);
+    ASSERT(!gala->preferred || gala == gala0);
+    ASSERT(LOOKS_LIKE_INFO_PTR(((StgClosure *)gala->la)->header.info));
+    ASSERT(gala->next!=gala); // detect direct loops
+    /*
+    if ( check_closures ) {
+      checkClosure((StgClosure *)gala->la);
+    }
+    */
+  }
+}
+#endif
+
+#endif /* DEBUG */
diff --git a/rts/Sanity.h b/rts/Sanity.h
new file mode 100644
index 0000000000..8cf3f9e52e
--- /dev/null
+++ b/rts/Sanity.h
@@ -0,0 +1,56 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-1999
+ *
+ * Prototypes for functions in Sanity.c
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef SANITY_H
+#define SANITY_H  /* was missing: the guard tested SANITY_H but never defined it */
+
+#ifdef DEBUG
+
+# if defined(PAR)
+# define PVM_PE_MASK    0xfffc0000
+# define MAX_PVM_PES    MAX_PES
+# define MAX_PVM_TIDS   MAX_PES
+# define MAX_SLOTS      100000
+# endif
+
+/* debugging routines */
+extern void checkHeap          ( bdescr *bd );
+extern void checkHeapChunk     ( StgPtr start, StgPtr end );
+extern void checkChain         ( bdescr *bd );
+extern void checkTSO           ( StgTSO* tso );
+extern void checkGlobalTSOList ( rtsBool checkTSOs );
+extern void checkStaticObjects ( StgClosure* static_objects );
+extern void checkStackChunk    ( StgPtr sp, StgPtr stack_end );
+extern StgOffset checkStackFrame ( StgPtr sp );
+extern StgOffset checkClosure  ( StgClosure* p );
+
+extern void checkMutableList   ( bdescr *bd, nat gen );
+
+#if defined(GRAN)
+extern void checkTSOsSanity(void);
+extern rtsBool checkThreadQSanity (PEs proc, rtsBool check_TSO_too);
+extern rtsBool checkThreadQsSanity (rtsBool check_TSO_too);
+#endif
+
+#if defined(PAR)
+extern void checkBQ (StgBlockingQueueElement *bqe, StgClosure *closure);
+extern void checkLAGAtable(rtsBool check_closures);
+/* (duplicate checkHeapChunk prototype removed -- it is declared
+   unconditionally above) */
+#else
+extern void checkBQ (StgTSO *bqe, StgClosure *closure);
+#endif
+
+/* test whether an object is already on update list */
+extern rtsBool isBlackhole( StgTSO* tso, StgClosure* p );
+
+#endif /* DEBUG */
+ 
+#endif /* SANITY_H */
+
diff --git a/rts/Schedule.c b/rts/Schedule.c
new file mode 100644
index 0000000000..52fd4d5df6
--- /dev/null
+++ b/rts/Schedule.c
@@ -0,0 +1,4589 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * The scheduler and thread-related functionality
+ *
+ * --------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "SchedAPI.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "BlockAlloc.h"
+#include "OSThreads.h"
+#include "Storage.h"
+#include "StgRun.h"
+#include "Hooks.h"
+#include "Schedule.h"
+#include "StgMiscClosures.h"
+#include "Interpreter.h"
+#include "Exception.h"
+#include "Printer.h"
+#include "RtsSignals.h"
+#include "Sanity.h"
+#include "Stats.h"
+#include "STM.h"
+#include "Timer.h"
+#include "Prelude.h"
+#include "ThreadLabels.h"
+#include "LdvProfile.h"
+#include "Updates.h"
+#ifdef PROFILING
+#include "Proftimer.h"
+#include "ProfHeap.h"
+#endif
+#if defined(GRAN) || defined(PARALLEL_HASKELL)
+# include "GranSimRts.h"
+# include "GranSim.h"
+# include "ParallelRts.h"
+# include "Parallel.h"
+# include "ParallelDebug.h"
+# include "FetchMe.h"
+# include "HLC.h"
+#endif
+#include "Sparks.h"
+#include "Capability.h"
+#include "Task.h"
+#include "AwaitEvent.h"
+#if defined(mingw32_HOST_OS)
+#include "win32/IOManager.h"
+#endif
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+
+// Turn off inlining when debugging - it obfuscates things
+#ifdef DEBUG
+# undef STATIC_INLINE
+# define STATIC_INLINE static
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Global variables
+ * -------------------------------------------------------------------------- */
+
+#if defined(GRAN)
+
+StgTSO* ActiveTSO = NULL; /* for assigning system costs; GranSim-Light only */
+/* rtsTime TimeOfNextEvent, EndOfTimeSlice; now in GranSim.c */
+
+/*
+ In GranSim we have a runnable and a blocked queue for each processor.
+ In order to minimise code changes new arrays run_queue_hds/tls
+ are created. run_queue_hd is then a short cut (macro) for
+ run_queue_hds[CurrentProc] (see GranSim.h).
+ -- HWL
+*/
+StgTSO *run_queue_hds[MAX_PROC], *run_queue_tls[MAX_PROC];
+StgTSO *blocked_queue_hds[MAX_PROC], *blocked_queue_tls[MAX_PROC];
+StgTSO *ccalling_threadss[MAX_PROC];
+/* We use the same global list of threads (all_threads) in GranSim as in
+ the std RTS (i.e. we are cheating). However, we don't use this list in
+ the GranSim specific code at the moment (so we are only potentially
+ cheating). */
+
+#else /* !GRAN */
+
+#if !defined(THREADED_RTS)
+// Blocked/sleeping threads
+StgTSO *blocked_queue_hd = NULL;
+StgTSO *blocked_queue_tl = NULL;
+StgTSO *sleeping_queue = NULL; // perhaps replace with a hash table?
+#endif
+
+/* Threads blocked on blackholes.
+ * LOCK: sched_mutex+capability, or all capabilities
+ */
+StgTSO *blackhole_queue = NULL;
+#endif
+
+/* The blackhole_queue should be checked for threads to wake up. See
+ * Schedule.h for more thorough comment.
+ * LOCK: none (doesn't matter if we miss an update)
+ */
+rtsBool blackholes_need_checking = rtsFalse;
+
+/* Linked list of all threads.
+ * Used for detecting garbage collected threads.
+ * LOCK: sched_mutex+capability, or all capabilities
+ */
+StgTSO *all_threads = NULL;
+
+/* flag set by signal handler to precipitate a context switch
+ * LOCK: none (just an advisory flag)
+ */
+int context_switch = 0;
+
+/* flag that tracks whether we have done any execution in this time slice.
+ * LOCK: currently none, perhaps we should lock (but needs to be
+ * updated in the fast path of the scheduler).
+ */
+nat recent_activity = ACTIVITY_YES;
+
+/* if this flag is set as well, give up execution
+ * LOCK: none (changes once, from false->true)
+ */
+rtsBool sched_state = SCHED_RUNNING;
+
+/* Next thread ID to allocate.
+ * LOCK: sched_mutex
+ */
+static StgThreadID next_thread_id = 1;
+
+/* The smallest stack size that makes any sense is:
+ * RESERVED_STACK_WORDS (so we can get back from the stack overflow)
+ * + sizeofW(StgStopFrame) (the stg_stop_thread_info frame)
+ * + 1 (the closure to enter)
+ * + 1 (stg_ap_v_ret)
+ * + 1 (spare slot req'd by stg_ap_v_ret)
+ *
+ * A thread with this stack will bomb immediately with a stack
+ * overflow, which will increase its stack size.
+ */
+#define MIN_STACK_WORDS (RESERVED_STACK_WORDS + sizeofW(StgStopFrame) + 3)
+
+#if defined(GRAN)
+StgTSO *CurrentTSO;
+#endif
+
+/* This is used in `TSO.h' and gcc 2.96 insists that this variable actually
+ * exists - earlier gccs apparently didn't.
+ * -= chak
+ */
+StgTSO dummy_tso;
+
+/*
+ * Set to TRUE when entering a shutdown state (via shutdownHaskellAndExit()) --
+ * in an MT setting, needed to signal that a worker thread shouldn't hang around
+ * in the scheduler when it is out of work.
+ */
+rtsBool shutting_down_scheduler = rtsFalse;
+
+/*
+ * This mutex protects most of the global scheduler data in
+ * the THREADED_RTS runtime.
+ */
+#if defined(THREADED_RTS)
+Mutex sched_mutex;
+#endif
+
+#if defined(PARALLEL_HASKELL)
+StgTSO *LastTSO;
+rtsTime TimeOfLastYield;
+rtsBool emitSchedule = rtsTrue;
+#endif
+
+/* -----------------------------------------------------------------------------
+ * static function prototypes
+ * -------------------------------------------------------------------------- */
+
+static Capability *schedule (Capability *initialCapability, Task *task);
+
+//
+// These function all encapsulate parts of the scheduler loop, and are
+// abstracted only to make the structure and control flow of the
+// scheduler clearer.
+//
+static void schedulePreLoop (void);
+#if defined(THREADED_RTS)
+static void schedulePushWork(Capability *cap, Task *task);
+#endif
+static void scheduleStartSignalHandlers (Capability *cap);
+static void scheduleCheckBlockedThreads (Capability *cap);
+static void scheduleCheckWakeupThreads(Capability *cap USED_IF_NOT_THREADS);
+static void scheduleCheckBlackHoles (Capability *cap);
+static void scheduleDetectDeadlock (Capability *cap, Task *task);
+#if defined(GRAN)
+static StgTSO *scheduleProcessEvent(rtsEvent *event);
+#endif
+#if defined(PARALLEL_HASKELL)
+static StgTSO *scheduleSendPendingMessages(void);
+static void scheduleActivateSpark(void);
+static rtsBool scheduleGetRemoteWork(rtsBool *receivedFinish);
+#endif
+#if defined(PAR) || defined(GRAN)
+static void scheduleGranParReport(void);
+#endif
+static void schedulePostRunThread(void);
+static rtsBool scheduleHandleHeapOverflow( Capability *cap, StgTSO *t );
+static void scheduleHandleStackOverflow( Capability *cap, Task *task,
+ StgTSO *t);
+static rtsBool scheduleHandleYield( Capability *cap, StgTSO *t,
+ nat prev_what_next );
+static void scheduleHandleThreadBlocked( StgTSO *t );
+static rtsBool scheduleHandleThreadFinished( Capability *cap, Task *task,
+ StgTSO *t );
+static rtsBool scheduleDoHeapProfile(rtsBool ready_to_gc);
+static Capability *scheduleDoGC(Capability *cap, Task *task,
+ rtsBool force_major,
+ void (*get_roots)(evac_fn));
+
+static void unblockThread(Capability *cap, StgTSO *tso);
+static rtsBool checkBlackHoles(Capability *cap);
+static void AllRoots(evac_fn evac);
+
+static StgTSO *threadStackOverflow(Capability *cap, StgTSO *tso);
+
+static void raiseAsync_(Capability *cap, StgTSO *tso, StgClosure *exception,
+ rtsBool stop_at_atomically, StgPtr stop_here);
+
+static void deleteThread (Capability *cap, StgTSO *tso);
+static void deleteAllThreads (Capability *cap);
+
+#ifdef DEBUG
+static void printThreadBlockage(StgTSO *tso);
+static void printThreadStatus(StgTSO *tso);
+void printThreadQueue(StgTSO *tso);
+#endif
+
+#if defined(PARALLEL_HASKELL)
+StgTSO * createSparkThread(rtsSpark spark);
+StgTSO * activateSpark (rtsSpark spark);
+#endif
+
+#ifdef DEBUG
+static char *whatNext_strs[] = {
+ "(unknown)",
+ "ThreadRunGHC",
+ "ThreadInterpret",
+ "ThreadKilled",
+ "ThreadRelocated",
+ "ThreadComplete"
+};
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Putting a thread on the run queue: different scheduling policies
+ * -------------------------------------------------------------------------- */
+
+STATIC_INLINE void
+addToRunQueue( Capability *cap, StgTSO *t )
+{
+#if defined(PARALLEL_HASKELL)
+ if (RtsFlags.ParFlags.doFairScheduling) {
+ // this does round-robin scheduling; good for concurrency
+ appendToRunQueue(cap,t);
+ } else {
+ // this does unfair scheduling; good for parallelism
+ pushOnRunQueue(cap,t);
+ }
+#else
+ // this does round-robin scheduling; good for concurrency
+ appendToRunQueue(cap,t);
+#endif
+}
+
+/* ---------------------------------------------------------------------------
+ Main scheduling loop.
+
+ We use round-robin scheduling, each thread returning to the
+ scheduler loop when one of these conditions is detected:
+
+ * out of heap space
+ * timer expires (thread yields)
+ * thread blocks
+ * thread ends
+ * stack overflow
+
+ GRAN version:
+ In a GranSim setup this loop iterates over the global event queue.
+ This revolves around the global event queue, which determines what
+ to do next. Therefore, it's more complicated than either the
+ concurrent or the parallel (GUM) setup.
+
+ GUM version:
+ GUM iterates over incoming messages.
+ It starts with nothing to do (thus CurrentTSO == END_TSO_QUEUE),
+ and sends out a fish whenever it has nothing to do; in-between
+ doing the actual reductions (shared code below) it processes the
+ incoming messages and deals with delayed operations
+ (see PendingFetches).
+ This is not the ugliest code you could imagine, but it's bloody close.
+
+ ------------------------------------------------------------------------ */
+
+static Capability *
+schedule (Capability *initialCapability, Task *task)
+{
+ StgTSO *t;
+ Capability *cap;
+ StgThreadReturnCode ret;
+#if defined(GRAN)
+ rtsEvent *event;
+#elif defined(PARALLEL_HASKELL)
+ StgTSO *tso;
+ GlobalTaskId pe;
+ rtsBool receivedFinish = rtsFalse;
+# if defined(DEBUG)
+ nat tp_size, sp_size; // stats only
+# endif
+#endif
+ nat prev_what_next;
+ rtsBool ready_to_gc;
+#if defined(THREADED_RTS)
+ rtsBool first = rtsTrue;
+#endif
+
+ cap = initialCapability;
+
+ // Pre-condition: this task owns initialCapability.
+ // The sched_mutex is *NOT* held
+ // NB. on return, we still hold a capability.
+
+ IF_DEBUG(scheduler,
+ sched_belch("### NEW SCHEDULER LOOP (task: %p, cap: %p)",
+ task, initialCapability);
+ );
+
+ schedulePreLoop();
+
+ // -----------------------------------------------------------
+ // Scheduler loop starts here:
+
+#if defined(PARALLEL_HASKELL)
+#define TERMINATION_CONDITION (!receivedFinish)
+#elif defined(GRAN)
+#define TERMINATION_CONDITION ((event = get_next_event()) != (rtsEvent*)NULL)
+#else
+#define TERMINATION_CONDITION rtsTrue
+#endif
+
+ while (TERMINATION_CONDITION) {
+
+#if defined(GRAN)
+ /* Choose the processor with the next event */
+ CurrentProc = event->proc;
+ CurrentTSO = event->tso;
+#endif
+
+#if defined(THREADED_RTS)
+ if (first) {
+ // don't yield the first time, we want a chance to run this
+ // thread for a bit, even if there are others banging at the
+ // door.
+ first = rtsFalse;
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+ } else {
+ // Yield the capability to higher-priority tasks if necessary.
+ yieldCapability(&cap, task);
+ }
+#endif
+
+#if defined(THREADED_RTS)
+ schedulePushWork(cap,task);
+#endif
+
+ // Check whether we have re-entered the RTS from Haskell without
+ // going via suspendThread()/resumeThread (i.e. a 'safe' foreign
+ // call).
+ if (cap->in_haskell) {
+ errorBelch("schedule: re-entered unsafely.\n"
+ " Perhaps a 'foreign import unsafe' should be 'safe'?");
+ stg_exit(EXIT_FAILURE);
+ }
+
+ // The interruption / shutdown sequence.
+ //
+ // In order to cleanly shut down the runtime, we want to:
+ // * make sure that all main threads return to their callers
+ // with the state 'Interrupted'.
+    //   * clean up all OS threads associated with the runtime
+ // * free all memory etc.
+ //
+ // So the sequence for ^C goes like this:
+ //
+ // * ^C handler sets sched_state := SCHED_INTERRUPTING and
+ // arranges for some Capability to wake up
+ //
+ // * all threads in the system are halted, and the zombies are
+ // placed on the run queue for cleaning up. We acquire all
+ // the capabilities in order to delete the threads, this is
+ // done by scheduleDoGC() for convenience (because GC already
+ // needs to acquire all the capabilities). We can't kill
+ // threads involved in foreign calls.
+ //
+ // * sched_state := SCHED_INTERRUPTED
+ //
+ // * somebody calls shutdownHaskell(), which calls exitScheduler()
+ //
+ // * sched_state := SCHED_SHUTTING_DOWN
+ //
+ // * all workers exit when the run queue on their capability
+ // drains. All main threads will also exit when their TSO
+ // reaches the head of the run queue and they can return.
+ //
+ // * eventually all Capabilities will shut down, and the RTS can
+ // exit.
+ //
+ // * We might be left with threads blocked in foreign calls,
+ // we should really attempt to kill these somehow (TODO);
+
+ switch (sched_state) {
+ case SCHED_RUNNING:
+ break;
+ case SCHED_INTERRUPTING:
+ IF_DEBUG(scheduler, sched_belch("SCHED_INTERRUPTING"));
+#if defined(THREADED_RTS)
+ discardSparksCap(cap);
+#endif
+ /* scheduleDoGC() deletes all the threads */
+ cap = scheduleDoGC(cap,task,rtsFalse,GetRoots);
+ break;
+ case SCHED_INTERRUPTED:
+ IF_DEBUG(scheduler, sched_belch("SCHED_INTERRUPTED"));
+ break;
+ case SCHED_SHUTTING_DOWN:
+ IF_DEBUG(scheduler, sched_belch("SCHED_SHUTTING_DOWN"));
+ // If we are a worker, just exit. If we're a bound thread
+ // then we will exit below when we've removed our TSO from
+ // the run queue.
+ if (task->tso == NULL && emptyRunQueue(cap)) {
+ return cap;
+ }
+ break;
+ default:
+ barf("sched_state: %d", sched_state);
+ }
+
+#if defined(THREADED_RTS)
+ // If the run queue is empty, take a spark and turn it into a thread.
+ {
+ if (emptyRunQueue(cap)) {
+ StgClosure *spark;
+ spark = findSpark(cap);
+ if (spark != NULL) {
+ IF_DEBUG(scheduler,
+ sched_belch("turning spark of closure %p into a thread",
+ (StgClosure *)spark));
+ createSparkThread(cap,spark);
+ }
+ }
+ }
+#endif // THREADED_RTS
+
+ scheduleStartSignalHandlers(cap);
+
+ // Only check the black holes here if we've nothing else to do.
+ // During normal execution, the black hole list only gets checked
+ // at GC time, to avoid repeatedly traversing this possibly long
+ // list each time around the scheduler.
+ if (emptyRunQueue(cap)) { scheduleCheckBlackHoles(cap); }
+
+ scheduleCheckWakeupThreads(cap);
+
+ scheduleCheckBlockedThreads(cap);
+
+ scheduleDetectDeadlock(cap,task);
+#if defined(THREADED_RTS)
+ cap = task->cap; // reload cap, it might have changed
+#endif
+
+ // Normally, the only way we can get here with no threads to
+ // run is if a keyboard interrupt received during
+ // scheduleCheckBlockedThreads() or scheduleDetectDeadlock().
+ // Additionally, it is not fatal for the
+ // threaded RTS to reach here with no threads to run.
+ //
+ // win32: might be here due to awaitEvent() being abandoned
+ // as a result of a console event having been delivered.
+ if ( emptyRunQueue(cap) ) {
+#if !defined(THREADED_RTS) && !defined(mingw32_HOST_OS)
+ ASSERT(sched_state >= SCHED_INTERRUPTING);
+#endif
+ continue; // nothing to do
+ }
+
+#if defined(PARALLEL_HASKELL)
+ scheduleSendPendingMessages();
+ if (emptyRunQueue(cap) && scheduleActivateSpark())
+ continue;
+
+#if defined(SPARKS)
+ ASSERT(next_fish_to_send_at==0); // i.e. no delayed fishes left!
+#endif
+
+ /* If we still have no work we need to send a FISH to get a spark
+ from another PE */
+ if (emptyRunQueue(cap)) {
+ if (!scheduleGetRemoteWork(&receivedFinish)) continue;
+ ASSERT(rtsFalse); // should not happen at the moment
+ }
+ // from here: non-empty run queue.
+ // TODO: merge above case with this, only one call processMessages() !
+ if (PacketsWaiting()) { /* process incoming messages, if
+ any pending... only in else
+ because getRemoteWork waits for
+ messages as well */
+ receivedFinish = processMessages();
+ }
+#endif
+
+#if defined(GRAN)
+ scheduleProcessEvent(event);
+#endif
+
+ //
+ // Get a thread to run
+ //
+ t = popRunQueue(cap);
+
+#if defined(GRAN) || defined(PAR)
+    scheduleGranParReport(); // some kind of debugging output
+#else
+ // Sanity check the thread we're about to run. This can be
+ // expensive if there is lots of thread switching going on...
+ IF_DEBUG(sanity,checkTSO(t));
+#endif
+
+#if defined(THREADED_RTS)
+ // Check whether we can run this thread in the current task.
+ // If not, we have to pass our capability to the right task.
+ {
+ Task *bound = t->bound;
+
+ if (bound) {
+ if (bound == task) {
+ IF_DEBUG(scheduler,
+ sched_belch("### Running thread %d in bound thread",
+ t->id));
+ // yes, the Haskell thread is bound to the current native thread
+ } else {
+ IF_DEBUG(scheduler,
+ sched_belch("### thread %d bound to another OS thread",
+ t->id));
+ // no, bound to a different Haskell thread: pass to that thread
+ pushOnRunQueue(cap,t);
+ continue;
+ }
+ } else {
+ // The thread we want to run is unbound.
+ if (task->tso) {
+ IF_DEBUG(scheduler,
+ sched_belch("### this OS thread cannot run thread %d", t->id));
+ // no, the current native thread is bound to a different
+ // Haskell thread, so pass it to any worker thread
+ pushOnRunQueue(cap,t);
+ continue;
+ }
+ }
+ }
+#endif
+
+ cap->r.rCurrentTSO = t;
+
+ /* context switches are initiated by the timer signal, unless
+ * the user specified "context switch as often as possible", with
+ * +RTS -C0
+ */
+ if (RtsFlags.ConcFlags.ctxtSwitchTicks == 0
+ && !emptyThreadQueues(cap)) {
+ context_switch = 1;
+ }
+
+run_thread:
+
+ IF_DEBUG(scheduler, sched_belch("-->> running thread %ld %s ...",
+ (long)t->id, whatNext_strs[t->what_next]));
+
+#if defined(PROFILING)
+ startHeapProfTimer();
+#endif
+
+ // ----------------------------------------------------------------------
+ // Run the current thread
+
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+ ASSERT(t->cap == cap);
+
+ prev_what_next = t->what_next;
+
+ errno = t->saved_errno;
+ cap->in_haskell = rtsTrue;
+
+ dirtyTSO(t);
+
+ recent_activity = ACTIVITY_YES;
+
+ switch (prev_what_next) {
+
+ case ThreadKilled:
+ case ThreadComplete:
+ /* Thread already finished, return to scheduler. */
+ ret = ThreadFinished;
+ break;
+
+ case ThreadRunGHC:
+ {
+ StgRegTable *r;
+ r = StgRun((StgFunPtr) stg_returnToStackTop, &cap->r);
+ cap = regTableToCapability(r);
+ ret = r->rRet;
+ break;
+ }
+
+ case ThreadInterpret:
+ cap = interpretBCO(cap);
+ ret = cap->r.rRet;
+ break;
+
+ default:
+ barf("schedule: invalid what_next field");
+ }
+
+ cap->in_haskell = rtsFalse;
+
+ // The TSO might have moved, eg. if it re-entered the RTS and a GC
+ // happened. So find the new location:
+ t = cap->r.rCurrentTSO;
+
+ // We have run some Haskell code: there might be blackhole-blocked
+ // threads to wake up now.
+ // Lock-free test here should be ok, we're just setting a flag.
+ if ( blackhole_queue != END_TSO_QUEUE ) {
+ blackholes_need_checking = rtsTrue;
+ }
+
+ // And save the current errno in this thread.
+ // XXX: possibly bogus for SMP because this thread might already
+ // be running again, see code below.
+ t->saved_errno = errno;
+
+#if defined(THREADED_RTS)
+ // If ret is ThreadBlocked, and this Task is bound to the TSO that
+ // blocked, we are in limbo - the TSO is now owned by whatever it
+ // is blocked on, and may in fact already have been woken up,
+ // perhaps even on a different Capability. It may be the case
+ // that task->cap != cap. We better yield this Capability
+    // immediately and return to normality.
+ if (ret == ThreadBlocked) {
+ IF_DEBUG(scheduler,
+ sched_belch("--<< thread %d (%s) stopped: blocked\n",
+ t->id, whatNext_strs[t->what_next]));
+ continue;
+ }
+#endif
+
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+ ASSERT(t->cap == cap);
+
+ // ----------------------------------------------------------------------
+
+ // Costs for the scheduler are assigned to CCS_SYSTEM
+#if defined(PROFILING)
+ stopHeapProfTimer();
+ CCCS = CCS_SYSTEM;
+#endif
+
+#if defined(THREADED_RTS)
+ IF_DEBUG(scheduler,debugBelch("sched (task %p): ", (void *)(unsigned long)(unsigned int)osThreadId()););
+#elif !defined(GRAN) && !defined(PARALLEL_HASKELL)
+ IF_DEBUG(scheduler,debugBelch("sched: "););
+#endif
+
+ schedulePostRunThread();
+
+ ready_to_gc = rtsFalse;
+
+ switch (ret) {
+ case HeapOverflow:
+ ready_to_gc = scheduleHandleHeapOverflow(cap,t);
+ break;
+
+ case StackOverflow:
+ scheduleHandleStackOverflow(cap,task,t);
+ break;
+
+ case ThreadYielding:
+ if (scheduleHandleYield(cap, t, prev_what_next)) {
+ // shortcut for switching between compiler/interpreter:
+ goto run_thread;
+ }
+ break;
+
+ case ThreadBlocked:
+ scheduleHandleThreadBlocked(t);
+ break;
+
+ case ThreadFinished:
+ if (scheduleHandleThreadFinished(cap, task, t)) return cap;
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+ break;
+
+ default:
+ barf("schedule: invalid thread return code %d", (int)ret);
+ }
+
+ if (scheduleDoHeapProfile(ready_to_gc)) { ready_to_gc = rtsFalse; }
+ if (ready_to_gc) {
+ cap = scheduleDoGC(cap,task,rtsFalse,GetRoots);
+ }
+ } /* end of while() */
+
+ IF_PAR_DEBUG(verbose,
+ debugBelch("== Leaving schedule() after having received Finish\n"));
+}
+
+/* ----------------------------------------------------------------------------
+ * Setting up the scheduler loop
+ * ------------------------------------------------------------------------- */
+
+static void
+schedulePreLoop(void)
+{
+#if defined(GRAN)
+ /* set up first event to get things going */
+ /* ToDo: assign costs for system setup and init MainTSO ! */
+ new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
+ ContinueThread,
+ CurrentTSO, (StgClosure*)NULL, (rtsSpark*)NULL);
+
+ IF_DEBUG(gran,
+ debugBelch("GRAN: Init CurrentTSO (in schedule) = %p\n",
+ CurrentTSO);
+ G_TSO(CurrentTSO, 5));
+
+ if (RtsFlags.GranFlags.Light) {
+ /* Save current time; GranSim Light only */
+ CurrentTSO->gran.clock = CurrentTime[CurrentProc];
+ }
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * schedulePushWork()
+ *
+ * Push work to other Capabilities if we have some.
+ * -------------------------------------------------------------------------- */
+
+#if defined(THREADED_RTS)
+static void
+schedulePushWork(Capability *cap USED_IF_THREADS,
+ Task *task USED_IF_THREADS)
+{
+ Capability *free_caps[n_capabilities], *cap0;
+ nat i, n_free_caps;
+
+ // migration can be turned off with +RTS -qg
+ if (!RtsFlags.ParFlags.migrate) return;
+
+ // Check whether we have more threads on our run queue, or sparks
+ // in our pool, that we could hand to another Capability.
+ if ((emptyRunQueue(cap) || cap->run_queue_hd->link == END_TSO_QUEUE)
+ && sparkPoolSizeCap(cap) < 2) {
+ return;
+ }
+
+ // First grab as many free Capabilities as we can.
+ for (i=0, n_free_caps=0; i < n_capabilities; i++) {
+ cap0 = &capabilities[i];
+ if (cap != cap0 && tryGrabCapability(cap0,task)) {
+ if (!emptyRunQueue(cap0) || cap->returning_tasks_hd != NULL) {
+ // it already has some work, we just grabbed it at
+ // the wrong moment. Or maybe it's deadlocked!
+ releaseCapability(cap0);
+ } else {
+ free_caps[n_free_caps++] = cap0;
+ }
+ }
+ }
+
+ // we now have n_free_caps free capabilities stashed in
+ // free_caps[]. Share our run queue equally with them. This is
+ // probably the simplest thing we could do; improvements we might
+ // want to do include:
+ //
+ // - giving high priority to moving relatively new threads, on
+	//     the grounds that they haven't had time to build up a
+ // working set in the cache on this CPU/Capability.
+ //
+ // - giving low priority to moving long-lived threads
+
+ if (n_free_caps > 0) {
+ StgTSO *prev, *t, *next;
+ rtsBool pushed_to_all;
+
+ IF_DEBUG(scheduler, sched_belch("excess threads on run queue and %d free capabilities, sharing...", n_free_caps));
+
+ i = 0;
+ pushed_to_all = rtsFalse;
+
+ if (cap->run_queue_hd != END_TSO_QUEUE) {
+ prev = cap->run_queue_hd;
+ t = prev->link;
+ prev->link = END_TSO_QUEUE;
+ for (; t != END_TSO_QUEUE; t = next) {
+ next = t->link;
+ t->link = END_TSO_QUEUE;
+ if (t->what_next == ThreadRelocated
+ || t->bound == task // don't move my bound thread
+ || tsoLocked(t)) { // don't move a locked thread
+ prev->link = t;
+ prev = t;
+ } else if (i == n_free_caps) {
+ pushed_to_all = rtsTrue;
+ i = 0;
+ // keep one for us
+ prev->link = t;
+ prev = t;
+ } else {
+ IF_DEBUG(scheduler, sched_belch("pushing thread %d to capability %d", t->id, free_caps[i]->no));
+ appendToRunQueue(free_caps[i],t);
+ if (t->bound) { t->bound->cap = free_caps[i]; }
+ t->cap = free_caps[i];
+ i++;
+ }
+ }
+ cap->run_queue_tl = prev;
+ }
+
+ // If there are some free capabilities that we didn't push any
+ // threads to, then try to push a spark to each one.
+ if (!pushed_to_all) {
+ StgClosure *spark;
+ // i is the next free capability to push to
+ for (; i < n_free_caps; i++) {
+ if (emptySparkPoolCap(free_caps[i])) {
+ spark = findSpark(cap);
+ if (spark != NULL) {
+ IF_DEBUG(scheduler, sched_belch("pushing spark %p to capability %d", spark, free_caps[i]->no));
+ newSpark(&(free_caps[i]->r), spark);
+ }
+ }
+ }
+ }
+
+ // release the capabilities
+ for (i = 0; i < n_free_caps; i++) {
+ task->cap = free_caps[i];
+ releaseCapability(free_caps[i]);
+ }
+ }
+ task->cap = cap; // reset to point to our Capability.
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Start any pending signal handlers
+ * ------------------------------------------------------------------------- */
+
+#if defined(RTS_USER_SIGNALS) && (!defined(THREADED_RTS) || defined(mingw32_HOST_OS))
+static void
+scheduleStartSignalHandlers(Capability *cap)
+{
+ if (signals_pending()) { // safe outside the lock
+ startSignalHandlers(cap);
+ }
+}
+#else
+static void
+scheduleStartSignalHandlers(Capability *cap STG_UNUSED)
+{
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Check for blocked threads that can be woken up.
+ * ------------------------------------------------------------------------- */
+
+static void
+scheduleCheckBlockedThreads(Capability *cap USED_IF_NOT_THREADS)
+{
+#if !defined(THREADED_RTS)
+ //
+ // Check whether any waiting threads need to be woken up. If the
+ // run queue is empty, and there are no other tasks running, we
+ // can wait indefinitely for something to happen.
+ //
+ if ( !emptyQueue(blocked_queue_hd) || !emptyQueue(sleeping_queue) )
+ {
+ awaitEvent( emptyRunQueue(cap) && !blackholes_need_checking );
+ }
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Check for threads woken up by other Capabilities
+ * ------------------------------------------------------------------------- */
+
+static void
+scheduleCheckWakeupThreads(Capability *cap USED_IF_THREADS)
+{
+#if defined(THREADED_RTS)
+ // Any threads that were woken up by other Capabilities get
+ // appended to our run queue.
+ if (!emptyWakeupQueue(cap)) {
+ ACQUIRE_LOCK(&cap->lock);
+ if (emptyRunQueue(cap)) {
+ cap->run_queue_hd = cap->wakeup_queue_hd;
+ cap->run_queue_tl = cap->wakeup_queue_tl;
+ } else {
+ cap->run_queue_tl->link = cap->wakeup_queue_hd;
+ cap->run_queue_tl = cap->wakeup_queue_tl;
+ }
+ cap->wakeup_queue_hd = cap->wakeup_queue_tl = END_TSO_QUEUE;
+ RELEASE_LOCK(&cap->lock);
+ }
+#endif
+}
+
+/* ----------------------------------------------------------------------------
+ * Check for threads blocked on BLACKHOLEs that can be woken up
+ * ------------------------------------------------------------------------- */
+static void
+scheduleCheckBlackHoles (Capability *cap)
+{
+ if ( blackholes_need_checking ) // check without the lock first
+ {
+ ACQUIRE_LOCK(&sched_mutex);
+ if ( blackholes_need_checking ) {
+ checkBlackHoles(cap);
+ blackholes_need_checking = rtsFalse;
+ }
+ RELEASE_LOCK(&sched_mutex);
+ }
+}
+
+/* ----------------------------------------------------------------------------
+ * Detect deadlock conditions and attempt to resolve them.
+ * ------------------------------------------------------------------------- */
+
+static void
+scheduleDetectDeadlock (Capability *cap, Task *task)
+{
+
+#if defined(PARALLEL_HASKELL)
+ // ToDo: add deadlock detection in GUM (similar to THREADED_RTS) -- HWL
+ return;
+#endif
+
+ /*
+ * Detect deadlock: when we have no threads to run, there are no
+ * threads blocked, waiting for I/O, or sleeping, and all the
+ * other tasks are waiting for work, we must have a deadlock of
+ * some description.
+ */
+ if ( emptyThreadQueues(cap) )
+ {
+#if defined(THREADED_RTS)
+ /*
+ * In the threaded RTS, we only check for deadlock if there
+ * has been no activity in a complete timeslice. This means
+ * we won't eagerly start a full GC just because we don't have
+ * any threads to run currently.
+ */
+ if (recent_activity != ACTIVITY_INACTIVE) return;
+#endif
+
+ IF_DEBUG(scheduler, sched_belch("deadlocked, forcing major GC..."));
+
+ // Garbage collection can release some new threads due to
+ // either (a) finalizers or (b) threads resurrected because
+ // they are unreachable and will therefore be sent an
+ // exception. Any threads thus released will be immediately
+ // runnable.
+ cap = scheduleDoGC (cap, task, rtsTrue/*force major GC*/, GetRoots);
+
+ recent_activity = ACTIVITY_DONE_GC;
+
+ if ( !emptyRunQueue(cap) ) return;
+
+#if defined(RTS_USER_SIGNALS) && (!defined(THREADED_RTS) || defined(mingw32_HOST_OS))
+ /* If we have user-installed signal handlers, then wait
+ * for signals to arrive rather then bombing out with a
+ * deadlock.
+ */
+ if ( anyUserHandlers() ) {
+ IF_DEBUG(scheduler,
+ sched_belch("still deadlocked, waiting for signals..."));
+
+ awaitUserSignals();
+
+ if (signals_pending()) {
+ startSignalHandlers(cap);
+ }
+
+ // either we have threads to run, or we were interrupted:
+ ASSERT(!emptyRunQueue(cap) || sched_state >= SCHED_INTERRUPTING);
+ }
+#endif
+
+#if !defined(THREADED_RTS)
+ /* Probably a real deadlock. Send the current main thread the
+ * Deadlock exception.
+ */
+ if (task->tso) {
+ switch (task->tso->why_blocked) {
+ case BlockedOnSTM:
+ case BlockedOnBlackHole:
+ case BlockedOnException:
+ case BlockedOnMVar:
+ raiseAsync(cap, task->tso, (StgClosure *)NonTermination_closure);
+ return;
+ default:
+ barf("deadlock: main thread blocked in a strange way");
+ }
+ }
+ return;
+#endif
+ }
+}
+
+/* ----------------------------------------------------------------------------
+ * Process an event (GRAN only)
+ * ------------------------------------------------------------------------- */
+
+#if defined(GRAN)
+/* Process the next GranSim event: advance the simulated clock, let idle
+ * PEs look for work (FindWork/MoveSpark), dispatch on the event type,
+ * and finally pop the selected thread off the run queue for the caller
+ * to run.
+ *
+ * NOTE(review): several cases do 'goto next_thread', but no such label
+ * is defined in this function -- it presumably remained behind in
+ * schedule() when this code was split out; confirm before building
+ * with GRAN enabled.
+ */
+static StgTSO *
+scheduleProcessEvent(rtsEvent *event)
+{
+  StgTSO *t;
+
+  if (RtsFlags.GranFlags.Light)
+    GranSimLight_enter_system(event, &ActiveTSO); // adjust ActiveTSO etc
+
+  /* adjust time based on time-stamp */
+  if (event->time > CurrentTime[CurrentProc] &&
+      event->evttype != ContinueThread)
+    CurrentTime[CurrentProc] = event->time;
+
+  /* Deal with the idle PEs (may issue FindWork or MoveSpark events) */
+  if (!RtsFlags.GranFlags.Light)
+    handleIdlePEs();
+
+  IF_DEBUG(gran, debugBelch("GRAN: switch by event-type\n"));
+
+  /* main event dispatcher in GranSim */
+  switch (event->evttype) {
+    /* Should just be continuing execution */
+    case ContinueThread:
+      IF_DEBUG(gran, debugBelch("GRAN: doing ContinueThread\n"));
+      /* ToDo: check assertion
+      ASSERT(run_queue_hd != (StgTSO*)NULL &&
+	     run_queue_hd != END_TSO_QUEUE);
+      */
+      /* Ignore ContinueThreads for fetching threads (if synchr comm) */
+      if (!RtsFlags.GranFlags.DoAsyncFetch &&
+	  procStatus[CurrentProc]==Fetching) {
+	debugBelch("ghuH: Spurious ContinueThread while Fetching ignored; TSO %d (%p) [PE %d]\n",
+	      CurrentTSO->id, CurrentTSO, CurrentProc);
+	goto next_thread;
+      }
+      /* Ignore ContinueThreads for completed threads */
+      if (CurrentTSO->what_next == ThreadComplete) {
+	debugBelch("ghuH: found a ContinueThread event for completed thread %d (%p) [PE %d] (ignoring ContinueThread)\n",
+	      CurrentTSO->id, CurrentTSO, CurrentProc);
+	goto next_thread;
+      }
+      /* Ignore ContinueThreads for threads that are being migrated */
+      if (PROCS(CurrentTSO)==Nowhere) {
+	debugBelch("ghuH: trying to run the migrating TSO %d (%p) [PE %d] (ignoring ContinueThread)\n",
+	      CurrentTSO->id, CurrentTSO, CurrentProc);
+	goto next_thread;
+      }
+      /* The thread should be at the beginning of the run queue */
+      if (CurrentTSO!=run_queue_hds[CurrentProc]) {
+	debugBelch("ghuH: TSO %d (%p) [PE %d] is not at the start of the run_queue when doing a ContinueThread\n",
+	      CurrentTSO->id, CurrentTSO, CurrentProc);
+	break; // run the thread anyway
+      }
+      /*
+      new_event(proc, proc, CurrentTime[proc],
+		FindWork,
+		(StgTSO*)NULL, (StgClosure*)NULL, (rtsSpark*)NULL);
+      goto next_thread;
+      */ /* Catches superfluous CONTINUEs -- should be unnecessary */
+      break; // now actually run the thread; DaH Qu'vam yImuHbej
+
+    case FetchNode:
+      do_the_fetchnode(event);
+      goto next_thread;             /* handle next event in event queue  */
+
+    case GlobalBlock:
+      do_the_globalblock(event);
+      goto next_thread;             /* handle next event in event queue  */
+
+    case FetchReply:
+      do_the_fetchreply(event);
+      goto next_thread;             /* handle next event in event queue  */
+
+    case UnblockThread:   /* Move from the blocked queue to the tail of */
+      do_the_unblock(event);
+      goto next_thread;             /* handle next event in event queue  */
+
+    case ResumeThread:  /* Move from the blocked queue to the tail of */
+      /* the runnable queue ( i.e. Qu' SImqa'lu') */
+      event->tso->gran.blocktime +=
+	CurrentTime[CurrentProc] - event->tso->gran.blockedat;
+      do_the_startthread(event);
+      goto next_thread;             /* handle next event in event queue  */
+
+    case StartThread:
+      do_the_startthread(event);
+      goto next_thread;             /* handle next event in event queue  */
+
+    case MoveThread:
+      do_the_movethread(event);
+      goto next_thread;             /* handle next event in event queue  */
+
+    case MoveSpark:
+      do_the_movespark(event);
+      goto next_thread;             /* handle next event in event queue  */
+
+    case FindWork:
+      do_the_findwork(event);
+      goto next_thread;             /* handle next event in event queue  */
+
+    default:
+      barf("Illegal event type %u\n", event->evttype);
+  }  /* switch */
+
+  /* This point was scheduler_loop in the old RTS */
+
+  IF_DEBUG(gran, debugBelch("GRAN: after main switch\n"));
+
+  TimeOfLastEvent = CurrentTime[CurrentProc];
+  TimeOfNextEvent = get_time_of_next_event();
+  IgnoreEvents=(TimeOfNextEvent==0); // HWL HACK
+  // CurrentTSO = ThreadQueueHd;
+
+  IF_DEBUG(gran, debugBelch("GRAN: time of next event is: %ld\n",
+			    TimeOfNextEvent));
+
+  if (RtsFlags.GranFlags.Light)
+    GranSimLight_leave_system(event, &ActiveTSO);
+
+  EndOfTimeSlice = CurrentTime[CurrentProc]+RtsFlags.GranFlags.time_slice;
+
+  IF_DEBUG(gran,
+	   debugBelch("GRAN: end of time-slice is %#lx\n", EndOfTimeSlice));
+
+  /* in a GranSim setup the TSO stays on the run queue */
+  t = CurrentTSO;
+  /* Take a thread from the run queue. */
+  POP_RUN_QUEUE(t); // take_off_run_queue(t);
+
+  IF_DEBUG(gran,
+	   debugBelch("GRAN: About to run current thread, which is\n");
+	   G_TSO(t,5));
+
+  context_switch = 0; // turned on via GranYield, checking events and time slice
+
+  IF_DEBUG(gran,
+	   DumpGranEvent(GR_SCHEDULE, t));
+
+  procStatus[CurrentProc] = Busy;
+
+  // Fix: previously this function fell off the end without a return
+  // statement, so a caller using the result read an indeterminate
+  // value (undefined behaviour).  Return the thread we just selected.
+  return t;
+}
+#endif // GRAN
+
+/* ----------------------------------------------------------------------------
+ * Send pending messages (PARALLEL_HASKELL only)
+ * ------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL)
+/* Send pending messages: process any completed fetches and, when
+ * message buffering is enabled, flush message packets that have become
+ * too old.  Sending messages never yields a runnable thread, so the
+ * result is always NULL.
+ */
+static StgTSO *
+scheduleSendPendingMessages(void)
+{
+
+# if defined(PAR) // global Mem.Mgmt., omit for now
+    if (PendingFetches != END_BF_QUEUE) {
+	processFetches();
+    }
+# endif
+
+    if (RtsFlags.ParFlags.BufferTime) {
+	// if we use message buffering, we must send away all message
+	// packets which have become too old...
+	sendOldBuffers();
+    }
+
+    // Fix: this non-void function previously had no return statement
+    // (undefined behaviour if the caller used the result).  It also
+    // declared three locals (pool, spark, t) that were never used;
+    // they have been removed.
+    return (StgTSO *)NULL;
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Activate spark threads (PARALLEL_HASKELL only)
+ * ------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL)
+/* Try to turn a local spark into a thread and put it on the run queue,
+ * from where it will be picked up in the next iteration of the
+ * scheduler loop.  Called when the run queue is empty and we want work.
+ *
+ * Fixes in this revision: the function is declared void but previously
+ * executed 'return rtsTrue/rtsFalse' (a C constraint violation,
+ * ISO C99 6.8.6.4), and used pool/spark/tso without declaring them.
+ * The returns are now bare -- the void interface is unchanged, so no
+ * caller could ever have observed the values anyway -- and the locals
+ * are declared.
+ */
+static void
+scheduleActivateSpark(void)
+{
+#if defined(SPARKS)
+  StgSparkPool *pool;
+  rtsSpark spark;
+  StgTSO *tso;
+
+  ASSERT(emptyRunQueue());
+/* We get here if the run queue is empty and want some work.
+   We try to turn a spark into a thread, and add it to the run queue,
+   from where it will be picked up in the next iteration of the scheduler
+   loop.
+*/
+
+      /* :-[ no local threads => look out for local sparks */
+      /* the spark pool for the current PE */
+      pool = &(cap.r.rSparks); // JB: cap = (old) MainCap
+      if (advisory_thread_count < RtsFlags.ParFlags.maxThreads &&
+	  pool->hd < pool->tl) {
+	/*
+	 * ToDo: add GC code check that we really have enough heap afterwards!!
+	 * Old comment:
+	 * If we're here (no runnable threads) and we have pending
+	 * sparks, we must have a space problem.  Get enough space
+	 * to turn one of those pending sparks into a
+	 * thread...
+	 */
+
+	spark = findSpark(rtsFalse);            /* get a spark */
+	if (spark != (rtsSpark) NULL) {
+	  tso = createThreadFromSpark(spark);       /* turn the spark into a thread */
+	  IF_PAR_DEBUG(fish, // schedule,
+		       debugBelch("==== schedule: Created TSO %d (%p); %d threads active\n",
+			     tso->id, tso, advisory_thread_count));
+
+	  if (tso==END_TSO_QUEUE) { /* failed to activate spark->back to loop */
+	    IF_PAR_DEBUG(fish, // schedule,
+			 debugBelch("==^^ failed to create thread from spark @ %lx\n",
+			       spark));
+	    return;                /* failed to generate a thread */
+	  }               /* otherwise fall through & pick-up new tso */
+	} else {
+	  IF_PAR_DEBUG(fish, // schedule,
+		       debugBelch("==^^ no local sparks (spark pool contains only NFs: %d)\n",
+			     spark_queue_len(pool)));
+	  return;                  /* failed to generate a thread */
+	}
+	return;                    /* success in generating a thread */
+  } else { /* no more threads permitted or pool empty */
+    return;                       /* failed to generateThread */
+  }
+#else
+  return;                         /* dummy in non-PAR setup */
+#endif // SPARKS
+}
+#endif // PARALLEL_HASKELL
+
+/* ----------------------------------------------------------------------------
+ * Get work from a remote node (PARALLEL_HASKELL only)
+ * ------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL)
+/* Try to obtain work from remote PEs when the local run queue and spark
+ * pool are empty.
+ *
+ * Without SPARKS (Eden): flush buffered messages and block waiting for
+ * an incoming message, recording via *receivedFinish whether a Finish
+ * message arrived.  With SPARKS (GUM): possibly wait out the fish
+ * delay (processing incoming messages meanwhile), then send a FISH
+ * message to another PE.
+ *
+ * Always returns rtsFalse: the caller re-enters the scheduler loop and
+ * picks up whatever work eventually arrives (see the comment at the
+ * bottom of the function).
+ *
+ * NOTE(review): 'delay' and 'next_fish_to_send_at' are used but not
+ * declared in this function -- assumed to be file-scope variables
+ * defined earlier in this file; confirm before building with
+ * PARALLEL_HASKELL.
+ */
+static rtsBool
+scheduleGetRemoteWork(rtsBool *receivedFinish)
+{
+  ASSERT(emptyRunQueue());
+
+  if (RtsFlags.ParFlags.BufferTime) {
+	IF_PAR_DEBUG(verbose,
+	        debugBelch("...send all pending data,"));
+        {
+	  nat i;
+	  for (i=1; i<=nPEs; i++)
+	    sendImmediately(i); // send all messages away immediately
+	}
+  }
+# ifndef SPARKS
+	//++EDEN++ idle() , i.e. send all buffers, wait for work
+  // suppress fishing in EDEN... just look for incoming messages
+  // (blocking receive)
+  IF_PAR_DEBUG(verbose,
+	       debugBelch("...wait for incoming messages...\n"));
+  *receivedFinish = processMessages(); // blocking receive...
+
+	// and reenter scheduling loop after having received something
+	// (return rtsFalse below)
+
+# else /* activate SPARKS machinery */
+/* We get here, if we have no work, tried to activate a local spark, but still
+   have no work. We try to get a remote spark, by sending a FISH message.
+   Thread migration should be added here, and triggered when a sequence of
+   fishes returns without work. */
+	delay = (RtsFlags.ParFlags.fishDelay!=0ll ? RtsFlags.ParFlags.fishDelay : 0ll);
+
+      /* =8-[  no local sparks => look for work on other PEs */
+	/*
+	 * We really have absolutely no work.  Send out a fish
+	 * (there may be some out there already), and wait for
+	 * something to arrive.  We clearly can't run any threads
+	 * until a SCHEDULE or RESUME arrives, and so that's what
+	 * we're hoping to see.  (Of course, we still have to
+	 * respond to other types of messages.)
+	 */
+	rtsTime now = msTime() /*CURRENT_TIME*/;
+	IF_PAR_DEBUG(verbose,
+		     debugBelch("--  now=%ld\n", now));
+	IF_PAR_DEBUG(fish, // verbose,
+	     if (outstandingFishes < RtsFlags.ParFlags.maxFishes &&
+		 (last_fish_arrived_at!=0 &&
+		  last_fish_arrived_at+delay > now)) {
+	       debugBelch("--$$ <%llu> delaying FISH until %llu (last fish %llu, delay %llu)\n",
+		     now, last_fish_arrived_at+delay,
+		     last_fish_arrived_at,
+		     delay);
+	     });
+
+	if (outstandingFishes < RtsFlags.ParFlags.maxFishes &&
+	    advisory_thread_count < RtsFlags.ParFlags.maxThreads) { // send a FISH, but when?
+	  if (last_fish_arrived_at==0 ||
+	      (last_fish_arrived_at+delay <= now)) {           // send FISH now!
+	    /* outstandingFishes is set in sendFish, processFish;
+	       avoid flooding system with fishes via delay */
+    next_fish_to_send_at = 0;
+  } else {
+    /* ToDo: this should be done in the main scheduling loop to avoid the
+             busy wait here; not so bad if fish delay is very small  */
+    int iq = 0; // DEBUGGING -- HWL
+    next_fish_to_send_at = last_fish_arrived_at+delay; // remember when to send
+    /* send a fish when ready, but process messages that arrive in the meantime */
+    do {
+      if (PacketsWaiting()) {
+        iq++; // DEBUGGING
+        *receivedFinish = processMessages();
+      }
+      now = msTime();
+    } while (!*receivedFinish || now<next_fish_to_send_at);
+    // JB: This means the fish could become obsolete, if we receive
+    // work. Better check for work again?
+    // last line: while (!receivedFinish || !haveWork || now<...)
+    // next line: if (receivedFinish || haveWork )
+
+    if (*receivedFinish) // no need to send a FISH if we are finishing anyway
+      return rtsFalse;  // NB: this will leave scheduler loop
+			// immediately after return!
+
+    IF_PAR_DEBUG(fish, // verbose,
+	       debugBelch("--$$ <%llu> sent delayed fish (%d processMessages); active/total threads=%d/%d\n",now,iq,run_queue_len(),advisory_thread_count));
+
+  }
+
+    // JB: IMHO, this should all be hidden inside sendFish(...)
+    /* pe = choosePE();
+       sendFish(pe, thisPE, NEW_FISH_AGE, NEW_FISH_HISTORY,
+                NEW_FISH_HUNGER);
+
+    // Global statistics: count no. of fishes
+    if (RtsFlags.ParFlags.ParStats.Global &&
+	RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+	   globalParStats.tot_fish_mess++;
+	   }
+    */
+
+  /* delayed fishes must have been sent by now! */
+  next_fish_to_send_at = 0;
+  }
+
+  *receivedFinish = processMessages();
+# endif /* SPARKS */
+
+ return rtsFalse;
+ /* NB: this function always returns rtsFalse, meaning the scheduler
+    loop continues with the next iteration;
+    rationale:
+      return code means success in finding work; we enter this function
+       if there is no local work, thus have to send a fish which takes
+       time until it arrives with work; in the meantime we should process
+       messages in the main loop;
+ */
+}
+#endif // PARALLEL_HASKELL
+
+/* ----------------------------------------------------------------------------
+ * PAR/GRAN: Report stats & debugging info(?)
+ * ------------------------------------------------------------------------- */
+
+#if defined(PAR) || defined(GRAN)
+/* PAR/GRAN bookkeeping just before running a thread: pop a thread off
+ * the run queue and, with full parallel stats enabled, emit a SCHEDULE
+ * event to the log file when we are switching to a different TSO.
+ *
+ * NOTE(review): 't' and 'pool' are neither parameters nor locals of
+ * this function -- it appears to still rely on locals of the old
+ * monolithic schedule() loop it was split out of, and will not compile
+ * as-is under PAR/GRAN; confirm before enabling those configurations.
+ */
+static void
+scheduleGranParReport(void)
+{
+  ASSERT(run_queue_hd != END_TSO_QUEUE);
+
+  /* Take a thread from the run queue, if we have work */
+  POP_RUN_QUEUE(t);  // take_off_run_queue(END_TSO_QUEUE);
+
+    /* If this TSO has got its outport closed in the meantime,
+     *   it mustn't be run. Instead, we have to clean it up as if it was finished.
+     * It has to be marked as TH_DEAD for this purpose.
+     * If it is TH_TERM instead, it is supposed to have finished in the normal way.
+
+JB: TODO: investigate wether state change field could be nuked
+     entirely and replaced by the normal tso state (whatnext
+     field). All we want to do is to kill tsos from outside.
+     */
+
+    /* ToDo: write something to the log-file
+    if (RTSflags.ParFlags.granSimStats && !sameThread)
+        DumpGranEvent(GR_SCHEDULE, RunnableThreadsHd);
+
+    CurrentTSO = t;
+    */
+    /* the spark pool for the current PE */
+    pool = &(cap.r.rSparks); //  cap = (old) MainCap
+
+    IF_DEBUG(scheduler,
+	     debugBelch("--=^ %d threads, %d sparks on [%#x]\n",
+		   run_queue_len(), spark_queue_len(pool), CURRENT_PROC));
+
+    IF_PAR_DEBUG(fish,
+	     debugBelch("--=^ %d threads, %d sparks on [%#x]\n",
+		   run_queue_len(), spark_queue_len(pool), CURRENT_PROC));
+
+    if (RtsFlags.ParFlags.ParStats.Full &&
+	(t->par.sparkname != (StgInt)0) && // only log spark generated threads
+	(emitSchedule || // forced emit
+         (t && LastTSO && t->id != LastTSO->id))) {
+      /*
+	 we are running a different TSO, so write a schedule event to log file
+	 NB: If we use fair scheduling we also have to write  a deschedule
+	     event for LastTSO; with unfair scheduling we know that the
+	     previous tso has blocked whenever we switch to another tso, so
+	     we don't need it in GUM for now
+      */
+      IF_PAR_DEBUG(fish, // schedule,
+		   debugBelch("____ scheduling spark generated thread %d (%lx) (%lx) via a forced emit\n",t->id,t,t->par.sparkname));
+
+      DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
+		       GR_SCHEDULE, t, (StgClosure *)NULL, 0, 0);
+      emitSchedule = rtsFalse;
+    }
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * After running a thread...
+ * ------------------------------------------------------------------------- */
+
+/* Bookkeeping after a thread has stopped running.  In the standard RTS
+ * this is a no-op; in the GRAN/PAR/EDEN configurations it updates the
+ * global statistics counters according to the reason the thread
+ * stopped running.
+ *
+ * NOTE(review): 'ret', 't' and (under PAR) 'LastTSO'/'TimeOfLastYield'
+ * are neither parameters nor locals here -- presumably file-scope
+ * state of the parallel RTS; confirm before building those
+ * configurations.
+ */
+static void
+schedulePostRunThread(void)
+{
+#if defined(PAR)
+    /* HACK 675: if the last thread didn't yield, make sure to print a
+       SCHEDULE event to the log file when StgRunning the next thread, even
+       if it is the same one as before */
+    LastTSO = t;
+    TimeOfLastYield = CURRENT_TIME;
+#endif
+
+  /* some statistics gathering in the parallel case */
+
+#if defined(GRAN) || defined(PAR) || defined(EDEN)
+  switch (ret) {
+    case HeapOverflow:
+# if defined(GRAN)
+      IF_DEBUG(gran, DumpGranEvent(GR_DESCHEDULE, t));
+      globalGranStats.tot_heapover++;
+# elif defined(PAR)
+      globalParStats.tot_heapover++;
+# endif
+      break;
+
+    case StackOverflow:
+# if defined(GRAN)
+      IF_DEBUG(gran,
+	       DumpGranEvent(GR_DESCHEDULE, t));
+      globalGranStats.tot_stackover++;
+# elif defined(PAR)
+      // IF_DEBUG(par,
+      // DumpGranEvent(GR_DESCHEDULE, t);
+      globalParStats.tot_stackover++;
+# endif
+      break;
+
+    case ThreadYielding:
+# if defined(GRAN)
+      IF_DEBUG(gran,
+	       DumpGranEvent(GR_DESCHEDULE, t));
+      globalGranStats.tot_yields++;
+# elif defined(PAR)
+      // IF_DEBUG(par,
+      // DumpGranEvent(GR_DESCHEDULE, t);
+      globalParStats.tot_yields++;
+# endif
+      break;
+
+    case ThreadBlocked:
+# if defined(GRAN)
+      IF_DEBUG(scheduler,
+	       debugBelch("--<< thread %ld (%p; %s) stopped, blocking on node %p [PE %d] with BQ: ",
+		    t->id, t, whatNext_strs[t->what_next], t->block_info.closure,
+		    (t->block_info.closure==(StgClosure*)NULL ? 99 : where_is(t->block_info.closure)));
+	       if (t->block_info.closure!=(StgClosure*)NULL)
+		 print_bq(t->block_info.closure);
+	       debugBelch("\n"));
+
+      // ??? needed; should emit block before
+      IF_DEBUG(gran,
+	       DumpGranEvent(GR_DESCHEDULE, t));
+      prune_eventq(t, (StgClosure *)NULL); // prune ContinueThreads for t
+      /*
+	ngoq Dogh!
+      ASSERT(procStatus[CurrentProc]==Busy ||
+	      ((procStatus[CurrentProc]==Fetching) &&
+	      (t->block_info.closure!=(StgClosure*)NULL)));
+      if (run_queue_hds[CurrentProc] == END_TSO_QUEUE &&
+	  !(!RtsFlags.GranFlags.DoAsyncFetch &&
+	    procStatus[CurrentProc]==Fetching))
+	procStatus[CurrentProc] = Idle;
+      */
+# elif defined(PAR)
+//++PAR++  blockThread() writes the event (change?)
+# endif
+      break;
+
+    case ThreadFinished:
+      break;
+
+    default:
+      barf("parGlobalStats: unknown return code");
+      break;
+    }
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadHeapOverflow
+ * -------------------------------------------------------------------------- */
+
+/* Decide what to do with a thread that stopped with HeapOverflow.
+ *
+ * If the thread was asking for a large block (> BLOCK_SIZE) and the
+ * nursery is not (nearly) full, allocate a fresh block group, splice
+ * it into the nursery's block list in front of the current block, and
+ * put the thread back on the run queue: no GC needed, return rtsFalse.
+ *
+ * Otherwise put the thread back on the run queue and return rtsTrue,
+ * telling the caller (schedule()) to garbage collect.
+ */
+static rtsBool
+scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
+{
+    // did the task ask for a large block?
+    if (cap->r.rHpAlloc > BLOCK_SIZE) {
+	// if so, get one and push it on the front of the nursery.
+	bdescr *bd;
+	lnat blocks;
+
+	blocks = (lnat)BLOCK_ROUND_UP(cap->r.rHpAlloc) / BLOCK_SIZE;
+
+	IF_DEBUG(scheduler,
+		 debugBelch("--<< thread %ld (%s) stopped: requesting a large block (size %ld)\n",
+			    (long)t->id, whatNext_strs[t->what_next], blocks));
+
+	// don't do this if the nursery is (nearly) full, we'll GC first.
+	if (cap->r.rCurrentNursery->link != NULL ||
+	    cap->r.rNursery->n_blocks == 1) {  // paranoia to prevent infinite loop
+	                                       // if the nursery has only one block.
+
+	    ACQUIRE_SM_LOCK
+	    bd = allocGroup( blocks );
+	    RELEASE_SM_LOCK
+	    cap->r.rNursery->n_blocks += blocks;
+
+	    // link the new group into the list
+	    bd->link = cap->r.rCurrentNursery;
+	    bd->u.back = cap->r.rCurrentNursery->u.back;
+	    if (cap->r.rCurrentNursery->u.back != NULL) {
+		cap->r.rCurrentNursery->u.back->link = bd;
+	    } else {
+#if !defined(THREADED_RTS)
+		ASSERT(g0s0->blocks == cap->r.rCurrentNursery &&
+		       g0s0 == cap->r.rNursery);
+#endif
+		cap->r.rNursery->blocks = bd;
+	    }
+	    cap->r.rCurrentNursery->u.back = bd;
+
+	    // initialise it as a nursery block.  We initialise the
+	    // step, gen_no, and flags field of *every* sub-block in
+	    // this large block, because this is easier than making
+	    // sure that we always find the block head of a large
+	    // block whenever we call Bdescr() (eg. evacuate() and
+	    // isAlive() in the GC would both have to do this, at
+	    // least).
+	    {
+		bdescr *x;
+		for (x = bd; x < bd + blocks; x++) {
+		    x->step = cap->r.rNursery;
+		    x->gen_no = 0;
+		    x->flags = 0;
+		}
+	    }
+
+	    // This assert can be a killer if the app is doing lots
+	    // of large block allocations.
+	    IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery));
+
+	    // now update the nursery to point to the new block
+	    cap->r.rCurrentNursery = bd;
+
+	    // we might be unlucky and have another thread get on the
+	    // run queue before us and steal the large block, but in that
+	    // case the thread will just end up requesting another large
+	    // block.
+	    pushOnRunQueue(cap,t);
+	    return rtsFalse;  /* not actually GC'ing */
+	}
+    }
+
+    IF_DEBUG(scheduler,
+	     debugBelch("--<< thread %ld (%s) stopped: HeapOverflow\n",
+			(long)t->id, whatNext_strs[t->what_next]));
+#if defined(GRAN)
+    ASSERT(!is_on_queue(t,CurrentProc));
+#elif defined(PARALLEL_HASKELL)
+    /* Currently we emit a DESCHEDULE event before GC in GUM.
+       ToDo: either add separate event to distinguish SYSTEM time from rest
+       or just nuke this DESCHEDULE (and the following SCHEDULE) */
+    if (0 && RtsFlags.ParFlags.ParStats.Full) {
+	DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
+			 GR_DESCHEDULE, t, (StgClosure *)NULL, 0, 0);
+	emitSchedule = rtsTrue;
+    }
+#endif
+
+    pushOnRunQueue(cap,t);
+    return rtsTrue;
+    /* actual GC is done at the end of the while loop in schedule() */
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadStackOverflow
+ * -------------------------------------------------------------------------- */
+
+/* Deal with a thread that stopped because its stack was too small:
+ * grow the stack (which may relocate the TSO), fix up the Task's TSO
+ * pointer if it referred to the old copy, and put the thread back on
+ * the run queue so it gets rescheduled.
+ */
+static void
+scheduleHandleStackOverflow (Capability *cap, Task *task, StgTSO *t)
+{
+    StgTSO *enlarged;
+
+    IF_DEBUG(scheduler,debugBelch("--<< thread %ld (%s) stopped, StackOverflow\n",
+				     (long)t->id, whatNext_strs[t->what_next]));
+
+    // Enlarge the stack; the result may be a new, bigger TSO.
+    enlarged = threadStackOverflow(cap, t);
+
+    // The TSO attached to this Task may have moved, so update the
+    // pointer to it.
+    if (task->tso == t) {
+	task->tso = enlarged;
+    }
+
+    // Pop the (possibly relocated) thread back on the run queue.
+    pushOnRunQueue(cap,enlarged);
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadYielding
+ * -------------------------------------------------------------------------- */
+
+/* Decide what to do with a thread that stopped with ThreadYielding.
+ *
+ * Returns rtsTrue if the thread should simply be run again right away
+ * (it only stopped to switch evaluators, e.g. interpreted <-> compiled
+ * code); otherwise puts it back on the run queue and returns rtsFalse.
+ * Also resets the context-switch flag (see the comment below for why
+ * it is done here rather than before running the thread).
+ */
+static rtsBool
+scheduleHandleYield( Capability *cap, StgTSO *t, nat prev_what_next )
+{
+    // Reset the context switch flag.  We don't do this just before
+    // running the thread, because that would mean we would lose ticks
+    // during GC, which can lead to unfair scheduling (a thread hogs
+    // the CPU because the tick always arrives during GC).  This way
+    // penalises threads that do a lot of allocation, but that seems
+    // better than the alternative.
+    context_switch = 0;
+
+    /* put the thread back on the run queue.  Then, if we're ready to
+     * GC, check whether this is the last task to stop.  If so, wake
+     * up the GC thread.  getThread will block during a GC until the
+     * GC is finished.
+     */
+    IF_DEBUG(scheduler,
+	     if (t->what_next != prev_what_next) {
+		 debugBelch("--<< thread %ld (%s) stopped to switch evaluators\n",
+			    (long)t->id, whatNext_strs[t->what_next]);
+	     } else {
+		 debugBelch("--<< thread %ld (%s) stopped, yielding\n",
+			    (long)t->id, whatNext_strs[t->what_next]);
+	     }
+	     );
+
+    IF_DEBUG(sanity,
+	     //debugBelch("&& Doing sanity check on yielding TSO %ld.", t->id);
+	     checkTSO(t));
+    ASSERT(t->link == END_TSO_QUEUE);
+
+    // Shortcut if we're just switching evaluators: don't bother
+    // doing stack squeezing (which can be expensive), just run the
+    // thread.
+    if (t->what_next != prev_what_next) {
+	return rtsTrue;
+    }
+
+#if defined(GRAN)
+    ASSERT(!is_on_queue(t,CurrentProc));
+
+    IF_DEBUG(sanity,
+	     //debugBelch("&& Doing sanity check on all ThreadQueues (and their TSOs).");
+	     checkThreadQsSanity(rtsTrue));
+
+#endif
+
+    addToRunQueue(cap,t);
+
+#if defined(GRAN)
+    /* add a ContinueThread event to actually process the thread */
+    new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
+	      ContinueThread,
+	      t, (StgClosure*)NULL, (rtsSpark*)NULL);
+    IF_GRAN_DEBUG(bq,
+		  debugBelch("GRAN: eventq and runnableq after adding yielded thread to queue again:\n");
+		  G_EVENTQ(0);
+		  G_CURR_THREADQ(0));
+#endif
+    return rtsFalse;
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadBlocked
+ * -------------------------------------------------------------------------- */
+
+/* A thread stopped because it blocked.  In the standard RTS there is
+ * nothing to do -- the thread has already tidied its stack and put
+ * itself on the appropriate blocking queue -- so this only produces
+ * debug output.  The GRAN and PAR configurations do extra event-log
+ * bookkeeping here.
+ */
+static void
+scheduleHandleThreadBlocked( StgTSO *t
+#if !defined(GRAN) && !defined(DEBUG)
+    STG_UNUSED
+#endif
+    )
+{
+#if defined(GRAN)
+    IF_DEBUG(scheduler,
+	     debugBelch("--<< thread %ld (%p; %s) stopped, blocking on node %p [PE %d] with BQ: \n",
+			t->id, t, whatNext_strs[t->what_next], t->block_info.closure, (t->block_info.closure==(StgClosure*)NULL ? 99 : where_is(t->block_info.closure)));
+	     if (t->block_info.closure!=(StgClosure*)NULL) print_bq(t->block_info.closure));
+
+    // ??? needed; should emit block before
+    IF_DEBUG(gran,
+	     DumpGranEvent(GR_DESCHEDULE, t));
+    prune_eventq(t, (StgClosure *)NULL); // prune ContinueThreads for t
+    /*
+      ngoq Dogh!
+      ASSERT(procStatus[CurrentProc]==Busy ||
+      ((procStatus[CurrentProc]==Fetching) &&
+      (t->block_info.closure!=(StgClosure*)NULL)));
+      if (run_queue_hds[CurrentProc] == END_TSO_QUEUE &&
+      !(!RtsFlags.GranFlags.DoAsyncFetch &&
+      procStatus[CurrentProc]==Fetching))
+      procStatus[CurrentProc] = Idle;
+    */
+#elif defined(PAR)
+    IF_DEBUG(scheduler,
+	     debugBelch("--<< thread %ld (%p; %s) stopped, blocking on node %p with BQ: \n",
+			t->id, t, whatNext_strs[t->what_next], t->block_info.closure));
+    IF_PAR_DEBUG(bq,
+
+		 if (t->block_info.closure!=(StgClosure*)NULL)
+		 print_bq(t->block_info.closure));
+
+    /* Send a fetch (if BlockedOnGA) and dump event to log file */
+    blockThread(t);
+
+    /* whatever we schedule next, we must log that schedule */
+    emitSchedule = rtsTrue;
+
+#else /* !GRAN */
+
+      // We don't need to do anything.  The thread is blocked, and it
+      // has tidied up its stack and placed itself on whatever queue
+      // it needs to be on.
+
+#if !defined(THREADED_RTS)
+    ASSERT(t->why_blocked != NotBlocked);
+	     // This might not be true under THREADED_RTS: we don't have
+	     // exclusive access to this TSO, so someone might have
+	     // woken it up by now.  This actually happens: try
+	     // conc023 +RTS -N2.
+#endif
+
+    IF_DEBUG(scheduler,
+	     debugBelch("--<< thread %d (%s) stopped: ",
+			t->id, whatNext_strs[t->what_next]);
+	     printThreadBlockage(t);
+	     debugBelch("\n"));
+
+    /* Only for dumping event to log file
+       ToDo: do I need this in GranSim, too?
+       blockThread(t);
+    */
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadFinished
+ * -------------------------------------------------------------------------- */
+
+/* A thread finished normally, or killed itself with an uncaught
+ * exception (see Exception.cmm).
+ *
+ * If the thread is the bound thread belonging to 'task', record the
+ * result (task->ret, from tso->sp[1]) and completion status
+ * (task->stat) and return rtsTrue, which makes schedule() return to
+ * its caller.  A bound thread belonging to some *other* task is left
+ * on the run queue (non-threaded RTS only) until the stack unwinds to
+ * the task that owns it.  For an unbound thread there is nothing to
+ * do: return rtsFalse and keep scheduling.
+ */
+static rtsBool
+scheduleHandleThreadFinished (Capability *cap STG_UNUSED, Task *task, StgTSO *t)
+{
+    /* Need to check whether this was a main thread, and if so,
+     * return with the return value.
+     *
+     * We also end up here if the thread kills itself with an
+     * uncaught exception, see Exception.cmm.
+     */
+    IF_DEBUG(scheduler,debugBelch("--++ thread %d (%s) finished\n",
+				  t->id, whatNext_strs[t->what_next]));
+
+#if defined(GRAN)
+    endThread(t, CurrentProc); // clean-up the thread
+#elif defined(PARALLEL_HASKELL)
+    /* For now all are advisory -- HWL */
+    //if(t->priority==AdvisoryPriority) ??
+    advisory_thread_count--; // JB: Caution with this counter, buggy!
+
+# if defined(DIST)
+    if(t->dist.priority==RevalPriority)
+	FinishReval(t);
+# endif
+
+# if defined(EDENOLD)
+    // the thread could still have an outport... (BUG)
+    if (t->eden.outport != -1) {
+	// delete the outport for the tso which has finished...
+	IF_PAR_DEBUG(eden_ports,
+		     debugBelch("WARNING: Scheduler removes outport %d for TSO %d.\n",
+				t->eden.outport, t->id));
+	deleteOPT(t);
+    }
+    // thread still in the process (HEAVY BUG! since outport has just been closed...)
+    if (t->eden.epid != -1) {
+	IF_PAR_DEBUG(eden_ports,
+		     debugBelch("WARNING: Scheduler removes TSO %d from process %d .\n",
+				t->id, t->eden.epid));
+	removeTSOfromProcess(t);
+    }
+# endif
+
+# if defined(PAR)
+    if (RtsFlags.ParFlags.ParStats.Full &&
+	!RtsFlags.ParFlags.ParStats.Suppressed)
+	DumpEndEvent(CURRENT_PROC, t, rtsFalse /* not mandatory */);
+
+    //  t->par only contains statistics: left out for now...
+    IF_PAR_DEBUG(fish,
+		 debugBelch("**** end thread: ended sparked thread %d (%lx); sparkname: %lx\n",
+			    t->id,t,t->par.sparkname));
+# endif
+#endif // PARALLEL_HASKELL
+
+    //
+    // Check whether the thread that just completed was a bound
+    // thread, and if so return with the result.
+    //
+    // There is an assumption here that all thread completion goes
+    // through this point; we need to make sure that if a thread
+    // ends up in the ThreadKilled state, that it stays on the run
+    // queue so it can be dealt with here.
+    //
+
+    if (t->bound) {
+
+	if (t->bound != task) {
+#if !defined(THREADED_RTS)
+	    // Must be a bound thread that is not the topmost one.  Leave
+	    // it on the run queue until the stack has unwound to the
+	    // point where we can deal with this.  Leaving it on the run
+	    // queue also ensures that the garbage collector knows about
+	    // this thread and its return value (it gets dropped from the
+	    // all_threads list so there's no other way to find it).
+	    appendToRunQueue(cap,t);
+	    return rtsFalse;
+#else
+	    // this cannot happen in the threaded RTS, because a
+	    // bound thread can only be run by the appropriate Task.
+	    barf("finished bound thread that isn't mine");
+#endif
+	}
+
+	ASSERT(task->tso == t);
+
+	if (t->what_next == ThreadComplete) {
+	    if (task->ret) {
+		// NOTE: return val is tso->sp[1] (see StgStartup.hc)
+		*(task->ret) = (StgClosure *)task->tso->sp[1];
+	    }
+	    task->stat = Success;
+	} else {
+	    if (task->ret) {
+		*(task->ret) = NULL;
+	    }
+	    if (sched_state >= SCHED_INTERRUPTING) {
+		task->stat = Interrupted;
+	    } else {
+		task->stat = Killed;
+	    }
+	}
+#ifdef DEBUG
+	removeThreadLabel((StgWord)task->tso->id);
+#endif
+	return rtsTrue; // tells schedule() to return
+    }
+
+    return rtsFalse;
+}
+
+/* -----------------------------------------------------------------------------
+ * Perform a heap census, if PROFILING
+ * -------------------------------------------------------------------------- */
+
+/* Perform a heap census if one was requested (or, with +RTS -i0, on
+ * every GC -- this gives repeatable runs for debugging).  A census
+ * requires a GC first, so returns rtsTrue iff it already performed
+ * one, letting the caller skip its own.  No-op unless PROFILING.
+ */
+static rtsBool
+scheduleDoHeapProfile( rtsBool ready_to_gc STG_UNUSED )
+{
+#if defined(PROFILING)
+    // When we have +RTS -i0 and we're heap profiling, do a census at
+    // every GC.  This lets us get repeatable runs for debugging.
+    if (performHeapProfile ||
+	(RtsFlags.ProfFlags.profileInterval==0 &&
+	 RtsFlags.ProfFlags.doHeapProfile && ready_to_gc)) {
+
+	// checking black holes is necessary before GC, otherwise
+	// there may be threads that are unreachable except by the
+	// blackhole queue, which the GC will consider to be
+	// deadlocked.
+	scheduleCheckBlackHoles(&MainCapability);
+
+	IF_DEBUG(scheduler, sched_belch("garbage collecting before heap census"));
+	GarbageCollect(GetRoots, rtsTrue);
+
+	IF_DEBUG(scheduler, sched_belch("performing heap census"));
+	heapCensus();
+
+	performHeapProfile = rtsFalse;
+	return rtsTrue;  // true <=> we already GC'd
+    }
+#endif
+    return rtsFalse;
+}
+
+/* -----------------------------------------------------------------------------
+ * Perform a garbage collection if necessary
+ * -------------------------------------------------------------------------- */
+
+/* Perform a garbage collection.
+ *
+ * In the threaded RTS only one task may GC at a time: the first task
+ * to arrive wins the cas() on waiting_for_gc and then acquires *all*
+ * the capabilities; any other task arriving meanwhile just yields its
+ * capability until the GC is over.  Before the collection we abort
+ * invalid STM transactions, check black holes, and -- if we are
+ * shutting down -- delete all threads.  Afterwards the stashed
+ * capabilities are released.
+ *
+ * Returns the capability this task should continue with (task->cap
+ * may have changed while we were yielding to the GC-ing task).
+ */
+static Capability *
+scheduleDoGC (Capability *cap, Task *task USED_IF_THREADS,
+	      rtsBool force_major, void (*get_roots)(evac_fn))
+{
+    StgTSO *t;
+#ifdef THREADED_RTS
+    static volatile StgWord waiting_for_gc;
+    rtsBool was_waiting;
+    nat i;
+#endif
+
+#ifdef THREADED_RTS
+    // In order to GC, there must be no threads running Haskell code.
+    // Therefore, the GC thread needs to hold *all* the capabilities,
+    // and release them after the GC has completed.
+    //
+    // This seems to be the simplest way: previous attempts involved
+    // making all the threads with capabilities give up their
+    // capabilities and sleep except for the *last* one, which
+    // actually did the GC.  But it's quite hard to arrange for all
+    // the other tasks to sleep and stay asleep.
+    //
+
+    was_waiting = cas(&waiting_for_gc, 0, 1);
+    if (was_waiting) {
+	do {
+	    IF_DEBUG(scheduler, sched_belch("someone else is trying to GC..."));
+	    if (cap) yieldCapability(&cap,task);
+	} while (waiting_for_gc);
+	return cap;  // NOTE: task->cap might have changed here
+    }
+
+    for (i=0; i < n_capabilities; i++) {
+	IF_DEBUG(scheduler, sched_belch("ready_to_gc, grabbing all the capabilies (%d/%d)", i, n_capabilities));
+	if (cap != &capabilities[i]) {
+	    Capability *pcap = &capabilities[i];
+	    // we better hope this task doesn't get migrated to
+	    // another Capability while we're waiting for this one.
+	    // It won't, because load balancing happens while we have
+	    // all the Capabilities, but even so it's a slightly
+	    // unsavoury invariant.
+	    task->cap = pcap;
+	    context_switch = 1;
+	    waitForReturnCapability(&pcap, task);
+	    if (pcap != &capabilities[i]) {
+		barf("scheduleDoGC: got the wrong capability");
+	    }
+	}
+    }
+
+    waiting_for_gc = rtsFalse;
+#endif
+
+    /* Kick any transactions which are invalid back to their
+     * atomically frames.  When next scheduled they will try to
+     * commit, this commit will fail and they will retry.
+     */
+    {
+	StgTSO *next;
+
+	for (t = all_threads; t != END_TSO_QUEUE; t = next) {
+	    if (t->what_next == ThreadRelocated) {
+		next = t->link;
+	    } else {
+		next = t->global_link;
+		if (t -> trec != NO_TREC && t -> why_blocked == NotBlocked) {
+		    if (!stmValidateNestOfTransactions (t -> trec)) {
+			IF_DEBUG(stm, sched_belch("trec %p found wasting its time", t));
+
+			// strip the stack back to the
+			// ATOMICALLY_FRAME, aborting the (nested)
+			// transaction, and saving the stack of any
+			// partially-evaluated thunks on the heap.
+			raiseAsync_(&capabilities[0], t, NULL, rtsTrue, NULL);
+
+#ifdef REG_R1
+			ASSERT(get_itbl((StgClosure *)t->sp)->type == ATOMICALLY_FRAME);
+#endif
+		    }
+		}
+	    }
+	}
+    }
+
+    // so this happens periodically:
+    if (cap) scheduleCheckBlackHoles(cap);
+
+    IF_DEBUG(scheduler, printAllThreads());
+
+    /*
+     * We now have all the capabilities; if we're in an interrupting
+     * state, then we should take the opportunity to delete all the
+     * threads in the system.
+     */
+    if (sched_state >= SCHED_INTERRUPTING) {
+	deleteAllThreads(&capabilities[0]);
+	sched_state = SCHED_INTERRUPTED;
+    }
+
+    /* everybody back, start the GC.
+     * Could do it in this thread, or signal a condition var
+     * to do it in another thread.  Either way, we need to
+     * broadcast on gc_pending_cond afterward.
+     */
+#if defined(THREADED_RTS)
+    IF_DEBUG(scheduler,sched_belch("doing GC"));
+#endif
+    GarbageCollect(get_roots, force_major);
+
+#if defined(THREADED_RTS)
+    // release our stash of capabilities.
+    for (i = 0; i < n_capabilities; i++) {
+	if (cap != &capabilities[i]) {
+	    task->cap = &capabilities[i];
+	    releaseCapability(&capabilities[i]);
+	}
+    }
+    if (cap) {
+	task->cap = cap;
+    } else {
+	task->cap = NULL;
+    }
+#endif
+
+#if defined(GRAN)
+    /* add a ContinueThread event to continue execution of current thread */
+    new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
+	      ContinueThread,
+	      t, (StgClosure*)NULL, (rtsSpark*)NULL);
+    IF_GRAN_DEBUG(bq,
+		  debugBelch("GRAN: eventq and runnableq after Garbage collection:\n\n");
+		  G_EVENTQ(0);
+		  G_CURR_THREADQ(0));
+#endif /* GRAN */
+
+    return cap;
+}
+
+/* ---------------------------------------------------------------------------
+ * rtsSupportsBoundThreads(): is the RTS built to support bound threads?
+ * used by Control.Concurrent for error checking.
+ *
+ * The answer is fixed at compile time: only the threaded RTS supports
+ * bound (OS-thread-affine) Haskell threads.
+ * ------------------------------------------------------------------------- */
+
+StgBool
+rtsSupportsBoundThreads(void)
+{
+#if defined(THREADED_RTS)
+ return rtsTrue;
+#else
+ return rtsFalse;
+#endif
+}
+
+/* ---------------------------------------------------------------------------
+ * isThreadBound(tso): check whether tso is bound to an OS thread.
+ *
+ * A TSO is bound iff its 'bound' field points at a Task.  In the
+ * non-threaded RTS there are no bound threads, so the answer is
+ * always false (the tso argument is unused there, hence
+ * USED_IF_THREADS).
+ * ------------------------------------------------------------------------- */
+
+StgBool
+isThreadBound(StgTSO* tso USED_IF_THREADS)
+{
+#if defined(THREADED_RTS)
+ return (tso->bound != NULL);
+#endif
+ return rtsFalse;
+}
+
+/* ---------------------------------------------------------------------------
+ * Singleton fork(). Do not copy any running threads.
+ * ------------------------------------------------------------------------- */
+
+// fork() is unavailable on Windows (mingw), so the primop is gated here.
+#if !defined(mingw32_HOST_OS)
+#define FORKPROCESS_PRIMOP_SUPPORTED
+#endif
+
+#ifdef FORKPROCESS_PRIMOP_SUPPORTED
+static void
+deleteThread_(Capability *cap, StgTSO *tso);
+#endif
+// forkProcess: implementation of the forkProcess# primop.
+//
+// Parent: returns the child's pid (note: a fork() failure of -1 is
+// also returned here unchecked -- the caller sees it as the pid).
+// Child: deletes every Haskell thread and every Task except the
+// caller's, runs the IO action 'entry' via rts_evalStableIO, then
+// shuts the RTS down and exits the process; it never returns.
+StgInt
+forkProcess(HsStablePtr *entry
+#ifndef FORKPROCESS_PRIMOP_SUPPORTED
+ STG_UNUSED
+#endif
+ )
+{
+#ifdef FORKPROCESS_PRIMOP_SUPPORTED
+ Task *task;
+ pid_t pid;
+ StgTSO* t,*next;
+ Capability *cap;
+
+#if defined(THREADED_RTS)
+ // forking with multiple capabilities is not supported: other OS
+ // threads would be lost in the child, so refuse up front.
+ if (RtsFlags.ParFlags.nNodes > 1) {
+ errorBelch("forking not supported with +RTS -N<n> greater than 1");
+ stg_exit(EXIT_FAILURE);
+ }
+#endif
+
+ IF_DEBUG(scheduler,sched_belch("forking!"));
+
+ // ToDo: for SMP, we should probably acquire *all* the capabilities
+ cap = rts_lock();
+
+ pid = fork();
+
+ if (pid) { // parent
+
+ // just return the pid
+ rts_unlock(cap);
+ return pid;
+
+ } else { // child
+
+ // Now, all OS threads except the thread that forked are
+ // stopped. We need to stop all Haskell threads, including
+ // those involved in foreign calls. Also we need to delete
+ // all Tasks, because they correspond to OS threads that are
+ // now gone.
+
+ for (t = all_threads; t != END_TSO_QUEUE; t = next) {
+ if (t->what_next == ThreadRelocated) {
+ // relocated TSOs are indirections; follow t->link
+ next = t->link;
+ } else {
+ next = t->global_link;
+ // don't allow threads to catch the ThreadKilled
+ // exception, but we do want to raiseAsync() because these
+ // threads may be evaluating thunks that we need later.
+ deleteThread_(cap,t);
+ }
+ }
+
+ // Empty the run queue. It seems tempting to let all the
+ // killed threads stay on the run queue as zombies to be
+ // cleaned up later, but some of them correspond to bound
+ // threads for which the corresponding Task does not exist.
+ cap->run_queue_hd = END_TSO_QUEUE;
+ cap->run_queue_tl = END_TSO_QUEUE;
+
+ // Any suspended C-calling Tasks are no more, their OS threads
+ // don't exist now:
+ cap->suspended_ccalling_tasks = NULL;
+
+ // Empty the all_threads list. Otherwise, the garbage
+ // collector may attempt to resurrect some of these threads.
+ all_threads = END_TSO_QUEUE;
+
+ // Wipe the task list, except the current Task.
+ ACQUIRE_LOCK(&sched_mutex);
+ for (task = all_tasks; task != NULL; task=task->all_link) {
+ if (task != cap->running_task) {
+ discardTask(task);
+ }
+ }
+ RELEASE_LOCK(&sched_mutex);
+
+#if defined(THREADED_RTS)
+ // Wipe our spare workers list, they no longer exist. New
+ // workers will be created if necessary.
+ cap->spare_workers = NULL;
+ cap->returning_tasks_hd = NULL;
+ cap->returning_tasks_tl = NULL;
+#endif
+
+ cap = rts_evalStableIO(cap, entry, NULL); // run the action
+ rts_checkSchedStatus("forkProcess",cap);
+
+ rts_unlock(cap);
+ hs_exit(); // clean up and exit
+ stg_exit(EXIT_SUCCESS);
+ }
+#else /* !FORKPROCESS_PRIMOP_SUPPORTED */
+ barf("forkProcess#: primop not supported on this platform, sorry!\n");
+ return -1;
+#endif
+}
+
+/* ---------------------------------------------------------------------------
+ * Delete all the threads in the system
+ *
+ * Walks the global all_threads list and sends each live thread the
+ * ThreadKilled exception via deleteThread().  Relocated TSOs are
+ * skipped (they are just indirections to the real TSO, reached via
+ * t->link).  'next' is captured before deleting because deleteThread
+ * may alter the thread's links.
+ * ------------------------------------------------------------------------- */
+
+static void
+deleteAllThreads ( Capability *cap )
+{
+ StgTSO* t, *next;
+ IF_DEBUG(scheduler,sched_belch("deleting all threads"));
+ for (t = all_threads; t != END_TSO_QUEUE; t = next) {
+ if (t->what_next == ThreadRelocated) {
+ next = t->link;
+ } else {
+ next = t->global_link;
+ deleteThread(cap,t);
+ }
+ }
+
+ // The run queue now contains a bunch of ThreadKilled threads. We
+ // must not throw these away: the main thread(s) will be in there
+ // somewhere, and the main scheduler loop has to deal with it.
+ // Also, the run queue is the only thing keeping these threads from
+ // being GC'd, and we don't want the "main thread has been GC'd" panic.
+
+#if !defined(THREADED_RTS)
+ ASSERT(blocked_queue_hd == END_TSO_QUEUE);
+ ASSERT(sleeping_queue == END_TSO_QUEUE);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Managing the suspended_ccalling_tasks list.
+ Locks required: sched_mutex
+ -------------------------------------------------------------------------- */
+
+/* Push 'task' onto the front of cap's doubly-linked list of
+ * suspended C-calling tasks.  The task must not already be on any
+ * list (both link fields NULL). */
+STATIC_INLINE void
+suspendTask (Capability *cap, Task *task)
+{
+ Task *head = cap->suspended_ccalling_tasks;
+
+ ASSERT(task->next == NULL && task->prev == NULL);
+ task->prev = NULL;
+ task->next = head;
+ if (head != NULL) {
+ head->prev = task;
+ }
+ cap->suspended_ccalling_tasks = task;
+}
+
+/* Unlink 'task' from cap's suspended_ccalling_tasks list and clear
+ * its link fields so it can be re-suspended later. */
+STATIC_INLINE void
+recoverSuspendedTask (Capability *cap, Task *task)
+{
+ Task *before = task->prev;
+ Task *after = task->next;
+
+ if (before == NULL) {
+ // task was at the head of the list
+ ASSERT(cap->suspended_ccalling_tasks == task);
+ cap->suspended_ccalling_tasks = after;
+ } else {
+ before->next = after;
+ }
+ if (after != NULL) {
+ after->prev = before;
+ }
+ task->next = task->prev = NULL;
+}
+
+/* ---------------------------------------------------------------------------
+ * Suspending & resuming Haskell threads.
+ *
+ * When making a "safe" call to C (aka _ccall_GC), the task gives back
+ * its capability before calling the C function. This allows another
+ * task to pick up the capability and carry on running Haskell
+ * threads. It also means that if the C call blocks, it won't lock
+ * the whole system.
+ *
+ * The Haskell thread making the C call is put to sleep for the
+ * duration of the call, on the suspended_ccalling_threads queue. We
+ * give out a token to the task, which it can use to resume the thread
+ * on return from the C function.
+ * ------------------------------------------------------------------------- */
+
+// suspendThread: called on entry to a safe foreign call.  Parks the
+// current TSO on the capability's suspended list, releases the
+// capability, and returns the Task as an opaque token for
+// resumeThread().  errno is saved and restored around the RTS work.
+void *
+suspendThread (StgRegTable *reg)
+{
+ Capability *cap;
+ int saved_errno = errno;
+ StgTSO *tso;
+ Task *task;
+
+ /* assume that *reg is a pointer to the StgRegTable part of a Capability.
+ */
+ cap = regTableToCapability(reg);
+
+ task = cap->running_task;
+ tso = cap->r.rCurrentTSO;
+
+ IF_DEBUG(scheduler,
+ sched_belch("thread %d did a safe foreign call", cap->r.rCurrentTSO->id));
+
+ // XXX this might not be necessary --SDM
+ tso->what_next = ThreadRunGHC;
+
+ threadPaused(cap,tso);
+
+ // A NULL blocked_exceptions field means async exceptions are
+ // currently unmasked, so the thread may be interrupted during the
+ // call; otherwise mark it non-interruptible.
+ if(tso->blocked_exceptions == NULL) {
+ tso->why_blocked = BlockedOnCCall;
+ tso->blocked_exceptions = END_TSO_QUEUE;
+ } else {
+ tso->why_blocked = BlockedOnCCall_NoUnblockExc;
+ }
+
+ // Hand back capability
+ task->suspended_tso = tso;
+
+ ACQUIRE_LOCK(&cap->lock);
+
+ suspendTask(cap,task);
+ cap->in_haskell = rtsFalse;
+ releaseCapability_(cap);
+
+ RELEASE_LOCK(&cap->lock);
+
+#if defined(THREADED_RTS)
+ /* Preparing to leave the RTS, so ensure there's a native thread/task
+ waiting to take over.
+ */
+ IF_DEBUG(scheduler, sched_belch("thread %d: leaving RTS", tso->id));
+#endif
+
+ errno = saved_errno;
+ return task;
+}
+
+// resumeThread: called on return from a safe foreign call, with the
+// token handed out by suspendThread().  Re-acquires a capability
+// (possibly a different one -- the suspended-task entry migrates with
+// the task), unlinks the task, reinstates its TSO as the current
+// thread, and returns the capability's register table.  errno is
+// saved and restored around the RTS work.
+StgRegTable *
+resumeThread (void *task_)
+{
+ StgTSO *tso;
+ Capability *cap;
+ int saved_errno = errno;
+ Task *task = task_;
+
+ cap = task->cap;
+ // Wait for permission to re-enter the RTS with the result.
+ waitForReturnCapability(&cap,task);
+ // we might be on a different capability now... but if so, our
+ // entry on the suspended_ccalling_tasks list will also have been
+ // migrated.
+
+ // Remove the thread from the suspended list
+ recoverSuspendedTask(cap,task);
+
+ tso = task->suspended_tso;
+ task->suspended_tso = NULL;
+ tso->link = END_TSO_QUEUE;
+ IF_DEBUG(scheduler, sched_belch("thread %d: re-entering RTS", tso->id));
+
+ // If the thread was interruptible during the call, wake up anybody
+ // queued to throw it an asynchronous exception.
+ if (tso->why_blocked == BlockedOnCCall) {
+ awakenBlockedQueue(cap,tso->blocked_exceptions);
+ tso->blocked_exceptions = NULL;
+ }
+
+ /* Reset blocking status */
+ tso->why_blocked = NotBlocked;
+
+ cap->r.rCurrentTSO = tso;
+ cap->in_haskell = rtsTrue;
+ errno = saved_errno;
+
+ /* We might have GC'd, mark the TSO dirty again */
+ dirtyTSO(tso);
+
+ IF_DEBUG(sanity, checkTSO(tso));
+
+ return &cap->r;
+}
+
+/* ---------------------------------------------------------------------------
+ * Comparing Thread ids.
+ *
+ * This is used from STG land in the implementation of the
+ * instances of Eq/Ord for ThreadIds.
+ * ------------------------------------------------------------------------ */
+
+/* Three-way comparison of two thread ids, used from STG land by the
+ * Eq/Ord instances for ThreadId: returns -1, 0 or 1 as tso1's id is
+ * less than, equal to, or greater than tso2's. */
+int
+cmp_thread(StgPtr tso1, StgPtr tso2)
+{
+ StgThreadID a = ((StgTSO *)tso1)->id;
+ StgThreadID b = ((StgTSO *)tso2)->id;
+
+ return (a < b) ? -1 : ((a > b) ? 1 : 0);
+}
+
+/* ---------------------------------------------------------------------------
+ * Fetching the ThreadID from an StgTSO.
+ *
+ * This is used in the implementation of Show for ThreadIds.
+ * ------------------------------------------------------------------------ */
+// Simple accessor: the tso argument is an StgTSO passed as an StgPtr.
+int
+rts_getThreadId(StgPtr tso)
+{
+ return ((StgTSO *)tso)->id;
+}
+
+#ifdef DEBUG
+// labelThread: attach a human-readable label to a thread (debug
+// builds only).  Takes a private copy of 'label'; the copy includes
+// the terminating NUL because len = strlen(label)+1, so the strncpy
+// here is a complete, terminated copy.
+void
+labelThread(StgPtr tso, char *label)
+{
+ int len;
+ void *buf;
+
+ /* Caveat: Once set, you can only set the thread name to "" */
+ len = strlen(label)+1;
+ buf = stgMallocBytes(len * sizeof(char), "Schedule.c:labelThread()");
+ strncpy(buf,label,len);
+ /* Update will free the old memory for us */
+ updateThreadLabel(((StgTSO *)tso)->id,buf);
+}
+#endif /* DEBUG */
+
+/* ---------------------------------------------------------------------------
+ Create a new thread.
+
+ The new thread starts with the given stack size. Before the
+ scheduler can run, however, this thread needs to have a closure
+ (and possibly some arguments) pushed on its stack. See
+ pushClosure() in Schedule.h.
+
+ createGenThread() and createIOThread() (in SchedAPI.h) are
+ convenient packaged versions of this function.
+
+ currently pri (priority) is only used in a GRAN setup -- HWL
+ ------------------------------------------------------------------------ */
+#if defined(GRAN)
+/* currently pri (priority) is only used in a GRAN setup -- HWL */
+StgTSO *
+createThread(nat size, StgInt pri)
+#else
+// createThread: allocate and initialise a new TSO of (at least) 'size'
+// words on the given capability, push a stop frame, and link it onto
+// the global all_threads list.  The caller is responsible for pushing
+// a closure and putting the thread on a run queue (see scheduleThread).
+StgTSO *
+createThread(Capability *cap, nat size)
+#endif
+{
+ StgTSO *tso;
+ nat stack_size;
+
+ /* sched_mutex is *not* required */
+
+ /* First check whether we should create a thread at all */
+#if defined(PARALLEL_HASKELL)
+ /* check that no more than RtsFlags.ParFlags.maxThreads threads are created */
+ if (advisory_thread_count >= RtsFlags.ParFlags.maxThreads) {
+ threadsIgnored++;
+ debugBelch("{createThread}Daq ghuH: refusing to create another thread; no more than %d threads allowed (currently %d)\n",
+ RtsFlags.ParFlags.maxThreads, advisory_thread_count);
+ return END_TSO_QUEUE;
+ }
+ threadsCreated++;
+#endif
+
+#if defined(GRAN)
+ ASSERT(!RtsFlags.GranFlags.Light || CurrentProc==0);
+#endif
+
+ // ToDo: check whether size = stack_size - TSO_STRUCT_SIZEW
+
+ /* catch ridiculously small stack sizes */
+ if (size < MIN_STACK_WORDS + TSO_STRUCT_SIZEW) {
+ size = MIN_STACK_WORDS + TSO_STRUCT_SIZEW;
+ }
+
+ stack_size = size - TSO_STRUCT_SIZEW;
+
+ // NOTE(review): under GRAN the signature above has no 'cap'
+ // parameter, yet 'cap' is used here and below -- looks like GRAN
+ // bitrot; confirm before building with GRAN.
+ tso = (StgTSO *)allocateLocal(cap, size);
+ TICK_ALLOC_TSO(stack_size, 0);
+
+ SET_HDR(tso, &stg_TSO_info, CCS_SYSTEM);
+#if defined(GRAN)
+ SET_GRAN_HDR(tso, ThisPE);
+#endif
+
+ // Always start with the compiled code evaluator
+ tso->what_next = ThreadRunGHC;
+
+ tso->why_blocked = NotBlocked;
+ tso->blocked_exceptions = NULL;
+ tso->flags = TSO_DIRTY;
+
+ tso->saved_errno = 0;
+ tso->bound = NULL;
+ tso->cap = cap;
+
+ tso->stack_size = stack_size;
+ tso->max_stack_size = round_to_mblocks(RtsFlags.GcFlags.maxStkSize)
+ - TSO_STRUCT_SIZEW;
+ tso->sp = (P_)&(tso->stack) + stack_size;
+
+ tso->trec = NO_TREC;
+
+#ifdef PROFILING
+ tso->prof.CCCS = CCS_MAIN;
+#endif
+
+ /* put a stop frame on the stack */
+ tso->sp -= sizeofW(StgStopFrame);
+ SET_HDR((StgClosure*)tso->sp,(StgInfoTable *)&stg_stop_thread_info,CCS_SYSTEM);
+ tso->link = END_TSO_QUEUE;
+
+ // ToDo: check this
+#if defined(GRAN)
+ /* uses more flexible routine in GranSim */
+ insertThread(tso, CurrentProc);
+#else
+ /* In a non-GranSim setup the pushing of a TSO onto the runq is separated
+ * from its creation
+ */
+#endif
+
+#if defined(GRAN)
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ DumpGranEvent(GR_START,tso);
+#elif defined(PARALLEL_HASKELL)
+ if (RtsFlags.ParFlags.ParStats.Full)
+ DumpGranEvent(GR_STARTQ,tso);
+ /* HACk to avoid SCHEDULE
+ LastTSO = tso; */
+#endif
+
+ /* Link the new thread on the global thread list.
+ */
+ ACQUIRE_LOCK(&sched_mutex);
+ tso->id = next_thread_id++; // while we have the mutex
+ tso->global_link = all_threads;
+ all_threads = tso;
+ RELEASE_LOCK(&sched_mutex);
+
+#if defined(DIST)
+ tso->dist.priority = MandatoryPriority; //by default that is...
+#endif
+
+#if defined(GRAN)
+ tso->gran.pri = pri;
+# if defined(DEBUG)
+ tso->gran.magic = TSO_MAGIC; // debugging only
+# endif
+ tso->gran.sparkname = 0;
+ tso->gran.startedat = CURRENT_TIME;
+ tso->gran.exported = 0;
+ tso->gran.basicblocks = 0;
+ tso->gran.allocs = 0;
+ tso->gran.exectime = 0;
+ tso->gran.fetchtime = 0;
+ tso->gran.fetchcount = 0;
+ tso->gran.blocktime = 0;
+ tso->gran.blockcount = 0;
+ tso->gran.blockedat = 0;
+ tso->gran.globalsparks = 0;
+ tso->gran.localsparks = 0;
+ if (RtsFlags.GranFlags.Light)
+ tso->gran.clock = Now; /* local clock */
+ else
+ tso->gran.clock = 0;
+
+ IF_DEBUG(gran,printTSO(tso));
+#elif defined(PARALLEL_HASKELL)
+# if defined(DEBUG)
+ tso->par.magic = TSO_MAGIC; // debugging only
+# endif
+ tso->par.sparkname = 0;
+ tso->par.startedat = CURRENT_TIME;
+ tso->par.exported = 0;
+ tso->par.basicblocks = 0;
+ tso->par.allocs = 0;
+ tso->par.exectime = 0;
+ tso->par.fetchtime = 0;
+ tso->par.fetchcount = 0;
+ tso->par.blocktime = 0;
+ tso->par.blockcount = 0;
+ tso->par.blockedat = 0;
+ tso->par.globalsparks = 0;
+ tso->par.localsparks = 0;
+#endif
+
+#if defined(GRAN)
+ globalGranStats.tot_threads_created++;
+ globalGranStats.threads_created_on_PE[CurrentProc]++;
+ globalGranStats.tot_sq_len += spark_queue_len(CurrentProc);
+ globalGranStats.tot_sq_probes++;
+#elif defined(PARALLEL_HASKELL)
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ //debugBelch("Creating thread %d @ %11.2f\n", tso->id, usertime());
+ globalParStats.tot_threads_created++;
+ }
+#endif
+
+#if defined(GRAN)
+ // NOTE(review): format string has two conversions (%d, %p) but three
+ // arguments are supplied (CurrentProc, tso, tso->id) -- the format
+ // and argument list don't match; confirm intent before enabling GRAN.
+ IF_GRAN_DEBUG(pri,
+ sched_belch("==__ schedule: Created TSO %d (%p);",
+ CurrentProc, tso, tso->id));
+#elif defined(PARALLEL_HASKELL)
+ IF_PAR_DEBUG(verbose,
+ sched_belch("==__ schedule: Created TSO %d (%p); %d threads active",
+ (long)tso->id, tso, advisory_thread_count));
+#else
+ IF_DEBUG(scheduler,sched_belch("created thread %ld, stack size = %lx words",
+ (long)tso->id, (long)tso->stack_size));
+#endif
+ return tso;
+}
+
+#if defined(PAR)
+/* RFP:
+ all parallel thread creation calls should fall through the following routine.
+*/
+// createThreadFromSpark: turn a spark into a runnable thread (PAR
+// builds only).  Respects the maxThreads limit, pushes the spark
+// closure onto the new thread's stack and appends it to the run queue.
+// NOTE(review): createThread is called here with a single argument,
+// which doesn't match the non-GRAN signature createThread(cap, size)
+// above -- this PAR-only code appears bitrotted; confirm before
+// building with PAR.
+StgTSO *
+createThreadFromSpark(rtsSpark spark)
+{ StgTSO *tso;
+ ASSERT(spark != (rtsSpark)NULL);
+// JB: TAKE CARE OF THIS COUNTER! BUGGY
+ if (advisory_thread_count >= RtsFlags.ParFlags.maxThreads)
+ { threadsIgnored++;
+ barf("{createSparkThread}Daq ghuH: refusing to create another thread; no more than %d threads allowed (currently %d)",
+ RtsFlags.ParFlags.maxThreads, advisory_thread_count);
+ return END_TSO_QUEUE;
+ }
+ else
+ { threadsCreated++;
+ tso = createThread(RtsFlags.GcFlags.initialStkSize);
+ if (tso==END_TSO_QUEUE)
+ barf("createSparkThread: Cannot create TSO");
+#if defined(DIST)
+ tso->priority = AdvisoryPriority;
+#endif
+ pushClosure(tso,spark);
+ addToRunQueue(tso);
+ advisory_thread_count++; // JB: TAKE CARE OF THIS COUNTER! BUGGY
+ }
+ return tso;
+}
+#endif
+
+/*
+ Turn a spark into a thread.
+ ToDo: fix for SMP (needs to acquire SCHED_MUTEX!)
+*/
+// Dead code: compiled out with #if 0; kept for reference only.
+#if 0
+StgTSO *
+activateSpark (rtsSpark spark)
+{
+ StgTSO *tso;
+
+ tso = createSparkThread(spark);
+ if (RtsFlags.ParFlags.ParStats.Full) {
+ //ASSERT(run_queue_hd == END_TSO_QUEUE); // I think ...
+ IF_PAR_DEBUG(verbose,
+ debugBelch("==^^ activateSpark: turning spark of closure %p (%s) into a thread\n",
+ (StgClosure *)spark, info_type((StgClosure *)spark)));
+ }
+ // ToDo: fwd info on local/global spark to thread -- HWL
+ // tso->gran.exported = spark->exported;
+ // tso->gran.locked = !spark->global;
+ // tso->gran.sparkname = spark->name;
+
+ return tso;
+}
+#endif
+
+/* ---------------------------------------------------------------------------
+ * scheduleThread()
+ *
+ * scheduleThread puts a thread on the end of the runnable queue.
+ * This will usually be done immediately after a thread is created.
+ * The caller of scheduleThread must create the thread using e.g.
+ * createThread and push an appropriate closure
+ * on this thread's stack before the scheduler is invoked.
+ * ------------------------------------------------------------------------ */
+
+void
+scheduleThread(Capability *cap, StgTSO *tso)
+{
+ // The thread goes at the *end* of the run-queue, to avoid possible
+ // starvation of any threads already on the queue.
+ appendToRunQueue(cap,tso);
+}
+
+// scheduleThreadOn: like scheduleThread, but pin the thread to the
+// capability numbered 'cpu' (modulo the number of capabilities).  If
+// that is not the current capability, the TSO (and its bound Task, if
+// any) are re-homed and the target capability is woken.  In the
+// non-threaded RTS there is only one capability, so just enqueue.
+void
+scheduleThreadOn(Capability *cap, StgWord cpu USED_IF_THREADS, StgTSO *tso)
+{
+#if defined(THREADED_RTS)
+ tso->flags |= TSO_LOCKED; // we requested explicit affinity; don't
+ // move this thread from now on.
+ cpu %= RtsFlags.ParFlags.nNodes;
+ if (cpu == cap->no) {
+ appendToRunQueue(cap,tso);
+ } else {
+ Capability *target_cap = &capabilities[cpu];
+ if (tso->bound) {
+ tso->bound->cap = target_cap;
+ }
+ tso->cap = target_cap;
+ wakeupThreadOnCapability(target_cap,tso);
+ }
+#else
+ appendToRunQueue(cap,tso);
+#endif
+}
+
+// scheduleWaitThread: bind 'tso' to the calling Task, enqueue it, and
+// run the scheduler until the thread finishes.  The thread's result
+// is written through 'ret'.  Returns the (possibly different)
+// capability held on exit from schedule().
+Capability *
+scheduleWaitThread (StgTSO* tso, /*[out]*/HaskellObj* ret, Capability *cap)
+{
+ Task *task;
+
+ // We already created/initialised the Task
+ task = cap->running_task;
+
+ // This TSO is now a bound thread; make the Task and TSO
+ // point to each other.
+ tso->bound = task;
+ tso->cap = cap;
+
+ task->tso = tso;
+ task->ret = ret;
+ task->stat = NoStatus;
+
+ appendToRunQueue(cap,tso);
+
+ IF_DEBUG(scheduler, sched_belch("new bound thread (%d)", tso->id));
+
+#if defined(GRAN)
+ /* GranSim specific init */
+ // NOTE(review): 'm' is not defined in this function -- this GRAN
+ // block looks bitrotted (presumably it meant task->tso); confirm
+ // before building with GRAN.
+ CurrentTSO = m->tso; // the TSO to run
+ procStatus[MainProc] = Busy; // status of main PE
+ CurrentProc = MainProc; // PE to run it on
+#endif
+
+ cap = schedule(cap,task);
+
+ ASSERT(task->stat != NoStatus);
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+
+ IF_DEBUG(scheduler, sched_belch("bound thread (%d) finished", task->tso->id));
+ return cap;
+}
+
+/* ----------------------------------------------------------------------------
+ * Starting Tasks
+ * ------------------------------------------------------------------------- */
+
+#if defined(THREADED_RTS)
+// workerStart: entry point for a worker OS thread.  Picks up the
+// capability stashed in the Task by startWorkerTask() (the lock
+// ensures the parent has finished initialising it), runs the
+// scheduler, then releases the capability and retires the Task.
+void
+workerStart(Task *task)
+{
+ Capability *cap;
+
+ // See startWorkerTask().
+ ACQUIRE_LOCK(&task->lock);
+ cap = task->cap;
+ RELEASE_LOCK(&task->lock);
+
+ // set the thread-local pointer to the Task:
+ taskEnter(task);
+
+ // schedule() runs without a lock.
+ cap = schedule(cap,task);
+
+ // On exit from schedule(), we have a Capability.
+ releaseCapability(cap);
+ taskStop(task);
+}
+#endif
+
+/* ---------------------------------------------------------------------------
+ * initScheduler()
+ *
+ * Initialise the scheduler. This resets all the queues - if the
+ * queues contained any threads, they'll be garbage collected at the
+ * next pass.
+ *
+ * Sets up the global thread queues, scheduler state, capabilities,
+ * the task manager and spark pools, and (threaded RTS) eagerly starts
+ * one worker per capability other than capability 0.
+ * ------------------------------------------------------------------------ */
+
+void
+initScheduler(void)
+{
+#if defined(GRAN)
+ nat i;
+ for (i=0; i<=MAX_PROC; i++) {
+ run_queue_hds[i] = END_TSO_QUEUE;
+ run_queue_tls[i] = END_TSO_QUEUE;
+ blocked_queue_hds[i] = END_TSO_QUEUE;
+ blocked_queue_tls[i] = END_TSO_QUEUE;
+ ccalling_threadss[i] = END_TSO_QUEUE;
+ blackhole_queue[i] = END_TSO_QUEUE;
+ sleeping_queue = END_TSO_QUEUE;
+ }
+#elif !defined(THREADED_RTS)
+ blocked_queue_hd = END_TSO_QUEUE;
+ blocked_queue_tl = END_TSO_QUEUE;
+ sleeping_queue = END_TSO_QUEUE;
+#endif
+
+ blackhole_queue = END_TSO_QUEUE;
+ all_threads = END_TSO_QUEUE;
+
+ context_switch = 0;
+ sched_state = SCHED_RUNNING;
+
+ RtsFlags.ConcFlags.ctxtSwitchTicks =
+ RtsFlags.ConcFlags.ctxtSwitchTime / TICK_MILLISECS;
+
+#if defined(THREADED_RTS)
+ /* Initialise the mutex and condition variables used by
+ * the scheduler. */
+ initMutex(&sched_mutex);
+#endif
+
+ ACQUIRE_LOCK(&sched_mutex);
+
+ /* A capability holds the state a native thread needs in
+ * order to execute STG code. At least one capability is
+ * floating around (only THREADED_RTS builds have more than one).
+ */
+ initCapabilities();
+
+ initTaskManager();
+
+#if defined(THREADED_RTS) || defined(PARALLEL_HASKELL)
+ initSparkPools();
+#endif
+
+#if defined(THREADED_RTS)
+ /*
+ * Eagerly start one worker to run each Capability, except for
+ * Capability 0. The idea is that we're probably going to start a
+ * bound thread on Capability 0 pretty soon, so we don't want a
+ * worker task hogging it.
+ */
+ {
+ nat i;
+ Capability *cap;
+ for (i = 1; i < n_capabilities; i++) {
+ cap = &capabilities[i];
+ ACQUIRE_LOCK(&cap->lock);
+ startWorkerTask(cap, workerStart);
+ RELEASE_LOCK(&cap->lock);
+ }
+ }
+#endif
+
+ RELEASE_LOCK(&sched_mutex);
+}
+
+// exitScheduler: shut the scheduler down.  If threads are still
+// alive, interrupt them and run a final GC (scheduleDoGC deletes all
+// threads once sched_state >= SCHED_INTERRUPTING); then, in the
+// threaded RTS, shut down every capability and stop the task manager.
+void
+exitScheduler( void )
+{
+ Task *task = NULL;
+
+#if defined(THREADED_RTS)
+ ACQUIRE_LOCK(&sched_mutex);
+ task = newBoundTask();
+ RELEASE_LOCK(&sched_mutex);
+#endif
+
+ // If we haven't killed all the threads yet, do it now.
+ if (sched_state < SCHED_INTERRUPTED) {
+ sched_state = SCHED_INTERRUPTING;
+ scheduleDoGC(NULL,task,rtsFalse,GetRoots);
+ }
+ sched_state = SCHED_SHUTTING_DOWN;
+
+#if defined(THREADED_RTS)
+ {
+ nat i;
+
+ for (i = 0; i < n_capabilities; i++) {
+ shutdownCapability(&capabilities[i], task);
+ }
+ boundTaskExiting(task);
+ stopTaskManager();
+ }
+#endif
+}
+
+/* ---------------------------------------------------------------------------
+ Where are the roots that we know about?
+
+ - all the threads on the runnable queue
+ - all the threads on the blocked queue
+ - all the threads on the sleeping queue
+ - all the thread currently executing a _ccall_GC
+ - all the "main threads"
+
+ ------------------------------------------------------------------------ */
+
+/* This has to be protected either by the scheduler monitor, or by the
+ garbage collection monitor (probably the latter).
+ KH @ 25/10/99
+*/
+
+/* GetRoots: evacuate every root the scheduler knows about: each
+ * capability's run queue (plus its wakeup queue and suspended
+ * C-calling TSOs in the threaded RTS), the blocked/sleeping queues in
+ * the non-threaded RTS, the spark pools, and the signal handlers.
+ * Called with the world stopped, from the garbage collector.
+ */
+void
+GetRoots( evac_fn evac )
+{
+ nat i;
+ Capability *cap;
+ Task *task;
+
+#if defined(GRAN)
+ for (i=0; i<=RtsFlags.GranFlags.proc; i++) {
+ if ((run_queue_hds[i] != END_TSO_QUEUE) && ((run_queue_hds[i] != NULL)))
+ evac((StgClosure **)&run_queue_hds[i]);
+ if ((run_queue_tls[i] != END_TSO_QUEUE) && ((run_queue_tls[i] != NULL)))
+ evac((StgClosure **)&run_queue_tls[i]);
+
+ if ((blocked_queue_hds[i] != END_TSO_QUEUE) && ((blocked_queue_hds[i] != NULL)))
+ evac((StgClosure **)&blocked_queue_hds[i]);
+ if ((blocked_queue_tls[i] != END_TSO_QUEUE) && ((blocked_queue_tls[i] != NULL)))
+ evac((StgClosure **)&blocked_queue_tls[i]);
+ if ((ccalling_threadss[i] != END_TSO_QUEUE) && ((ccalling_threadss[i] != NULL)))
+ // fix: evacuate the same array we just guarded on (was
+ // 'ccalling_threads', which doesn't match the test above)
+ evac((StgClosure **)&ccalling_threadss[i]);
+ }
+
+ markEventQueue();
+
+#else /* !GRAN */
+
+ for (i = 0; i < n_capabilities; i++) {
+ cap = &capabilities[i];
+ evac((StgClosure **)(void *)&cap->run_queue_hd);
+ evac((StgClosure **)(void *)&cap->run_queue_tl);
+#if defined(THREADED_RTS)
+ evac((StgClosure **)(void *)&cap->wakeup_queue_hd);
+ evac((StgClosure **)(void *)&cap->wakeup_queue_tl);
+#endif
+ for (task = cap->suspended_ccalling_tasks; task != NULL;
+ task=task->next) {
+ IF_DEBUG(scheduler,sched_belch("evac'ing suspended TSO %d", task->suspended_tso->id));
+ evac((StgClosure **)(void *)&task->suspended_tso);
+ }
+
+ }
+
+
+#if !defined(THREADED_RTS)
+ evac((StgClosure **)(void *)&blocked_queue_hd);
+ evac((StgClosure **)(void *)&blocked_queue_tl);
+ evac((StgClosure **)(void *)&sleeping_queue);
+#endif
+#endif
+
+ // evac((StgClosure **)&blackhole_queue);
+
+#if defined(THREADED_RTS) || defined(PARALLEL_HASKELL) || defined(GRAN)
+ markSparkQueue(evac);
+#endif
+
+#if defined(RTS_USER_SIGNALS)
+ // mark the signal handlers (signals should be already blocked)
+ markSignalHandlers(evac);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ performGC
+
+ This is the interface to the garbage collector from Haskell land.
+ We provide this so that external C code can allocate and garbage
+ collect when called from Haskell via _ccall_GC.
+
+ It might be useful to provide an interface whereby the programmer
+ can specify more roots (ToDo).
+
+ This needs to be protected by the GC condition variable above. KH.
+ -------------------------------------------------------------------------- */
+
+// Stashes the user's extra root-marking function for AllRoots() below.
+// NOTE(review): a shared file-scope slot with no lock -- presumably
+// performGCWithRoots is not intended for concurrent callers; confirm.
+static void (*extra_roots)(evac_fn);
+
+// Common GC driver: if the calling OS thread has no Task yet, create
+// a temporary bound Task for the duration of the collection.
+static void
+performGC_(rtsBool force_major, void (*get_roots)(evac_fn))
+{
+ Task *task = myTask();
+
+ if (task == NULL) {
+ ACQUIRE_LOCK(&sched_mutex);
+ task = newBoundTask();
+ RELEASE_LOCK(&sched_mutex);
+ scheduleDoGC(NULL,task,force_major, get_roots);
+ boundTaskExiting(task);
+ } else {
+ scheduleDoGC(NULL,task,force_major, get_roots);
+ }
+}
+
+// Minor (generation-0) collection with the scheduler's roots only.
+void
+performGC(void)
+{
+ performGC_(rtsFalse, GetRoots);
+}
+
+// Major (full) collection with the scheduler's roots only.
+void
+performMajorGC(void)
+{
+ performGC_(rtsTrue, GetRoots);
+}
+
+// Root function combining the scheduler's roots with the user's
+// extra roots (stashed in 'extra_roots' above).
+static void
+AllRoots(evac_fn evac)
+{
+ GetRoots(evac); // the scheduler's roots
+ extra_roots(evac); // the user's roots
+}
+
+// Minor collection that additionally marks user-supplied roots.
+void
+performGCWithRoots(void (*get_roots)(evac_fn))
+{
+ extra_roots = get_roots;
+ performGC_(rtsFalse, AllRoots);
+}
+
+/* -----------------------------------------------------------------------------
+ Stack overflow
+
+ If the thread has reached its maximum stack size, then raise the
+ StackOverflow exception in the offending thread. Otherwise
+ relocate the TSO into a larger chunk of memory and adjust its stack
+ size appropriately.
+
+ Returns the TSO to use from now on: either the original (when the
+ exception was raised) or the relocated copy.
+ -------------------------------------------------------------------------- */
+
+static StgTSO *
+threadStackOverflow(Capability *cap, StgTSO *tso)
+{
+ nat new_stack_size, stack_words;
+ lnat new_tso_size;
+ StgPtr new_sp;
+ StgTSO *dest;
+
+ IF_DEBUG(sanity,checkTSO(tso));
+ if (tso->stack_size >= tso->max_stack_size) {
+
+ IF_DEBUG(gc,
+ debugBelch("@@ threadStackOverflow of TSO %ld (%p): stack too large (now %ld; max is %ld)\n",
+ (long)tso->id, tso, (long)tso->stack_size, (long)tso->max_stack_size);
+ /* If we're debugging, just print out the top of the stack */
+ printStackChunk(tso->sp, stg_min(tso->stack+tso->stack_size,
+ tso->sp+64)));
+
+ /* Send this thread the StackOverflow exception */
+ raiseAsync(cap, tso, (StgClosure *)stackOverflow_closure);
+ return tso;
+ }
+
+ /* Try to double the current stack size. If that takes us over the
+ * maximum stack size for this thread, then use the maximum instead.
+ * Finally round up so the TSO ends up as a whole number of blocks.
+ */
+ new_stack_size = stg_min(tso->stack_size * 2, tso->max_stack_size);
+ new_tso_size = (lnat)BLOCK_ROUND_UP(new_stack_size * sizeof(W_) +
+ TSO_STRUCT_SIZE)/sizeof(W_);
+ new_tso_size = round_to_mblocks(new_tso_size); /* Be MBLOCK-friendly */
+ new_stack_size = new_tso_size - TSO_STRUCT_SIZEW;
+
+ IF_DEBUG(scheduler, sched_belch("increasing stack size from %ld words to %d.\n", (long)tso->stack_size, new_stack_size));
+
+ dest = (StgTSO *)allocate(new_tso_size);
+ TICK_ALLOC_TSO(new_stack_size,0);
+
+ /* copy the TSO block and the old stack into the new area */
+ memcpy(dest,tso,TSO_STRUCT_SIZE);
+ // the live portion of the old stack sits at the *top* (high end),
+ // so copy it to the top of the new, larger area
+ stack_words = tso->stack + tso->stack_size - tso->sp;
+ new_sp = (P_)dest + new_tso_size - stack_words;
+ memcpy(new_sp, tso->sp, stack_words * sizeof(W_));
+
+ /* relocate the stack pointers... */
+ dest->sp = new_sp;
+ dest->stack_size = new_stack_size;
+
+ /* Mark the old TSO as relocated. We have to check for relocated
+ * TSOs in the garbage collector and any primops that deal with TSOs.
+ *
+ * It's important to set the sp value to just beyond the end
+ * of the stack, so we don't attempt to scavenge any part of the
+ * dead TSO's stack.
+ */
+ tso->what_next = ThreadRelocated;
+ tso->link = dest;
+ tso->sp = (P_)&(tso->stack[tso->stack_size]);
+ tso->why_blocked = NotBlocked;
+
+ IF_PAR_DEBUG(verbose,
+ debugBelch("@@ threadStackOverflow of TSO %d (now at %p): stack size increased to %ld\n",
+ tso->id, tso, tso->stack_size);
+ /* If we're debugging, just print out the top of the stack */
+ printStackChunk(tso->sp, stg_min(tso->stack+tso->stack_size,
+ tso->sp+64)));
+
+ IF_DEBUG(sanity,checkTSO(tso));
+#if 0
+ IF_DEBUG(scheduler,printTSO(dest));
+#endif
+
+ return dest;
+}
+
+/* ---------------------------------------------------------------------------
+ Wake up a queue that was blocked on some resource.
+ ------------------------------------------------------------------------ */
+
+// unblockCount: accounting hook called when a blocked-queue element
+// is woken.  A no-op under GRAN; under PARALLEL_HASKELL it logs a
+// RESUME event and charges the blocked/fetch time to the TSO
+// according to the type of the closure it was blocked on.
+#if defined(GRAN)
+STATIC_INLINE void
+unblockCount ( StgBlockingQueueElement *bqe, StgClosure *node )
+{
+}
+#elif defined(PARALLEL_HASKELL)
+STATIC_INLINE void
+unblockCount ( StgBlockingQueueElement *bqe, StgClosure *node )
+{
+ /* write RESUME events to log file and
+ update blocked and fetch time (depending on type of the orig closure) */
+ if (RtsFlags.ParFlags.ParStats.Full) {
+ DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
+ GR_RESUMEQ, ((StgTSO *)bqe), ((StgTSO *)bqe)->block_info.closure,
+ 0, 0 /* spark_queue_len(ADVISORY_POOL) */);
+ if (emptyRunQueue())
+ emitSchedule = rtsTrue;
+
+ switch (get_itbl(node)->type) {
+ case FETCH_ME_BQ:
+ ((StgTSO *)bqe)->par.fetchtime += CURRENT_TIME-((StgTSO *)bqe)->par.blockedat;
+ break;
+ case RBH:
+ case FETCH_ME:
+ case BLACKHOLE_BQ:
+ ((StgTSO *)bqe)->par.blocktime += CURRENT_TIME-((StgTSO *)bqe)->par.blockedat;
+ break;
+#ifdef DIST
+ case MVAR:
+ break;
+#endif
+ default:
+ barf("{unblockOne}Daq Qagh: unexpected closure in blocking queue");
+ }
+ }
+}
+#endif
+
+#if defined(GRAN)
+// unblockOne (GranSim variant): wake one element of a blocking queue
+// by posting a ResumeThread (local) or UnblockThread (remote) event,
+// charging the simulated communication costs.
+// NOTE(review): declared to return StgBlockingQueueElement* but the
+// body has no return statement -- presumably callers expect the next
+// queue element (tso->link is overwritten here, so it would have to
+// be saved first); confirm before building with GRAN.
+StgBlockingQueueElement *
+unblockOne(StgBlockingQueueElement *bqe, StgClosure *node)
+{
+ StgTSO *tso;
+ PEs node_loc, tso_loc;
+
+ node_loc = where_is(node); // should be lifted out of loop
+ tso = (StgTSO *)bqe; // wastes an assignment to get the type right
+ tso_loc = where_is((StgClosure *)tso);
+ if (IS_LOCAL_TO(PROCS(node),tso_loc)) { // TSO is local
+ /* !fake_fetch => TSO is on CurrentProc is same as IS_LOCAL_TO */
+ ASSERT(CurrentProc!=node_loc || tso_loc==CurrentProc);
+ CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.lunblocktime;
+ // insertThread(tso, node_loc);
+ new_event(tso_loc, tso_loc, CurrentTime[CurrentProc],
+ ResumeThread,
+ tso, node, (rtsSpark*)NULL);
+ tso->link = END_TSO_QUEUE; // overwrite link just to be sure
+ // len_local++;
+ // len++;
+ } else { // TSO is remote (actually should be FMBQ)
+ CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mpacktime +
+ RtsFlags.GranFlags.Costs.gunblocktime +
+ RtsFlags.GranFlags.Costs.latency;
+ new_event(tso_loc, CurrentProc, CurrentTime[CurrentProc],
+ UnblockThread,
+ tso, node, (rtsSpark*)NULL);
+ tso->link = END_TSO_QUEUE; // overwrite link just to be sure
+ // len++;
+ }
+ /* the thread-queue-overhead is accounted for in either Resume or UnblockThread */
+ IF_GRAN_DEBUG(bq,
+ debugBelch(" %s TSO %d (%p) [PE %d] (block_info.closure=%p) (next=%p) ,",
+ (node_loc==tso_loc ? "Local" : "Global"),
+ tso->id, tso, CurrentProc, tso->block_info.closure, tso->link));
+ tso->block_info.closure = NULL;
+ IF_DEBUG(scheduler,debugBelch("-- Waking up thread %ld (%p)\n",
+ tso->id, tso));
+}
+#elif defined(PARALLEL_HASKELL)
+/* Wake one element of a GUM blocking queue and return the next element.
+ *
+ * TSOs go back on the run queue; BLOCKED_FETCH closures go on the
+ * PendingFetches list.  In a DEBUG build a CONSTR element (an RBHSave
+ * closure at the end of an RBH's queue) is checked and skipped.
+ *
+ * NOTE(review): in a non-DEBUG build, reaching the CONSTR/default case
+ * would leave 'next' uninitialized — presumably callers never invoke this
+ * on such elements (awakenBlockedQueue's loop stops before them); verify.
+ */
+StgBlockingQueueElement *
+unblockOne(StgBlockingQueueElement *bqe, StgClosure *node)
+{
+  StgBlockingQueueElement *next;
+
+  switch (get_itbl(bqe)->type) {
+  case TSO:
+    ASSERT(((StgTSO *)bqe)->why_blocked != NotBlocked);
+    /* if it's a TSO just push it onto the run_queue */
+    next = bqe->link;
+    ((StgTSO *)bqe)->link = END_TSO_QUEUE; // debugging?
+    APPEND_TO_RUN_QUEUE((StgTSO *)bqe);
+    threadRunnable();
+    unblockCount(bqe, node);
+    /* reset blocking status after dumping event */
+    ((StgTSO *)bqe)->why_blocked = NotBlocked;
+    break;
+
+  case BLOCKED_FETCH:
+    /* if it's a BLOCKED_FETCH put it on the PendingFetches list */
+    next = bqe->link;
+    bqe->link = (StgBlockingQueueElement *)PendingFetches;
+    PendingFetches = (StgBlockedFetch *)bqe;
+    break;
+
+# if defined(DEBUG)
+    /* can ignore this case in a non-debugging setup;
+       see comments on RBHSave closures above */
+  case CONSTR:
+    /* check that the closure is an RBHSave closure */
+    ASSERT(get_itbl((StgClosure *)bqe) == &stg_RBH_Save_0_info ||
+	   get_itbl((StgClosure *)bqe) == &stg_RBH_Save_1_info ||
+	   get_itbl((StgClosure *)bqe) == &stg_RBH_Save_2_info);
+    break;
+
+  default:
+    barf("{unblockOne}Daq Qagh: Unexpected IP (%#lx; %s) in blocking queue at %#lx\n",
+	 get_itbl((StgClosure *)bqe), info_type((StgClosure *)bqe),
+	 (StgClosure *)bqe);
+# endif
+  }
+  IF_PAR_DEBUG(bq, debugBelch(", %p (%s)\n", bqe, info_type((StgClosure*)bqe)));
+  return next;
+}
+#endif
+
+/* Wake a single blocked thread and return the next TSO in its queue.
+ *
+ * Marks the TSO as no longer blocked and unlinks it from its queue.  In
+ * the threaded RTS the thread is either appended to the current
+ * Capability's run queue (possibly migrating it, and its bound Task,
+ * here) or handed back to the Capability it was last on via
+ * wakeupThreadOnCapability().
+ */
+StgTSO *
+unblockOne(Capability *cap, StgTSO *tso)
+{
+  StgTSO *next;
+
+  ASSERT(get_itbl(tso)->type == TSO);
+  ASSERT(tso->why_blocked != NotBlocked);
+
+  tso->why_blocked = NotBlocked;
+  next = tso->link;
+  tso->link = END_TSO_QUEUE;
+
+#if defined(THREADED_RTS)
+  if (tso->cap == cap || (!tsoLocked(tso) && RtsFlags.ParFlags.wakeupMigrate)) {
+      // We are waking up this thread on the current Capability, which
+      // might involve migrating it from the Capability it was last on.
+      if (tso->bound) {
+	  ASSERT(tso->bound->cap == tso->cap);
+	  tso->bound->cap = cap;
+      }
+      tso->cap = cap;
+      appendToRunQueue(cap,tso);
+      // we're holding a newly woken thread, make sure we context switch
+      // quickly so we can migrate it if necessary.
+      context_switch = 1;
+  } else {
+      // we'll try to wake it up on the Capability it was last on.
+      wakeupThreadOnCapability(tso->cap, tso);
+  }
+#else
+  appendToRunQueue(cap,tso);
+  context_switch = 1;
+#endif
+
+  IF_DEBUG(scheduler,sched_belch("waking up thread %ld on cap %d", (long)tso->id, tso->cap->no));
+  return next;
+}
+
+
+#if defined(GRAN)
+/* GranSim: wake every TSO on the blocking queue 'q' of closure 'node'.
+ *
+ * If the node is not on the current PE a "fake fetch" is simulated by
+ * adding CurrentProc to the node's PE bitmask.  The queue is walked with
+ * unblockOne(); if the queue belonged to an RBH, the trailing RBHSave
+ * closure is used to restore the two words the RBH displaced.
+ */
+void
+awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
+{
+  StgBlockingQueueElement *bqe;
+  PEs node_loc;
+  nat len = 0;
+
+  IF_GRAN_DEBUG(bq,
+		debugBelch("##-_ AwBQ for node %p on PE %d @ %ld by TSO %d (%p): \n", \
+		      node, CurrentProc, CurrentTime[CurrentProc],
+		      CurrentTSO->id, CurrentTSO));
+
+  node_loc = where_is(node);
+
+  ASSERT(q == END_BQ_QUEUE ||
+	 get_itbl(q)->type == TSO ||   // q is either a TSO or an RBHSave
+	 get_itbl(q)->type == CONSTR); // closure (type constructor)
+  ASSERT(is_unique(node));
+
+  /* FAKE FETCH: magically copy the node to the tso's proc;
+     no Fetch necessary because in reality the node should not have been
+     moved to the other PE in the first place
+  */
+  if (CurrentProc!=node_loc) {
+    IF_GRAN_DEBUG(bq,
+		  debugBelch("## node %p is on PE %d but CurrentProc is %d (TSO %d); assuming fake fetch and adjusting bitmask (old: %#x)\n",
+			node, node_loc, CurrentProc, CurrentTSO->id,
+			// CurrentTSO, where_is(CurrentTSO),
+			node->header.gran.procs));
+    node->header.gran.procs = (node->header.gran.procs) | PE_NUMBER(CurrentProc);
+    IF_GRAN_DEBUG(bq,
+		  debugBelch("## new bitmask of node %p is %#x\n",
+			node, node->header.gran.procs));
+    if (RtsFlags.GranFlags.GranSimStats.Global) {
+      globalGranStats.tot_fake_fetches++;
+    }
+  }
+
+  bqe = q;
+  // ToDo: check: ASSERT(CurrentProc==node_loc);
+  while (get_itbl(bqe)->type==TSO) { // q != END_TSO_QUEUE) {
+    //next = bqe->link;
+    /*
+       bqe points to the current element in the queue
+       next points to the next element in the queue
+    */
+    //tso = (StgTSO *)bqe;  // wastes an assignment to get the type right
+    //tso_loc = where_is(tso);
+    len++;
+    bqe = unblockOne(bqe, node);
+  }
+
+  /* if this is the BQ of an RBH, we have to put back the info ripped out of
+     the closure to make room for the anchor of the BQ */
+  if (bqe!=END_BQ_QUEUE) {
+    ASSERT(get_itbl(node)->type == RBH && get_itbl(bqe)->type == CONSTR);
+    /*
+    ASSERT((info_ptr==&RBH_Save_0_info) ||
+	   (info_ptr==&RBH_Save_1_info) ||
+	   (info_ptr==&RBH_Save_2_info));
+    */
+    /* cf. convertToRBH in RBH.c for writing the RBHSave closure */
+    ((StgRBH *)node)->blocking_queue = (StgBlockingQueueElement *)((StgRBHSave *)bqe)->payload[0];
+    ((StgRBH *)node)->mut_link       = (StgMutClosure *)((StgRBHSave *)bqe)->payload[1];
+
+    IF_GRAN_DEBUG(bq,
+		  debugBelch("## Filled in RBH_Save for %p (%s) at end of AwBQ\n",
+			node, info_type(node)));
+  }
+
+  /* statistics gathering */
+  if (RtsFlags.GranFlags.GranSimStats.Global) {
+    // globalGranStats.tot_bq_processing_time += bq_processing_time;
+    globalGranStats.tot_bq_len += len;      // total length of all bqs awakened
+    // globalGranStats.tot_bq_len_local += len_local;  // same for local TSOs only
+    globalGranStats.tot_awbq++;             // total no. of bqs awakened
+  }
+  IF_GRAN_DEBUG(bq,
+		debugBelch("## BQ Stats of %p: [%d entries] %s\n",
+		      node, len, (bqe!=END_BQ_QUEUE) ? "RBH" : ""));
+}
+#elif defined(PARALLEL_HASKELL)
+/* GUM: wake every TSO and BLOCKED_FETCH on the blocking queue 'q' of
+ * closure 'node', using unblockOne() to step through the queue.  Stops
+ * at the first element that is neither (e.g. a trailing RBHSave CONSTR).
+ */
+void
+awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
+{
+  StgBlockingQueueElement *bqe;
+
+  IF_PAR_DEBUG(verbose,
+	       debugBelch("##-_ AwBQ for node %p on [%x]: \n",
+		     node, mytid));
+#ifdef DIST
+  //RFP
+  if(get_itbl(q)->type == CONSTR || q==END_BQ_QUEUE) {
+    IF_PAR_DEBUG(verbose, debugBelch("## ... nothing to unblock so lets just return. RFP (BUG?)\n"));
+    return;
+  }
+#endif
+
+  ASSERT(q == END_BQ_QUEUE ||
+	 get_itbl(q)->type == TSO ||
+	 get_itbl(q)->type == BLOCKED_FETCH ||
+	 get_itbl(q)->type == CONSTR);
+
+  bqe = q;
+  while (get_itbl(bqe)->type==TSO ||
+	 get_itbl(bqe)->type==BLOCKED_FETCH) {
+    bqe = unblockOne(bqe, node);
+  }
+}
+
+#else /* !GRAN && !PARALLEL_HASKELL */
+
+/* Wake every thread on the blocked queue headed by 'tso'.  Each call to
+ * unblockOne() wakes one thread and yields the next link in the chain.
+ */
+void
+awakenBlockedQueue(Capability *cap, StgTSO *tso)
+{
+    if (tso == NULL) return; // hack; see bug #1235728, and comments in
+                             // Exception.cmm
+    for (; tso != END_TSO_QUEUE; tso = unblockOne(cap, tso)) {
+        // all the work happens inside unblockOne()
+    }
+}
+#endif
+
+/* ---------------------------------------------------------------------------
+ Interrupt execution
+ - usually called inside a signal handler so it mustn't do anything fancy.
+ ------------------------------------------------------------------------ */
+
+/* Interrupt the RTS: move the scheduler into the INTERRUPTING state and
+ * request an immediate context switch.  Usually called from a signal
+ * handler, so it must stay async-signal-simple (plain stores plus, in
+ * the threaded RTS, prodding the Capabilities awake).
+ */
+void
+interruptStgRts(void)
+{
+    sched_state = SCHED_INTERRUPTING;
+    context_switch = 1;
+#if defined(THREADED_RTS)
+    prodAllCapabilities();
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Unblock a thread
+
+ This is for use when we raise an exception in another thread, which
+ may be blocked.
+ This has nothing to do with the UnblockThread event in GranSim. -- HWL
+ -------------------------------------------------------------------------- */
+
+#if defined(GRAN) || defined(PARALLEL_HASKELL)
+/*
+ NB: only the type of the blocking queue is different in GranSim and GUM
+ the operations on the queue-elements are the same
+ long live polymorphism!
+
+ Locks: sched_mutex is held upon entry and exit.
+
+*/
+/* Remove 'tso' from whatever queue it is blocked on (MVar, black hole,
+ * exception delivery, I/O or sleep queues) and push it on the run queue.
+ * GranSim/GUM version: queue elements are StgBlockingQueueElement.
+ * barf()s if the TSO claims to be blocked but cannot be found on the
+ * corresponding queue.  Locks: sched_mutex held on entry and exit.
+ */
+static void
+unblockThread(Capability *cap, StgTSO *tso)
+{
+  StgBlockingQueueElement *t, **last;
+
+  switch (tso->why_blocked) {
+
+  case NotBlocked:
+    return;  /* not blocked */
+
+  case BlockedOnSTM:
+    // Be careful: nothing to do here!  We tell the scheduler that the thread
+    // is runnable and we leave it to the stack-walking code to abort the
+    // transaction while unwinding the stack.  We should perhaps have a debugging
+    // test to make sure that this really happens and that the 'zombie' transaction
+    // does not get committed.
+    goto done;
+
+  case BlockedOnMVar:
+    ASSERT(get_itbl(tso->block_info.closure)->type == MVAR);
+    {
+      StgBlockingQueueElement *last_tso = END_BQ_QUEUE;
+      StgMVar *mvar = (StgMVar *)(tso->block_info.closure);
+
+      last = (StgBlockingQueueElement **)&mvar->head;
+      for (t = (StgBlockingQueueElement *)mvar->head;
+	   t != END_BQ_QUEUE;
+	   last = &t->link, last_tso = t, t = t->link) {
+	if (t == (StgBlockingQueueElement *)tso) {
+	  *last = (StgBlockingQueueElement *)tso->link;
+	  // keep the MVar's tail pointer consistent with the unlink
+	  if (mvar->tail == tso) {
+	    mvar->tail = (StgTSO *)last_tso;
+	  }
+	  goto done;
+	}
+      }
+      barf("unblockThread (MVAR): TSO not found");
+    }
+
+  case BlockedOnBlackHole:
+    ASSERT(get_itbl(tso->block_info.closure)->type == BLACKHOLE_BQ);
+    {
+      StgBlockingQueue *bq = (StgBlockingQueue *)(tso->block_info.closure);
+
+      last = &bq->blocking_queue;
+      for (t = bq->blocking_queue;
+	   t != END_BQ_QUEUE;
+	   last = &t->link, t = t->link) {
+	if (t == (StgBlockingQueueElement *)tso) {
+	  *last = (StgBlockingQueueElement *)tso->link;
+	  goto done;
+	}
+      }
+      barf("unblockThread (BLACKHOLE): TSO not found");
+    }
+
+  case BlockedOnException:
+    {
+      StgTSO *target  = tso->block_info.tso;
+
+      ASSERT(get_itbl(target)->type == TSO);
+
+      // follow a possible forwarding pointer to the real TSO
+      if (target->what_next == ThreadRelocated) {
+	  target = target->link;
+	  ASSERT(get_itbl(target)->type == TSO);
+      }
+
+      ASSERT(target->blocked_exceptions != NULL);
+
+      last = (StgBlockingQueueElement **)&target->blocked_exceptions;
+      for (t = (StgBlockingQueueElement *)target->blocked_exceptions;
+	   t != END_BQ_QUEUE;
+	   last = &t->link, t = t->link) {
+	ASSERT(get_itbl(t)->type == TSO);
+	if (t == (StgBlockingQueueElement *)tso) {
+	  *last = (StgBlockingQueueElement *)tso->link;
+	  goto done;
+	}
+      }
+      barf("unblockThread (Exception): TSO not found");
+    }
+
+  case BlockedOnRead:
+  case BlockedOnWrite:
+#if defined(mingw32_HOST_OS)
+  case BlockedOnDoProc:
+#endif
+    {
+      /* take TSO off blocked_queue */
+      StgBlockingQueueElement *prev = NULL;
+      for (t = (StgBlockingQueueElement *)blocked_queue_hd; t != END_BQ_QUEUE;
+	   prev = t, t = t->link) {
+	if (t == (StgBlockingQueueElement *)tso) {
+	  if (prev == NULL) {
+	    blocked_queue_hd = (StgTSO *)t->link;
+	    if ((StgBlockingQueueElement *)blocked_queue_tl == t) {
+	      blocked_queue_tl = END_TSO_QUEUE;
+	    }
+	  } else {
+	    prev->link = t->link;
+	    if ((StgBlockingQueueElement *)blocked_queue_tl == t) {
+	      blocked_queue_tl = (StgTSO *)prev;
+	    }
+	  }
+#if defined(mingw32_HOST_OS)
+	  /* (Cooperatively) signal that the worker thread should abort
+	   * the request.
+	   */
+	  abandonWorkRequest(tso->block_info.async_result->reqID);
+#endif
+	  goto done;
+	}
+      }
+      barf("unblockThread (I/O): TSO not found");
+    }
+
+  case BlockedOnDelay:
+    {
+      /* take TSO off sleeping_queue */
+      StgBlockingQueueElement *prev = NULL;
+      for (t = (StgBlockingQueueElement *)sleeping_queue; t != END_BQ_QUEUE;
+	   prev = t, t = t->link) {
+	if (t == (StgBlockingQueueElement *)tso) {
+	  if (prev == NULL) {
+	    sleeping_queue = (StgTSO *)t->link;
+	  } else {
+	    prev->link = t->link;
+	  }
+	  goto done;
+	}
+      }
+      barf("unblockThread (delay): TSO not found");
+    }
+
+  default:
+    barf("unblockThread");
+  }
+
+ done:
+  // unlinked from its queue: mark runnable and put it on the run queue
+  tso->link = END_TSO_QUEUE;
+  tso->why_blocked = NotBlocked;
+  tso->block_info.closure = NULL;
+  pushOnRunQueue(cap,tso);
+}
+#else
+/* Remove 'tso' from whatever queue it is blocked on (MVar, black hole,
+ * exception delivery, and — single-threaded RTS only — the I/O and
+ * sleep queues) and append it to this Capability's run queue, migrating
+ * it (and its bound Task, if any) to this Capability.  barf()s if the
+ * TSO claims to be blocked but cannot be found on the matching queue.
+ */
+static void
+unblockThread(Capability *cap, StgTSO *tso)
+{
+  StgTSO *t, **last;
+
+  /* To avoid locking unnecessarily. */
+  if (tso->why_blocked == NotBlocked) {
+    return;
+  }
+
+  switch (tso->why_blocked) {
+
+  case BlockedOnSTM:
+    // Be careful: nothing to do here!  We tell the scheduler that the thread
+    // is runnable and we leave it to the stack-walking code to abort the
+    // transaction while unwinding the stack.  We should perhaps have a debugging
+    // test to make sure that this really happens and that the 'zombie' transaction
+    // does not get committed.
+    goto done;
+
+  case BlockedOnMVar:
+    ASSERT(get_itbl(tso->block_info.closure)->type == MVAR);
+    {
+      StgTSO *last_tso = END_TSO_QUEUE;
+      StgMVar *mvar = (StgMVar *)(tso->block_info.closure);
+
+      last = &mvar->head;
+      for (t = mvar->head; t != END_TSO_QUEUE;
+	   last = &t->link, last_tso = t, t = t->link) {
+	if (t == tso) {
+	  *last = tso->link;
+	  // keep the MVar's tail pointer consistent with the unlink
+	  if (mvar->tail == tso) {
+	    mvar->tail = last_tso;
+	  }
+	  goto done;
+	}
+      }
+      barf("unblockThread (MVAR): TSO not found");
+    }
+
+  case BlockedOnBlackHole:
+    {
+      last = &blackhole_queue;
+      for (t = blackhole_queue; t != END_TSO_QUEUE;
+	   last = &t->link, t = t->link) {
+	if (t == tso) {
+	  *last = tso->link;
+	  goto done;
+	}
+      }
+      barf("unblockThread (BLACKHOLE): TSO not found");
+    }
+
+  case BlockedOnException:
+    {
+      StgTSO *target  = tso->block_info.tso;
+
+      ASSERT(get_itbl(target)->type == TSO);
+
+      // follow forwarding pointers to the real TSO
+      while (target->what_next == ThreadRelocated) {
+	  target = target->link;
+	  ASSERT(get_itbl(target)->type == TSO);
+      }
+
+      ASSERT(target->blocked_exceptions != NULL);
+
+      last = &target->blocked_exceptions;
+      for (t = target->blocked_exceptions; t != END_TSO_QUEUE;
+	   last = &t->link, t = t->link) {
+	ASSERT(get_itbl(t)->type == TSO);
+	if (t == tso) {
+	  *last = tso->link;
+	  goto done;
+	}
+      }
+      barf("unblockThread (Exception): TSO not found");
+    }
+
+#if !defined(THREADED_RTS)
+  case BlockedOnRead:
+  case BlockedOnWrite:
+#if defined(mingw32_HOST_OS)
+  case BlockedOnDoProc:
+#endif
+    {
+      StgTSO *prev = NULL;
+      for (t = blocked_queue_hd; t != END_TSO_QUEUE;
+	   prev = t, t = t->link) {
+	if (t == tso) {
+	  if (prev == NULL) {
+	    blocked_queue_hd = t->link;
+	    if (blocked_queue_tl == t) {
+	      blocked_queue_tl = END_TSO_QUEUE;
+	    }
+	  } else {
+	    prev->link = t->link;
+	    if (blocked_queue_tl == t) {
+	      blocked_queue_tl = prev;
+	    }
+	  }
+#if defined(mingw32_HOST_OS)
+	  /* (Cooperatively) signal that the worker thread should abort
+	   * the request.
+	   */
+	  abandonWorkRequest(tso->block_info.async_result->reqID);
+#endif
+	  goto done;
+	}
+      }
+      barf("unblockThread (I/O): TSO not found");
+    }
+
+  case BlockedOnDelay:
+    {
+      StgTSO *prev = NULL;
+      for (t = sleeping_queue; t != END_TSO_QUEUE;
+	   prev = t, t = t->link) {
+	if (t == tso) {
+	  if (prev == NULL) {
+	    sleeping_queue = t->link;
+	  } else {
+	    prev->link = t->link;
+	  }
+	  goto done;
+	}
+      }
+      barf("unblockThread (delay): TSO not found");
+    }
+#endif
+
+  default:
+    barf("unblockThread");
+  }
+
+ done:
+  // unlinked from its queue: mark runnable and put it on our run queue
+  tso->link = END_TSO_QUEUE;
+  tso->why_blocked = NotBlocked;
+  tso->block_info.closure = NULL;
+  appendToRunQueue(cap,tso);
+
+  // We might have just migrated this TSO to our Capability:
+  if (tso->bound) {
+      tso->bound->cap = cap;
+  }
+  tso->cap = cap;
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * checkBlackHoles()
+ *
+ * Check the blackhole_queue for threads that can be woken up. We do
+ * this periodically: before every GC, and whenever the run queue is
+ * empty.
+ *
+ * An elegant solution might be to just wake up all the blocked
+ * threads with awakenBlockedQueue occasionally: they'll go back to
+ * sleep again if the object is still a BLACKHOLE. Unfortunately this
+ * doesn't give us a way to tell whether we've actually managed to
+ * wake up any threads, so we would be busy-waiting.
+ *
+ * -------------------------------------------------------------------------- */
+
+/* Scan the global blackhole_queue and wake any thread whose blocked-on
+ * closure is no longer a (CAF_)BLACKHOLE.  Returns rtsTrue if at least
+ * one thread was woken.  Caller must hold sched_mutex.
+ */
+static rtsBool
+checkBlackHoles (Capability *cap)
+{
+    StgTSO **prev, *t;
+    rtsBool any_woke_up = rtsFalse;
+    StgHalfWord type;
+
+    // blackhole_queue is global:
+    ASSERT_LOCK_HELD(&sched_mutex);
+
+    IF_DEBUG(scheduler, sched_belch("checking threads blocked on black holes"));
+
+    // ASSUMES: sched_mutex
+    prev = &blackhole_queue;
+    t = blackhole_queue;
+    while (t != END_TSO_QUEUE) {
+	ASSERT(t->why_blocked == BlockedOnBlackHole);
+	type = get_itbl(t->block_info.closure)->type;
+	if (type != BLACKHOLE && type != CAF_BLACKHOLE) {
+	    IF_DEBUG(sanity,checkTSO(t));
+	    // unblockOne() returns the successor; splice it into the
+	    // queue in place of the thread we just woke.
+	    t = unblockOne(cap, t);
+	    // urk, the threads migrate to the current capability
+	    // here, but we'd like to keep them on the original one.
+	    *prev = t;
+	    any_woke_up = rtsTrue;
+	} else {
+	    prev = &t->link;
+	    t = t->link;
+	}
+    }
+
+    return any_woke_up;
+}
+
+/* -----------------------------------------------------------------------------
+ * raiseAsync()
+ *
+ * The following function implements the magic for raising an
+ * asynchronous exception in an existing thread.
+ *
+ * We first remove the thread from any queue on which it might be
+ * blocked. The possible blockages are MVARs and BLACKHOLE_BQs.
+ *
+ * We strip the stack down to the innermost CATCH_FRAME, building
+ * thunks in the heap for all the active computations, so they can
+ * be restarted if necessary. When we reach a CATCH_FRAME, we build
+ * an application of the handler to the exception, and push it on
+ * the top of the stack.
+ *
+ * How exactly do we save all the active computations? We create an
+ * AP_STACK for every UpdateFrame on the stack. Entering one of these
+ * AP_STACKs pushes everything from the corresponding update frame
+ * upwards onto the stack. (Actually, it pushes everything up to the
+ * next update frame plus a pointer to the next AP_STACK object.
+ * Entering the next AP_STACK object pushes more onto the stack until we
+ * reach the last AP_STACK object - at which point the stack should look
+ * exactly as it did when we killed the TSO and we can continue
+ * execution by entering the closure on top of the stack.
+ *
+ * We can also kill a thread entirely - this happens if either (a) the
+ * exception passed to raiseAsync is NULL, or (b) there's no
+ * CATCH_FRAME on the stack. In either case, we strip the entire
+ * stack and replace the thread with a zombie.
+ *
+ * ToDo: in THREADED_RTS mode, this function is only safe if either
+ * (a) we hold all the Capabilities (eg. in GC, or if there is only
+ * one Capability), or (b) we own the Capability that the TSO is
+ * currently blocked on or on the run queue of.
+ *
+ * -------------------------------------------------------------------------- */
+
+/* Raise an asynchronous exception in 'tso' (NULL exception kills the
+ * thread outright).  Thin wrapper over raiseAsync_: no atomically-frame
+ * stop, no stop_here limit.
+ */
+void
+raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception)
+{
+    raiseAsync_(cap, tso, exception, rtsFalse, NULL);
+}
+
+/* Freeze 'tso's computation: strip its stack down to 'stop_here',
+ * building AP_STACKs for the suspended frames, without raising any
+ * exception (the exception argument to raiseAsync_ is NULL).
+ */
+void
+suspendComputation(Capability *cap, StgTSO *tso, StgPtr stop_here)
+{
+    raiseAsync_(cap, tso, NULL, rtsFalse, stop_here);
+}
+
+/* Core of asynchronous exception delivery: unblock 'tso', then walk its
+ * stack from the top, packaging each update frame's stack chunk into an
+ * AP_STACK (so the computation can be resumed) until we hit a
+ * CATCH_FRAME (apply the handler to 'exception'), a STOP_FRAME (kill
+ * the thread), an ATOMICALLY_FRAME (condemn or abort the transaction),
+ * or 'stop_here'.  See the long comment above for the full story.
+ */
+static void
+raiseAsync_(Capability *cap, StgTSO *tso, StgClosure *exception,
+	    rtsBool stop_at_atomically, StgPtr stop_here)
+{
+    StgRetInfoTable *info;
+    StgPtr sp, frame;
+    nat i;
+
+    // Thread already dead?
+    if (tso->what_next == ThreadComplete || tso->what_next == ThreadKilled) {
+	return;
+    }
+
+    IF_DEBUG(scheduler,
+	     sched_belch("raising exception in thread %ld.", (long)tso->id));
+
+    // Remove it from any blocking queues
+    unblockThread(cap,tso);
+
+    // mark it dirty; we're about to change its stack.
+    dirtyTSO(tso);
+
+    sp = tso->sp;
+
+    // The stack freezing code assumes there's a closure pointer on
+    // the top of the stack, so we have to arrange that this is the case...
+    //
+    if (sp[0] == (W_)&stg_enter_info) {
+	sp++;
+    } else {
+	sp--;
+	sp[0] = (W_)&stg_dummy_ret_closure;
+    }
+
+    frame = sp + 1;
+    while (stop_here == NULL || frame < stop_here) {
+
+	// 1. Let the top of the stack be the "current closure"
+	//
+	// 2. Walk up the stack until we find either an UPDATE_FRAME or a
+	// CATCH_FRAME.
+	//
+	// 3. If it's an UPDATE_FRAME, then make an AP_STACK containing the
+	// current closure applied to the chunk of stack up to (but not
+	// including) the update frame.  This closure becomes the "current
+	// closure".  Go back to step 2.
+	//
+	// 4. If it's a CATCH_FRAME, then leave the exception handler on
+	// top of the stack applied to the exception.
+	//
+	// 5. If it's a STOP_FRAME, then kill the thread.
+        //
+        // NB: if we pass an ATOMICALLY_FRAME then abort the associated
+        // transaction
+
+	info = get_ret_itbl((StgClosure *)frame);
+
+	switch (info->i.type) {
+
+	case UPDATE_FRAME:
+	{
+	    StgAP_STACK * ap;
+	    nat words;
+
+	    // First build an AP_STACK consisting of the stack chunk above the
+	    // current update frame, with the top word on the stack as the
+	    // fun field.
+	    //
+	    words = frame - sp - 1;
+	    ap = (StgAP_STACK *)allocateLocal(cap,AP_STACK_sizeW(words));
+
+	    ap->size = words;
+	    ap->fun  = (StgClosure *)sp[0];
+	    sp++;
+	    for(i=0; i < (nat)words; ++i) {
+		ap->payload[i] = (StgClosure *)*sp++;
+	    }
+
+	    SET_HDR(ap,&stg_AP_STACK_info,
+		    ((StgClosure *)frame)->header.prof.ccs /* ToDo */);
+	    TICK_ALLOC_UP_THK(words+1,0);
+
+	    IF_DEBUG(scheduler,
+		     debugBelch("sched: Updating ");
+		     printPtr((P_)((StgUpdateFrame *)frame)->updatee);
+		     debugBelch(" with ");
+		     printObj((StgClosure *)ap);
+		);
+
+	    // Replace the updatee with an indirection
+	    //
+	    // Warning: if we're in a loop, more than one update frame on
+	    // the stack may point to the same object.  Be careful not to
+	    // overwrite an IND_OLDGEN in this case, because we'll screw
+	    // up the mutable lists.  To be on the safe side, don't
+	    // overwrite any kind of indirection at all.  See also
+	    // threadSqueezeStack in GC.c, where we have to make a similar
+	    // check.
+	    //
+	    if (!closure_IND(((StgUpdateFrame *)frame)->updatee)) {
+		// revert the black hole
+		UPD_IND_NOLOCK(((StgUpdateFrame *)frame)->updatee,
+			       (StgClosure *)ap);
+	    }
+	    sp += sizeofW(StgUpdateFrame) - 1;
+	    sp[0] = (W_)ap; // push onto stack
+	    frame = sp + 1;
+	    continue; //no need to bump frame
+	}
+
+	case STOP_FRAME:
+	    // We've stripped the entire stack, the thread is now dead.
+	    tso->what_next = ThreadKilled;
+	    tso->sp = frame + sizeofW(StgStopFrame);
+	    return;
+
+	case CATCH_FRAME:
+	    // If we find a CATCH_FRAME, and we've got an exception to raise,
+	    // then build the THUNK raise(exception), and leave it on
+	    // top of the CATCH_FRAME ready to enter.
+	    //
+	{
+#ifdef PROFILING
+	    StgCatchFrame *cf = (StgCatchFrame *)frame;
+#endif
+	    StgThunk *raise;
+
+	    if (exception == NULL) break;
+
+	    // we've got an exception to raise, so let's pass it to the
+	    // handler in this frame.
+	    //
+	    raise = (StgThunk *)allocateLocal(cap,sizeofW(StgThunk)+1);
+	    TICK_ALLOC_SE_THK(1,0);
+	    SET_HDR(raise,&stg_raise_info,cf->header.prof.ccs);
+	    raise->payload[0] = exception;
+
+	    // throw away the stack from Sp up to the CATCH_FRAME.
+	    //
+	    sp = frame - 1;
+
+	    /* Ensure that async exceptions are blocked now, so we don't get
+	     * a surprise exception before we get around to executing the
+	     * handler.
+	     */
+	    if (tso->blocked_exceptions == NULL) {
+		tso->blocked_exceptions = END_TSO_QUEUE;
+	    }
+
+	    /* Put the newly-built THUNK on top of the stack, ready to execute
+	     * when the thread restarts.
+	     */
+	    sp[0] = (W_)raise;
+	    sp[-1] = (W_)&stg_enter_info;
+	    tso->sp = sp-1;
+	    tso->what_next = ThreadRunGHC;
+	    IF_DEBUG(sanity, checkTSO(tso));
+	    return;
+	}
+
+	case ATOMICALLY_FRAME:
+	    if (stop_at_atomically) {
+		ASSERT(stmGetEnclosingTRec(tso->trec) == NO_TREC);
+		stmCondemnTransaction(cap, tso -> trec);
+#ifdef REG_R1
+		tso->sp = frame;
+#else
+		// R1 is not a register: the return convention for IO in
+		// this case puts the return value on the stack, so we
+		// need to set up the stack to return to the atomically
+		// frame properly...
+		tso->sp = frame - 2;
+		tso->sp[1] = (StgWord) &stg_NO_FINALIZER_closure; // why not?
+		tso->sp[0] = (StgWord) &stg_ut_1_0_unreg_info;
+#endif
+		tso->what_next = ThreadRunGHC;
+		return;
+	    }
+	    // Not stop_at_atomically... fall through and abort the
+	    // transaction.
+
+	case CATCH_RETRY_FRAME:
+	    // IF we find an ATOMICALLY_FRAME then we abort the
+	    // current transaction and propagate the exception.  In
+	    // this case (unlike ordinary exceptions) we do not care
+	    // whether the transaction is valid or not because its
+	    // possible validity cannot have caused the exception
+	    // and will not be visible after the abort.
+	    IF_DEBUG(stm,
+		     debugBelch("Found atomically block delivering async exception\n"));
+            StgTRecHeader *trec = tso -> trec;
+            StgTRecHeader *outer = stmGetEnclosingTRec(trec);
+            stmAbortTransaction(cap, trec);
+            tso -> trec = outer;
+	    break;
+
+	default:
+	    break;
+	}
+
+	// move on to the next stack frame
+	frame += stack_frame_sizeW((StgClosure *)frame);
+    }
+
+    // if we got here, then we stopped at stop_here
+    ASSERT(stop_here != NULL);
+}
+
+/* -----------------------------------------------------------------------------
+ Deleting threads
+
+ This is used for interruption (^C) and forking, and corresponds to
+ raising an exception but without letting the thread catch the
+ exception.
+ -------------------------------------------------------------------------- */
+
+/* Kill 'tso' by raising a NULL exception in it (the thread cannot catch
+ * it).  Threads that are currently in a foreign call are left alone —
+ * they will be dealt with when the call returns.
+ */
+static void
+deleteThread (Capability *cap, StgTSO *tso)
+{
+    switch (tso->why_blocked) {
+    case BlockedOnCCall:
+    case BlockedOnCCall_NoUnblockExc:
+	// in a foreign call: do not touch it here
+	break;
+    default:
+	raiseAsync(cap,tso,NULL);
+	break;
+    }
+}
+
+#ifdef FORKPROCESS_PRIMOP_SUPPORTED
+/* forkProcess-only variant of deleteThread(): additionally kills
+ * threads that are sitting in a foreign call, by waking them and
+ * marking them ThreadKilled.
+ */
+static void
+deleteThread_(Capability *cap, StgTSO *tso)
+{
+    rtsBool in_ccall =
+	tso->why_blocked == BlockedOnCCall ||
+	tso->why_blocked == BlockedOnCCall_NoUnblockExc;
+
+    if (in_ccall) {
+	unblockOne(cap,tso);
+	tso->what_next = ThreadKilled;
+    } else {
+	deleteThread(cap,tso);
+    }
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ raiseExceptionHelper
+
+   This function is called by the raise# primitive, just so that we can
+ move some of the tricky bits of raising an exception from C-- into
+ C. Who knows, it might be a useful re-useable thing here too.
+ -------------------------------------------------------------------------- */
+
+/* Helper for the raise# primop: walk up tso's stack looking for a frame
+ * that handles the exception, overwriting each update frame's updatee
+ * with a shared 'raise# exception' thunk on the way.  Leaves tso->sp at
+ * the handling frame and returns its frame type (ATOMICALLY_FRAME,
+ * CATCH_FRAME, CATCH_STM_FRAME or STOP_FRAME).
+ */
+StgWord
+raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception)
+{
+    Capability *cap = regTableToCapability(reg);
+    StgThunk *raise_closure = NULL;
+    StgPtr p, next;
+    StgRetInfoTable *info;
+    //
+    // This closure represents the expression 'raise# E' where E
+    // is the exception raise.  It is used to overwrite all the
+    // thunks which are currently under evaluation.
+    //
+
+    // OLD COMMENT (we don't have MIN_UPD_SIZE now):
+    // LDV profiling: stg_raise_info has THUNK as its closure
+    // type. Since a THUNK takes at least MIN_UPD_SIZE words in its
+    // payload, MIN_UPD_SIZE is more appropriate than 1.  It seems that
+    // 1 does not cause any problem unless profiling is performed.
+    // However, when LDV profiling goes on, we need to linearly scan
+    // small object pool, where raise_closure is stored, so we should
+    // use MIN_UPD_SIZE.
+    //
+    // raise_closure = (StgClosure *)RET_STGCALL1(P_,allocate,
+    // 				       sizeofW(StgClosure)+1);
+    //
+
+    //
+    // Walk up the stack, looking for the catch frame.  On the way,
+    // we update any closures pointed to from update frames with the
+    // raise closure that we just built.
+    //
+    p = tso->sp;
+    while(1) {
+	info = get_ret_itbl((StgClosure *)p);
+	next = p + stack_frame_sizeW((StgClosure *)p);
+	switch (info->i.type) {
+
+	case UPDATE_FRAME:
+	    // Only create raise_closure if we need to.
+	    if (raise_closure == NULL) {
+		raise_closure =
+		    (StgThunk *)allocateLocal(cap,sizeofW(StgThunk)+1);
+		SET_HDR(raise_closure, &stg_raise_info, CCCS);
+		raise_closure->payload[0] = exception;
+	    }
+	    UPD_IND(((StgUpdateFrame *)p)->updatee,(StgClosure *)raise_closure);
+	    p = next;
+	    continue;
+
+        case ATOMICALLY_FRAME:
+            IF_DEBUG(stm, debugBelch("Found ATOMICALLY_FRAME at %p\n", p));
+            tso->sp = p;
+            return ATOMICALLY_FRAME;
+
+	case CATCH_FRAME:
+	    tso->sp = p;
+	    return CATCH_FRAME;
+
+        case CATCH_STM_FRAME:
+            IF_DEBUG(stm, debugBelch("Found CATCH_STM_FRAME at %p\n", p));
+            tso->sp = p;
+            return CATCH_STM_FRAME;
+
+	case STOP_FRAME:
+	    tso->sp = p;
+	    return STOP_FRAME;
+
+        case CATCH_RETRY_FRAME:
+	default:
+	    p = next;
+	    continue;
+	}
+    }
+}
+
+
+/* -----------------------------------------------------------------------------
+ findRetryFrameHelper
+
+ This function is called by the retry# primitive. It traverses the stack
+ leaving tso->sp referring to the frame which should handle the retry.
+
+ This should either be a CATCH_RETRY_FRAME (if the retry# is within an orElse#)
+ or should be a ATOMICALLY_FRAME (if the retry# reaches the top level).
+
+ We skip CATCH_STM_FRAMEs because retries are not considered to be exceptions,
+ despite the similar implementation.
+
+ We should not expect to see CATCH_FRAME or STOP_FRAME because those should
+ not be created within memory transactions.
+ -------------------------------------------------------------------------- */
+
+/* Helper for the retry# primop: walk up tso's stack to the frame that
+ * should handle the retry, leaving tso->sp pointing at it.  Returns
+ * CATCH_RETRY_FRAME (retry inside an orElse#) or ATOMICALLY_FRAME
+ * (retry reached the top level).  CATCH_STM_FRAMEs are skipped; we
+ * should never see CATCH_FRAME or STOP_FRAME inside a transaction.
+ *
+ * Fix: the debug messages previously read "retrry" — corrected to
+ * "retry".
+ */
+StgWord
+findRetryFrameHelper (StgTSO *tso)
+{
+  StgPtr           p, next;
+  StgRetInfoTable *info;
+
+  p = tso -> sp;
+  while (1) {
+    info = get_ret_itbl((StgClosure *)p);
+    next = p + stack_frame_sizeW((StgClosure *)p);
+    switch (info->i.type) {
+
+    case ATOMICALLY_FRAME:
+      IF_DEBUG(stm, debugBelch("Found ATOMICALLY_FRAME at %p during retry\n", p));
+      tso->sp = p;
+      return ATOMICALLY_FRAME;
+
+    case CATCH_RETRY_FRAME:
+      IF_DEBUG(stm, debugBelch("Found CATCH_RETRY_FRAME at %p during retry\n", p));
+      tso->sp = p;
+      return CATCH_RETRY_FRAME;
+
+    case CATCH_STM_FRAME:
+    default:
+      // retries are not exceptions, so a CATCH_FRAME should not occur here
+      ASSERT(info->i.type != CATCH_FRAME);
+      ASSERT(info->i.type != STOP_FRAME);
+      p = next;
+      continue;
+    }
+  }
+}
+
+/* -----------------------------------------------------------------------------
+ resurrectThreads is called after garbage collection on the list of
+ threads found to be garbage. Each of these threads will be woken
+ up and sent a signal: BlockedOnDeadMVar if the thread was blocked
+ on an MVar, or NonTermination if the thread was blocked on a Black
+ Hole.
+
+ Locks: assumes we hold *all* the capabilities.
+ -------------------------------------------------------------------------- */
+
+/* Called after GC with the list of garbage threads: relink each one on
+ * all_threads and deliver an appropriate exception on the Capability it
+ * was last on (BlockedOnDeadMVar, NonTermination or
+ * BlockedIndefinitely, depending on what it was blocked on).
+ * Locks: assumes we hold *all* the capabilities.
+ */
+void
+resurrectThreads (StgTSO *threads)
+{
+    StgTSO *tso, *next;
+    Capability *cap;
+
+    for (tso = threads; tso != END_TSO_QUEUE; tso = next) {
+	next = tso->global_link;
+	tso->global_link = all_threads;
+	all_threads = tso;
+	IF_DEBUG(scheduler, sched_belch("resurrecting thread %d", tso->id));
+
+	// Wake up the thread on the Capability it was last on
+	cap = tso->cap;
+
+	switch (tso->why_blocked) {
+	case BlockedOnMVar:
+	case BlockedOnException:
+	    /* Called by GC - sched_mutex lock is currently held. */
+	    raiseAsync(cap, tso,(StgClosure *)BlockedOnDeadMVar_closure);
+	    break;
+	case BlockedOnBlackHole:
+	    raiseAsync(cap, tso,(StgClosure *)NonTermination_closure);
+	    break;
+	case BlockedOnSTM:
+	    raiseAsync(cap, tso,(StgClosure *)BlockedIndefinitely_closure);
+	    break;
+	case NotBlocked:
+	    /* This might happen if the thread was blocked on a black hole
+	     * belonging to a thread that we've just woken up (raiseAsync
+	     * can wake up threads, remember...).
+	     */
+	    continue;
+	default:
+	    barf("resurrectThreads: thread blocked in a strange way");
+	}
+    }
+}
+
+/* ----------------------------------------------------------------------------
+ * Debugging: why is a thread blocked
+ * [Also provides useful information when debugging threaded programs
+ * at the Haskell source code level, so enable outside of DEBUG. --sof 7/02]
+ ------------------------------------------------------------------------- */
+
+#if DEBUG
+/* Debug: print a one-line description of why 'tso' is blocked.
+ *
+ * Fix: the barf() in the default case passed an StgTSO* for a "%d"
+ * conversion — a printf-style format/argument mismatch (undefined
+ * behaviour, and wrong output on LP64).  It now uses "%p" with a
+ * (void *) cast.
+ */
+static void
+printThreadBlockage(StgTSO *tso)
+{
+  switch (tso->why_blocked) {
+  case BlockedOnRead:
+    debugBelch("is blocked on read from fd %d", (int)(tso->block_info.fd));
+    break;
+  case BlockedOnWrite:
+    debugBelch("is blocked on write to fd %d", (int)(tso->block_info.fd));
+    break;
+#if defined(mingw32_HOST_OS)
+    case BlockedOnDoProc:
+    debugBelch("is blocked on proc (request: %ld)", tso->block_info.async_result->reqID);
+    break;
+#endif
+  case BlockedOnDelay:
+    debugBelch("is blocked until %ld", (long)(tso->block_info.target));
+    break;
+  case BlockedOnMVar:
+    debugBelch("is blocked on an MVar @ %p", tso->block_info.closure);
+    break;
+  case BlockedOnException:
+    debugBelch("is blocked on delivering an exception to thread %d",
+	    tso->block_info.tso->id);
+    break;
+  case BlockedOnBlackHole:
+    debugBelch("is blocked on a black hole");
+    break;
+  case NotBlocked:
+    debugBelch("is not blocked");
+    break;
+#if defined(PARALLEL_HASKELL)
+  case BlockedOnGA:
+    debugBelch("is blocked on global address; local FM_BQ is %p (%s)",
+	    tso->block_info.closure, info_type(tso->block_info.closure));
+    break;
+  case BlockedOnGA_NoSend:
+    debugBelch("is blocked on global address (no send); local FM_BQ is %p (%s)",
+	    tso->block_info.closure, info_type(tso->block_info.closure));
+    break;
+#endif
+  case BlockedOnCCall:
+    debugBelch("is blocked on an external call");
+    break;
+  case BlockedOnCCall_NoUnblockExc:
+    debugBelch("is blocked on an external call (exceptions were already blocked)");
+    break;
+  case BlockedOnSTM:
+    debugBelch("is blocked on an STM operation");
+    break;
+  default:
+    barf("printThreadBlockage: strange tso->why_blocked: %d for TSO %d (%p)",
+	 tso->why_blocked, tso->id, (void *)tso);
+  }
+}
+
+void
+printThreadStatus(StgTSO *t)
+{
+ debugBelch("\tthread %4d @ %p ", t->id, (void *)t);
+ {
+ void *label = lookupThreadLabel(t->id);
+ if (label) debugBelch("[\"%s\"] ",(char *)label);
+ }
+ if (t->what_next == ThreadRelocated) {
+ debugBelch("has been relocated...\n");
+ } else {
+ switch (t->what_next) {
+ case ThreadKilled:
+ debugBelch("has been killed");
+ break;
+ case ThreadComplete:
+ debugBelch("has completed");
+ break;
+ default:
+ printThreadBlockage(t);
+ }
+ debugBelch("\n");
+ }
+}
+
+void
+printAllThreads(void)
+{
+ StgTSO *t, *next;
+ nat i;
+ Capability *cap;
+
+# if defined(GRAN)
+ char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
+ ullong_format_string(TIME_ON_PROC(CurrentProc),
+ time_string, rtsFalse/*no commas!*/);
+
+ debugBelch("all threads at [%s]:\n", time_string);
+# elif defined(PARALLEL_HASKELL)
+ char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
+ ullong_format_string(CURRENT_TIME,
+ time_string, rtsFalse/*no commas!*/);
+
+ debugBelch("all threads at [%s]:\n", time_string);
+# else
+ debugBelch("all threads:\n");
+# endif
+
+ for (i = 0; i < n_capabilities; i++) {
+ cap = &capabilities[i];
+ debugBelch("threads on capability %d:\n", cap->no);
+ for (t = cap->run_queue_hd; t != END_TSO_QUEUE; t = t->link) {
+ printThreadStatus(t);
+ }
+ }
+
+ debugBelch("other threads:\n");
+ for (t = all_threads; t != END_TSO_QUEUE; t = next) {
+ if (t->why_blocked != NotBlocked) {
+ printThreadStatus(t);
+ }
+ if (t->what_next == ThreadRelocated) {
+ next = t->link;
+ } else {
+ next = t->global_link;
+ }
+ }
+}
+
+// useful from gdb
+void
+printThreadQueue(StgTSO *t)
+{
+ nat i = 0;
+ for (; t != END_TSO_QUEUE; t = t->link) {
+ printThreadStatus(t);
+ i++;
+ }
+ debugBelch("%d threads on queue\n", i);
+}
+
+/*
+ Print a whole blocking queue attached to node (debugging only).
+*/
+# if defined(PARALLEL_HASKELL)
+void
+print_bq (StgClosure *node)
+{
+ StgBlockingQueueElement *bqe;
+ StgTSO *tso;
+ rtsBool end;
+
+ debugBelch("## BQ of closure %p (%s): ",
+ node, info_type(node));
+
+ /* should cover all closures that may have a blocking queue */
+ ASSERT(get_itbl(node)->type == BLACKHOLE_BQ ||
+ get_itbl(node)->type == FETCH_ME_BQ ||
+ get_itbl(node)->type == RBH ||
+ get_itbl(node)->type == MVAR);
+
+ ASSERT(node!=(StgClosure*)NULL); // sanity check
+
+ print_bqe(((StgBlockingQueue*)node)->blocking_queue);
+}
+
+/*
+ Print a whole blocking queue starting with the element bqe.
+*/
+void
+print_bqe (StgBlockingQueueElement *bqe)
+{
+ rtsBool end;
+
+ /*
+ NB: In a parallel setup a BQ of an RBH must end with an RBH_Save closure;
+ */
+ for (end = (bqe==END_BQ_QUEUE);
+ !end; // iterate until bqe points to a CONSTR
+ end = (get_itbl(bqe)->type == CONSTR) || (bqe->link==END_BQ_QUEUE),
+ bqe = end ? END_BQ_QUEUE : bqe->link) {
+ ASSERT(bqe != END_BQ_QUEUE); // sanity check
+ ASSERT(bqe != (StgBlockingQueueElement *)NULL); // sanity check
+ /* types of closures that may appear in a blocking queue */
+ ASSERT(get_itbl(bqe)->type == TSO ||
+ get_itbl(bqe)->type == BLOCKED_FETCH ||
+ get_itbl(bqe)->type == CONSTR);
+ /* only BQs of an RBH end with an RBH_Save closure */
+ //ASSERT(get_itbl(bqe)->type != CONSTR || get_itbl(node)->type == RBH);
+
+ switch (get_itbl(bqe)->type) {
+ case TSO:
+ debugBelch(" TSO %u (%x),",
+ ((StgTSO *)bqe)->id, ((StgTSO *)bqe));
+ break;
+ case BLOCKED_FETCH:
+ debugBelch(" BF (node=%p, ga=((%x, %d, %x)),",
+ ((StgBlockedFetch *)bqe)->node,
+ ((StgBlockedFetch *)bqe)->ga.payload.gc.gtid,
+ ((StgBlockedFetch *)bqe)->ga.payload.gc.slot,
+ ((StgBlockedFetch *)bqe)->ga.weight);
+ break;
+ case CONSTR:
+ debugBelch(" %s (IP %p),",
+ (get_itbl(bqe) == &stg_RBH_Save_0_info ? "RBH_Save_0" :
+ get_itbl(bqe) == &stg_RBH_Save_1_info ? "RBH_Save_1" :
+ get_itbl(bqe) == &stg_RBH_Save_2_info ? "RBH_Save_2" :
+ "RBH_Save_?"), get_itbl(bqe));
+ break;
+ default:
+ barf("Unexpected closure type %s in blocking queue", // of %p (%s)",
+ info_type((StgClosure *)bqe)); // , node, info_type(node));
+ break;
+ }
+ } /* for */
+ debugBelch("\n");
+}
+# elif defined(GRAN)
+void
+print_bq (StgClosure *node)
+{
+ StgBlockingQueueElement *bqe;
+ PEs node_loc, tso_loc;
+ rtsBool end;
+
+ /* should cover all closures that may have a blocking queue */
+ ASSERT(get_itbl(node)->type == BLACKHOLE_BQ ||
+ get_itbl(node)->type == FETCH_ME_BQ ||
+ get_itbl(node)->type == RBH);
+
+ ASSERT(node!=(StgClosure*)NULL); // sanity check
+ node_loc = where_is(node);
+
+ debugBelch("## BQ of closure %p (%s) on [PE %d]: ",
+ node, info_type(node), node_loc);
+
+ /*
+ NB: In a parallel setup a BQ of an RBH must end with an RBH_Save closure;
+ */
+ for (bqe = ((StgBlockingQueue*)node)->blocking_queue, end = (bqe==END_BQ_QUEUE);
+ !end; // iterate until bqe points to a CONSTR
+ end = (get_itbl(bqe)->type == CONSTR) || (bqe->link==END_BQ_QUEUE), bqe = end ? END_BQ_QUEUE : bqe->link) {
+ ASSERT(bqe != END_BQ_QUEUE); // sanity check
+ ASSERT(bqe != (StgBlockingQueueElement *)NULL); // sanity check
+ /* types of closures that may appear in a blocking queue */
+ ASSERT(get_itbl(bqe)->type == TSO ||
+ get_itbl(bqe)->type == CONSTR);
+ /* only BQs of an RBH end with an RBH_Save closure */
+ ASSERT(get_itbl(bqe)->type != CONSTR || get_itbl(node)->type == RBH);
+
+ tso_loc = where_is((StgClosure *)bqe);
+ switch (get_itbl(bqe)->type) {
+ case TSO:
+ debugBelch(" TSO %d (%p) on [PE %d],",
+ ((StgTSO *)bqe)->id, (StgTSO *)bqe, tso_loc);
+ break;
+ case CONSTR:
+ debugBelch(" %s (IP %p),",
+ (get_itbl(bqe) == &stg_RBH_Save_0_info ? "RBH_Save_0" :
+ get_itbl(bqe) == &stg_RBH_Save_1_info ? "RBH_Save_1" :
+ get_itbl(bqe) == &stg_RBH_Save_2_info ? "RBH_Save_2" :
+ "RBH_Save_?"), get_itbl(bqe));
+ break;
+ default:
+ barf("Unexpected closure type %s in blocking queue of %p (%s)",
+ info_type((StgClosure *)bqe), node, info_type(node));
+ break;
+ }
+ } /* for */
+ debugBelch("\n");
+}
+# endif
+
+#if defined(PARALLEL_HASKELL)
+static nat
+run_queue_len(void)
+{
+ nat i;
+ StgTSO *tso;
+
+ for (i=0, tso=run_queue_hd;
+ tso != END_TSO_QUEUE;
+ i++, tso=tso->link) {
+ /* nothing */
+ }
+
+ return i;
+}
+#endif
+
+void
+sched_belch(char *s, ...)
+{
+ va_list ap;
+ va_start(ap,s);
+#ifdef THREADED_RTS
+ debugBelch("sched (task %p): ", (void *)(unsigned long)(unsigned int)osThreadId());
+#elif defined(PARALLEL_HASKELL)
+ debugBelch("== ");
+#else
+ debugBelch("sched: ");
+#endif
+ vdebugBelch(s, ap);
+ debugBelch("\n");
+ va_end(ap);
+}
+
+#endif /* DEBUG */
diff --git a/rts/Schedule.h b/rts/Schedule.h
new file mode 100644
index 0000000000..37b07941f4
--- /dev/null
+++ b/rts/Schedule.h
@@ -0,0 +1,332 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2005
+ *
+ * Prototypes for functions in Schedule.c
+ * (RTS internal scheduler interface)
+ *
+ * -------------------------------------------------------------------------*/
+
+#ifndef SCHEDULE_H
+#define SCHEDULE_H
+
+#include "OSThreads.h"
+#include "Capability.h"
+
+/* initScheduler(), exitScheduler()
+ * Called from STG : no
+ * Locks assumed : none
+ */
+void initScheduler (void);
+void exitScheduler (void);
+
+// Place a new thread on the run queue of the current Capability
+void scheduleThread (Capability *cap, StgTSO *tso);
+
+// Place a new thread on the run queue of a specified Capability
+// (cap is the currently owned Capability, cpu is the number of
+// the desired Capability).
+void scheduleThreadOn(Capability *cap, StgWord cpu, StgTSO *tso);
+
+/* awakenBlockedQueue()
+ *
+ * Takes a pointer to the beginning of a blocked TSO queue, and
+ * wakes up the entire queue.
+ * Called from STG : yes
+ * Locks assumed : none
+ */
+#if defined(GRAN)
+void awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node);
+#elif defined(PAR)
+void awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node);
+#else
+void awakenBlockedQueue (Capability *cap, StgTSO *tso);
+#endif
+
+/* unblockOne()
+ *
+ * Put the specified thread on the run queue of the given Capability.
+ * Called from STG : yes
+ * Locks assumed : we own the Capability.
+ */
+StgTSO * unblockOne(Capability *cap, StgTSO *tso);
+
+/* raiseAsync()
+ *
+ * Raises an exception asynchronously in the specified thread.
+ *
+ * Called from STG : yes
+ * Locks assumed : none
+ */
+void raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception);
+
+/* suspendComputation()
+ *
+ * A variant of raiseAsync(), this strips the stack of the specified
+ * thread down to the stop_here point, leaving a current closure on
+ * top of the stack at [stop_here - 1].
+ */
+void suspendComputation(Capability *cap, StgTSO *tso, StgPtr stop_here);
+
+/* raiseExceptionHelper */
+StgWord raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception);
+
+/* findRetryFrameHelper */
+StgWord findRetryFrameHelper (StgTSO *tso);
+
+/* GetRoots(evac_fn f)
+ *
+ * Call f() for each root known to the scheduler.
+ *
+ * Called from STG : NO
+ * Locks assumed : ????
+ */
+void GetRoots(evac_fn);
+
+/* workerStart()
+ *
+ * Entry point for a new worker task.
+ * Called from STG : NO
+ * Locks assumed : none
+ */
+void workerStart(Task *task);
+
+#if defined(GRAN)
+void awaken_blocked_queue(StgBlockingQueueElement *q, StgClosure *node);
+void unlink_from_bq(StgTSO* tso, StgClosure* node);
+void initThread(StgTSO *tso, nat stack_size, StgInt pri);
+#elif defined(PAR)
+nat run_queue_len(void);
+void awaken_blocked_queue(StgBlockingQueueElement *q, StgClosure *node);
+void initThread(StgTSO *tso, nat stack_size);
+#else
+char *info_type(StgClosure *closure); // dummy
+char *info_type_by_ip(StgInfoTable *ip); // dummy
+void awaken_blocked_queue(StgTSO *q);
+void initThread(StgTSO *tso, nat stack_size);
+#endif
+
+/* Context switch flag.
+ * Locks required : none (conflicts are harmless)
+ */
+extern int RTS_VAR(context_switch);
+
+/* The state of the scheduler. This is used to control the sequence
+ * of events during shutdown, and when the runtime is interrupted
+ * using ^C.
+ */
+#define SCHED_RUNNING 0 /* running as normal */
+#define SCHED_INTERRUPTING 1 /* ^C detected, before threads are deleted */
+#define SCHED_INTERRUPTED 2 /* ^C detected, after threads deleted */
+#define SCHED_SHUTTING_DOWN 3 /* final shutdown */
+
+extern rtsBool RTS_VAR(sched_state);
+
+/*
+ * flag that tracks whether we have done any execution in this time slice.
+ */
+#define ACTIVITY_YES 0 /* there has been activity in the current slice */
+#define ACTIVITY_MAYBE_NO 1 /* no activity in the current slice */
+#define ACTIVITY_INACTIVE 2 /* a complete slice has passed with no activity */
+#define ACTIVITY_DONE_GC 3 /* like 2, but we've done a GC too */
+
+/* Recent activity flag.
+ * Locks required : Transition from MAYBE_NO to INACTIVE
+ * happens in the timer signal, so it is atomic. Transition from
+ * INACTIVE to DONE_GC happens under sched_mutex. No lock required
+ * to set it to ACTIVITY_YES.
+ */
+extern nat recent_activity;
+
+/* Thread queues.
+ * Locks required : sched_mutex
+ *
+ * In GranSim we have one run/blocked_queue per PE.
+ */
+#if defined(GRAN)
+// run_queue_hds defined in GranSim.h
+#else
+extern StgTSO *RTS_VAR(blackhole_queue);
+#if !defined(THREADED_RTS)
+extern StgTSO *RTS_VAR(blocked_queue_hd), *RTS_VAR(blocked_queue_tl);
+extern StgTSO *RTS_VAR(sleeping_queue);
+#endif
+#endif
+
+/* Linked list of all threads.
+ * Locks required : sched_mutex
+ */
+extern StgTSO *RTS_VAR(all_threads);
+
+/* Set to rtsTrue if there are threads on the blackhole_queue, and
+ * it is possible that one or more of them may be available to run.
+ * This flag is set to rtsFalse after we've checked the queue, and
+ * set to rtsTrue just before we run some Haskell code. It is used
+ * to decide whether we should yield the Capability or not.
+ * Locks required : none (see scheduleCheckBlackHoles()).
+ */
+extern rtsBool blackholes_need_checking;
+
+#if defined(THREADED_RTS)
+extern Mutex RTS_VAR(sched_mutex);
+#endif
+
+StgBool isThreadBound(StgTSO *tso);
+
+SchedulerStatus rts_mainLazyIO(HaskellObj p, /*out*/HaskellObj *ret);
+
+/* Called by shutdown_handler(). */
+void interruptStgRts (void);
+
+nat run_queue_len (void);
+
+void resurrectThreads (StgTSO *);
+
+void printAllThreads(void);
+
+/* debugging only
+ */
+#ifdef DEBUG
+void print_bq (StgClosure *node);
+#endif
+#if defined(PAR)
+void print_bqe (StgBlockingQueueElement *bqe);
+#endif
+
+void labelThread(StgPtr tso, char *label);
+
+/* -----------------------------------------------------------------------------
+ * Some convenient macros/inline functions...
+ */
+
+#if !IN_STG_CODE
+
+/* END_TSO_QUEUE and friends now defined in includes/StgMiscClosures.h */
+
+/* Add a thread to the end of the run queue.
+ * NOTE: tso->link should be END_TSO_QUEUE before calling this macro.
+ * ASSUMES: cap->running_task is the current task.
+ */
+STATIC_INLINE void
+appendToRunQueue (Capability *cap, StgTSO *tso)
+{
+ ASSERT(tso->link == END_TSO_QUEUE);
+ if (cap->run_queue_hd == END_TSO_QUEUE) {
+ cap->run_queue_hd = tso;
+ } else {
+ cap->run_queue_tl->link = tso;
+ }
+ cap->run_queue_tl = tso;
+}
+
+/* Push a thread on the beginning of the run queue. Used for
+ * newly awakened threads, so they get run as soon as possible.
+ * ASSUMES: cap->running_task is the current task.
+ */
+STATIC_INLINE void
+pushOnRunQueue (Capability *cap, StgTSO *tso)
+{
+ tso->link = cap->run_queue_hd;
+ cap->run_queue_hd = tso;
+ if (cap->run_queue_tl == END_TSO_QUEUE) {
+ cap->run_queue_tl = tso;
+ }
+}
+
+/* Pop the first thread off the runnable queue.
+ */
+STATIC_INLINE StgTSO *
+popRunQueue (Capability *cap)
+{
+ StgTSO *t = cap->run_queue_hd;
+ ASSERT(t != END_TSO_QUEUE);
+ cap->run_queue_hd = t->link;
+ t->link = END_TSO_QUEUE;
+ if (cap->run_queue_hd == END_TSO_QUEUE) {
+ cap->run_queue_tl = END_TSO_QUEUE;
+ }
+ return t;
+}
+
+/* Add a thread to the end of the blocked queue.
+ */
+#if !defined(THREADED_RTS)
+STATIC_INLINE void
+appendToBlockedQueue(StgTSO *tso)
+{
+ ASSERT(tso->link == END_TSO_QUEUE);
+ if (blocked_queue_hd == END_TSO_QUEUE) {
+ blocked_queue_hd = tso;
+ } else {
+ blocked_queue_tl->link = tso;
+ }
+ blocked_queue_tl = tso;
+}
+#endif
+
+#if defined(THREADED_RTS)
+STATIC_INLINE void
+appendToWakeupQueue (Capability *cap, StgTSO *tso)
+{
+ ASSERT(tso->link == END_TSO_QUEUE);
+ if (cap->wakeup_queue_hd == END_TSO_QUEUE) {
+ cap->wakeup_queue_hd = tso;
+ } else {
+ cap->wakeup_queue_tl->link = tso;
+ }
+ cap->wakeup_queue_tl = tso;
+}
+#endif
+
+/* Check whether various thread queues are empty
+ */
+STATIC_INLINE rtsBool
+emptyQueue (StgTSO *q)
+{
+ return (q == END_TSO_QUEUE);
+}
+
+STATIC_INLINE rtsBool
+emptyRunQueue(Capability *cap)
+{
+ return emptyQueue(cap->run_queue_hd);
+}
+
+#if defined(THREADED_RTS)
+STATIC_INLINE rtsBool
+emptyWakeupQueue(Capability *cap)
+{
+ return emptyQueue(cap->wakeup_queue_hd);
+}
+#endif
+
+#if !defined(THREADED_RTS)
+#define EMPTY_BLOCKED_QUEUE() (emptyQueue(blocked_queue_hd))
+#define EMPTY_SLEEPING_QUEUE() (emptyQueue(sleeping_queue))
+#endif
+
+STATIC_INLINE rtsBool
+emptyThreadQueues(Capability *cap)
+{
+ return emptyRunQueue(cap)
+#if !defined(THREADED_RTS)
+ && EMPTY_BLOCKED_QUEUE() && EMPTY_SLEEPING_QUEUE()
+#endif
+ ;
+}
+
+#ifdef DEBUG
+void sched_belch(char *s, ...)
+ GNU_ATTRIBUTE(format (printf, 1, 2));
+#endif
+
+#endif /* !IN_STG_CODE */
+
+STATIC_INLINE void
+dirtyTSO (StgTSO *tso)
+{
+ tso->flags |= TSO_DIRTY;
+}
+
+#endif /* SCHEDULE_H */
+
diff --git a/rts/Sparks.c b/rts/Sparks.c
new file mode 100644
index 0000000000..615d832e33
--- /dev/null
+++ b/rts/Sparks.c
@@ -0,0 +1,881 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2000-2006
+ *
+ * Sparking support for PARALLEL_HASKELL and THREADED_RTS versions of the RTS.
+ *
+ * -------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "Schedule.h"
+#include "SchedAPI.h"
+#include "Storage.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "ParTicky.h"
+# if defined(PARALLEL_HASKELL)
+# include "ParallelRts.h"
+# include "GranSimRts.h" // for GR_...
+# elif defined(GRAN)
+# include "GranSimRts.h"
+# endif
+#include "Sparks.h"
+
+#if defined(THREADED_RTS) || defined(PARALLEL_HASKELL)
+
+static INLINE_ME void bump_hd (StgSparkPool *p)
+{ p->hd++; if (p->hd == p->lim) p->hd = p->base; }
+
+static INLINE_ME void bump_tl (StgSparkPool *p)
+{ p->tl++; if (p->tl == p->lim) p->tl = p->base; }
+
+/* -----------------------------------------------------------------------------
+ *
+ * Initialising spark pools.
+ *
+ * -------------------------------------------------------------------------- */
+
+static void
+initSparkPool(StgSparkPool *pool)
+{
+ pool->base = stgMallocBytes(RtsFlags.ParFlags.maxLocalSparks
+ * sizeof(StgClosure *),
+ "initSparkPools");
+ pool->lim = pool->base + RtsFlags.ParFlags.maxLocalSparks;
+ pool->hd = pool->base;
+ pool->tl = pool->base;
+}
+
+void
+initSparkPools( void )
+{
+#ifdef THREADED_RTS
+ /* walk over the capabilities, allocating a spark pool for each one */
+ nat i;
+ for (i = 0; i < n_capabilities; i++) {
+ initSparkPool(&capabilities[i].r.rSparks);
+ }
+#else
+ /* allocate a single spark pool */
+ initSparkPool(&MainCapability.r.rSparks);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ *
+ * findSpark: find a spark on the current Capability that we can fork
+ * into a thread.
+ *
+ * -------------------------------------------------------------------------- */
+
+StgClosure *
+findSpark (Capability *cap)
+{
+ StgSparkPool *pool;
+ StgClosure *spark;
+
+ pool = &(cap->r.rSparks);
+ ASSERT_SPARK_POOL_INVARIANTS(pool);
+
+ while (pool->hd != pool->tl) {
+ spark = *pool->hd;
+ bump_hd(pool);
+ if (closure_SHOULD_SPARK(spark)) {
+#ifdef GRAN
+ if (RtsFlags.ParFlags.ParStats.Sparks)
+ DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
+ GR_STEALING, ((StgTSO *)NULL), spark,
+ 0, 0 /* spark_queue_len(ADVISORY_POOL) */);
+#endif
+ return spark;
+ }
+ }
+ // spark pool is now empty
+ return NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * Mark all nodes pointed to by sparks in the spark queues (for GC) Does an
+ * implicit slide i.e. after marking all sparks are at the beginning of the
+ * spark pool and the spark pool only contains sparkable closures
+ * -------------------------------------------------------------------------- */
+
+void
+markSparkQueue (evac_fn evac)
+{
+ StgClosure **sparkp, **to_sparkp;
+ nat i, n, pruned_sparks; // stats only
+ StgSparkPool *pool;
+ Capability *cap;
+
+ PAR_TICKY_MARK_SPARK_QUEUE_START();
+
+ n = 0;
+ pruned_sparks = 0;
+ for (i = 0; i < n_capabilities; i++) {
+ cap = &capabilities[i];
+ pool = &(cap->r.rSparks);
+
+ ASSERT_SPARK_POOL_INVARIANTS(pool);
+
+#if defined(PARALLEL_HASKELL)
+ // stats only
+ n = 0;
+ pruned_sparks = 0;
+#endif
+
+ sparkp = pool->hd;
+ to_sparkp = pool->hd;
+ while (sparkp != pool->tl) {
+ ASSERT(to_sparkp<=sparkp);
+ ASSERT(*sparkp!=NULL);
+ ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgClosure *)*sparkp)));
+ // ToDo?: statistics gathering here (also for GUM!)
+ if (closure_SHOULD_SPARK(*sparkp)) {
+ evac(sparkp);
+ *to_sparkp++ = *sparkp;
+ n++;
+ } else {
+ pruned_sparks++;
+ }
+ sparkp++;
+ if (sparkp == pool->lim) {
+ sparkp = pool->base;
+ }
+ }
+ pool->tl = to_sparkp;
+
+ PAR_TICKY_MARK_SPARK_QUEUE_END(n);
+
+#if defined(PARALLEL_HASKELL)
+ IF_DEBUG(scheduler,
+ debugBelch("markSparkQueue: marked %d sparks and pruned %d sparks on [%x]",
+ n, pruned_sparks, mytid));
+#else
+ IF_DEBUG(scheduler,
+ debugBelch("markSparkQueue: marked %d sparks and pruned %d sparks\n",
+ n, pruned_sparks));
+#endif
+
+ IF_DEBUG(scheduler,
+ debugBelch("markSparkQueue: new spark queue len=%d; (hd=%p; tl=%p)\n",
+ sparkPoolSize(pool), pool->hd, pool->tl));
+
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ *
+ * Turn a spark into a real thread
+ *
+ * -------------------------------------------------------------------------- */
+
+void
+createSparkThread (Capability *cap, StgClosure *p)
+{
+ StgTSO *tso;
+
+ tso = createGenThread (cap, RtsFlags.GcFlags.initialStkSize, p);
+ appendToRunQueue(cap,tso);
+}
+
+/* -----------------------------------------------------------------------------
+ *
+ * Create a new spark
+ *
+ * -------------------------------------------------------------------------- */
+
+#define DISCARD_NEW
+
+StgInt
+newSpark (StgRegTable *reg, StgClosure *p)
+{
+ StgSparkPool *pool = &(reg->rSparks);
+
+ ASSERT_SPARK_POOL_INVARIANTS(pool);
+
+ if (closure_SHOULD_SPARK(p)) {
+#ifdef DISCARD_NEW
+ StgClosure **new_tl;
+ new_tl = pool->tl + 1;
+ if (new_tl == pool->lim) { new_tl = pool->base; }
+ if (new_tl != pool->hd) {
+ *pool->tl = p;
+ pool->tl = new_tl;
+ } else if (!closure_SHOULD_SPARK(*pool->hd)) {
+ // if the old closure is not sparkable, discard it and
+ // keep the new one. Otherwise, keep the old one.
+ *pool->tl = p;
+ bump_hd(pool);
+ }
+#else /* DISCARD OLD */
+ *pool->tl = p;
+ bump_tl(pool);
+ if (pool->tl == pool->hd) { bump_hd(pool); }
+#endif
+ }
+
+ ASSERT_SPARK_POOL_INVARIANTS(pool);
+ return 1;
+}
+
+#else
+
+StgInt
+newSpark (StgRegTable *reg STG_UNUSED, StgClosure *p STG_UNUSED)
+{
+ /* nothing */
+ return 1;
+}
+
+#endif /* PARALLEL_HASKELL || THREADED_RTS */
+
+
+/* -----------------------------------------------------------------------------
+ *
+ * GRAN & PARALLEL_HASKELL stuff beyond here.
+ *
+ * -------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL) || defined(GRAN)
+
+static void slide_spark_pool( StgSparkPool *pool );
+
+rtsBool
+add_to_spark_queue( StgClosure *closure, StgSparkPool *pool )
+{
+ if (pool->tl == pool->lim)
+ slide_spark_pool(pool);
+
+ if (closure_SHOULD_SPARK(closure) &&
+ pool->tl < pool->lim) {
+ *(pool->tl++) = closure;
+
+#if defined(PARALLEL_HASKELL)
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ // debugBelch("Creating spark for %x @ %11.2f\n", closure, usertime());
+ globalParStats.tot_sparks_created++;
+ }
+#endif
+ return rtsTrue;
+ } else {
+#if defined(PARALLEL_HASKELL)
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ //debugBelch("Ignoring spark for %x @ %11.2f\n", closure, usertime());
+ globalParStats.tot_sparks_ignored++;
+ }
+#endif
+ return rtsFalse;
+ }
+}
+
+static void
+slide_spark_pool( StgSparkPool *pool )
+{
+ StgClosure **sparkp, **to_sparkp;
+
+ sparkp = pool->hd;
+ to_sparkp = pool->base;
+ while (sparkp < pool->tl) {
+ ASSERT(to_sparkp<=sparkp);
+ ASSERT(*sparkp!=NULL);
+ ASSERT(LOOKS_LIKE_GHC_INFO((*sparkp)->header.info));
+
+ if (closure_SHOULD_SPARK(*sparkp)) {
+ *to_sparkp++ = *sparkp++;
+ } else {
+ sparkp++;
+ }
+ }
+ pool->hd = pool->base;
+ pool->tl = to_sparkp;
+}
+
+void
+disposeSpark(spark)
+StgClosure *spark;
+{
+#if !defined(THREADED_RTS)
+ Capability *cap;
+ StgSparkPool *pool;
+
+ cap = &MainRegTable;
+ pool = &(cap->rSparks);
+ ASSERT(pool->hd <= pool->tl && pool->tl <= pool->lim);
+#endif
+ ASSERT(spark != (StgClosure *)NULL);
+ /* Do nothing */
+}
+
+
+#elif defined(GRAN)
+
+/*
+ Search the spark queue of the proc in event for a spark that's worth
+ turning into a thread
+ (was gimme_spark in the old RTS)
+*/
+void
+findLocalSpark (rtsEvent *event, rtsBool *found_res, rtsSparkQ *spark_res)
+{
+ PEs proc = event->proc, /* proc to search for work */
+ creator = event->creator; /* proc that requested work */
+ StgClosure* node;
+ rtsBool found;
+ rtsSparkQ spark_of_non_local_node = NULL,
+ spark_of_non_local_node_prev = NULL,
+ low_priority_spark = NULL,
+ low_priority_spark_prev = NULL,
+ spark = NULL, prev = NULL;
+
+ /* Choose a spark from the local spark queue */
+ prev = (rtsSpark*)NULL;
+ spark = pending_sparks_hds[proc];
+ found = rtsFalse;
+
+ // ToDo: check this code & implement local sparking !! -- HWL
+ while (!found && spark != (rtsSpark*)NULL)
+ {
+ ASSERT((prev!=(rtsSpark*)NULL || spark==pending_sparks_hds[proc]) &&
+ (prev==(rtsSpark*)NULL || prev->next==spark) &&
+ (spark->prev==prev));
+ node = spark->node;
+ if (!closure_SHOULD_SPARK(node))
+ {
+ IF_GRAN_DEBUG(checkSparkQ,
+ debugBelch("^^ pruning spark %p (node %p) in gimme_spark",
+ spark, node));
+
+ if (RtsFlags.GranFlags.GranSimStats.Sparks)
+ DumpRawGranEvent(proc, (PEs)0, SP_PRUNED,(StgTSO*)NULL,
+ spark->node, spark->name, spark_queue_len(proc));
+
+ ASSERT(spark != (rtsSpark*)NULL);
+ ASSERT(SparksAvail>0);
+ --SparksAvail;
+
+ ASSERT(prev==(rtsSpark*)NULL || prev->next==spark);
+ spark = delete_from_sparkq (spark, proc, rtsTrue);
+ if (spark != (rtsSpark*)NULL)
+ prev = spark->prev;
+ continue;
+ }
+ /* -- node should eventually be sparked */
+ else if (RtsFlags.GranFlags.PreferSparksOfLocalNodes &&
+ !IS_LOCAL_TO(PROCS(node),CurrentProc))
+ {
+ barf("Local sparking not yet implemented");
+
+ /* Remember first low priority spark */
+ if (spark_of_non_local_node==(rtsSpark*)NULL) {
+ spark_of_non_local_node_prev = prev;
+ spark_of_non_local_node = spark;
+ }
+
+ if (spark->next == (rtsSpark*)NULL) {
+ /* ASSERT(spark==SparkQueueTl); just for testing */
+ prev = spark_of_non_local_node_prev;
+ spark = spark_of_non_local_node;
+ found = rtsTrue;
+ break;
+ }
+
+# if defined(GRAN) && defined(GRAN_CHECK)
+ /* Should never happen; just for testing
+ if (spark==pending_sparks_tl) {
+ debugBelch("ReSchedule: Last spark != SparkQueueTl\n");
+ stg_exit(EXIT_FAILURE);
+ } */
+# endif
+ prev = spark;
+ spark = spark->next;
+ ASSERT(SparksAvail>0);
+ --SparksAvail;
+ continue;
+ }
+ else if ( RtsFlags.GranFlags.DoPrioritySparking ||
+ (spark->gran_info >= RtsFlags.GranFlags.SparkPriority2) )
+ {
+ if (RtsFlags.GranFlags.DoPrioritySparking)
+ barf("Priority sparking not yet implemented");
+
+ found = rtsTrue;
+ }
+#if 0
+ else /* only used if SparkPriority2 is defined */
+ {
+ /* ToDo: fix the code below and re-integrate it */
+ /* Remember first low priority spark */
+ if (low_priority_spark==(rtsSpark*)NULL) {
+ low_priority_spark_prev = prev;
+ low_priority_spark = spark;
+ }
+
+ if (spark->next == (rtsSpark*)NULL) {
+ /* ASSERT(spark==spark_queue_tl); just for testing */
+ prev = low_priority_spark_prev;
+ spark = low_priority_spark;
+ found = rtsTrue; /* take low pri spark => rc is 2 */
+ break;
+ }
+
+ /* Should never happen; just for testing
+ if (spark==pending_sparks_tl) {
+ debugBelch("ReSchedule: Last spark != SparkQueueTl\n");
+ stg_exit(EXIT_FAILURE);
+ break;
+ } */
+ prev = spark;
+ spark = spark->next;
+
+ IF_GRAN_DEBUG(pri,
+ debugBelch("++ Ignoring spark of priority %u (SparkPriority=%u); node=%p; name=%u\n",
+ spark->gran_info, RtsFlags.GranFlags.SparkPriority,
+ spark->node, spark->name);)
+ }
+#endif
+ } /* while (spark!=NULL && !found) */
+
+ *spark_res = spark;
+ *found_res = found;
+}
+
+/*
+ Turn the spark into a thread.
+ In GranSim this basically means scheduling a StartThread event for the
+ node pointed to by the spark at some point in the future.
+ (was munch_spark in the old RTS)
+*/
+rtsBool
+activateSpark (rtsEvent *event, rtsSparkQ spark)
+{
+ PEs proc = event->proc, /* proc to search for work */
+ creator = event->creator; /* proc that requested work */
+ StgTSO* tso;
+ StgClosure* node;
+ rtsTime spark_arrival_time;
+
+ /*
+ We've found a node on PE proc requested by PE creator.
+ If proc==creator we can turn the spark into a thread immediately;
+ otherwise we schedule a MoveSpark event on the requesting PE
+ */
+
+ /* DaH Qu' yIchen */
+ if (proc!=creator) {
+
+ /* only possible if we simulate GUM style fishing */
+ ASSERT(RtsFlags.GranFlags.Fishing);
+
+ /* Message packing costs for sending a Fish; qeq jabbI'ID */
+ CurrentTime[proc] += RtsFlags.GranFlags.Costs.mpacktime;
+
+ if (RtsFlags.GranFlags.GranSimStats.Sparks)
+ DumpRawGranEvent(proc, (PEs)0, SP_EXPORTED,
+ (StgTSO*)NULL, spark->node,
+ spark->name, spark_queue_len(proc));
+
+ /* time of the spark arrival on the remote PE */
+ spark_arrival_time = CurrentTime[proc] + RtsFlags.GranFlags.Costs.latency;
+
+ new_event(creator, proc, spark_arrival_time,
+ MoveSpark,
+ (StgTSO*)NULL, spark->node, spark);
+
+ CurrentTime[proc] += RtsFlags.GranFlags.Costs.mtidytime;
+
+ } else { /* proc==creator i.e. turn the spark into a thread */
+
+ if ( RtsFlags.GranFlags.GranSimStats.Global &&
+ spark->gran_info < RtsFlags.GranFlags.SparkPriority2 ) {
+
+ globalGranStats.tot_low_pri_sparks++;
+ IF_GRAN_DEBUG(pri,
+ debugBelch("++ No high priority spark available; low priority (%u) spark chosen: node=%p; name=%u\n",
+ spark->gran_info,
+ spark->node, spark->name));
+ }
+
+ CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadcreatetime;
+
+ node = spark->node;
+
+# if 0
+ /* ToDo: fix the GC interface and move to StartThread handling-- HWL */
+ if (GARBAGE COLLECTION IS NECESSARY) {
+ /* Some kind of backoff needed here in case there's too little heap */
+# if defined(GRAN_CHECK) && defined(GRAN)
+ if (RtsFlags.GcFlags.giveStats)
+ fprintf(RtsFlags.GcFlags.statsFile,"***** vIS Qu' chen veQ boSwI'; spark=%p, node=%p; name=%u\n",
+ /* (found==2 ? "no hi pri spark" : "hi pri spark"), */
+ spark, node, spark->name);
+# endif
+ new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc]+1,
+ FindWork,
+ (StgTSO*)NULL, (StgClosure*)NULL, (rtsSpark*)NULL);
+ barf("//// activateSpark: out of heap ; ToDo: call GarbageCollect()");
+ GarbageCollect(GetRoots, rtsFalse);
+ // HWL old: ReallyPerformThreadGC(TSO_HS+TSO_CTS_SIZE,rtsFalse);
+ // HWL old: SAVE_Hp -= TSO_HS+TSO_CTS_SIZE;
+ spark = NULL;
+ return; /* was: continue; */ /* to the next event, eventually */
+ }
+# endif
+
+ if (RtsFlags.GranFlags.GranSimStats.Sparks)
+ DumpRawGranEvent(CurrentProc,(PEs)0,SP_USED,(StgTSO*)NULL,
+ spark->node, spark->name,
+ spark_queue_len(CurrentProc));
+
+ new_event(proc, proc, CurrentTime[proc],
+ StartThread,
+ END_TSO_QUEUE, node, spark); // (rtsSpark*)NULL);
+
+ procStatus[proc] = Starting;
+ }
+}
+
+/* -------------------------------------------------------------------------
+ This is the main point where handling granularity information comes into
+ play.
+ ------------------------------------------------------------------------- */
+
+#define MAX_RAND_PRI 100
+
+/*
+ Granularity info transformers.
+ Applied to the GRAN_INFO field of a spark.
+*/
+STATIC_INLINE nat ID(nat x) { return(x); };
+STATIC_INLINE nat INV(nat x) { return(-x); };
+STATIC_INLINE nat IGNORE(nat x) { return (0); };
+STATIC_INLINE nat RAND(nat x) { return ((random() % MAX_RAND_PRI) + 1); }
+
+/* NB: size_info and par_info are currently unused (what a shame!) -- HWL */
+rtsSpark *
+newSpark(node,name,gran_info,size_info,par_info,local)
+StgClosure *node;
+nat name, gran_info, size_info, par_info, local;
+{
+ nat pri;
+ rtsSpark *newspark;
+
+ pri = RtsFlags.GranFlags.RandomPriorities ? RAND(gran_info) :
+ RtsFlags.GranFlags.InversePriorities ? INV(gran_info) :
+ RtsFlags.GranFlags.IgnorePriorities ? IGNORE(gran_info) :
+ ID(gran_info);
+
+ if ( RtsFlags.GranFlags.SparkPriority!=0 &&
+ pri<RtsFlags.GranFlags.SparkPriority ) {
+ IF_GRAN_DEBUG(pri,
+ debugBelch(",, NewSpark: Ignoring spark of priority %u (SparkPriority=%u); node=%#x; name=%u\n",
+ pri, RtsFlags.GranFlags.SparkPriority, node, name));
+ return ((rtsSpark*)NULL);
+ }
+
+ newspark = (rtsSpark*) stgMallocBytes(sizeof(rtsSpark), "NewSpark");
+ newspark->prev = newspark->next = (rtsSpark*)NULL;
+ newspark->node = node;
+ newspark->name = (name==1) ? CurrentTSO->gran.sparkname : name;
+ newspark->gran_info = pri;
+ newspark->global = !local; /* Check that with parAt, parAtAbs !!*/
+
+ if (RtsFlags.GranFlags.GranSimStats.Global) {
+ globalGranStats.tot_sparks_created++;
+ globalGranStats.sparks_created_on_PE[CurrentProc]++;
+ }
+
+ return(newspark);
+}
+
+void
+disposeSpark(spark)
+rtsSpark *spark;
+{
+ ASSERT(spark!=NULL);
+ stgFree(spark);
+}
+
+void
+disposeSparkQ(spark)
+rtsSparkQ spark;
+{
+ if (spark==NULL)
+ return;
+
+ disposeSparkQ(spark->next);
+
+# ifdef GRAN_CHECK
+ if (SparksAvail < 0) {
+ debugBelch("disposeSparkQ: SparksAvail<0 after disposing sparkq @ %p\n", &spark);
+ print_spark(spark);
+ }
+# endif
+
+ stgFree(spark);
+}
+
+/*
+ With PrioritySparking add_to_spark_queue performs an insert sort to keep
+ the spark queue sorted. Otherwise the spark is just added to the end of
+ the queue.
+*/
+
+void
+add_to_spark_queue(spark)
+rtsSpark *spark;
+{
+ rtsSpark *prev = NULL, *next = NULL;
+ nat count = 0;
+ rtsBool found = rtsFalse;
+
+ if ( spark == (rtsSpark *)NULL ) {
+ return;
+ }
+
+ if (RtsFlags.GranFlags.DoPrioritySparking && (spark->gran_info != 0) ) {
+ /* Priority sparking is enabled i.e. spark queues must be sorted */
+
+ for (prev = NULL, next = pending_sparks_hd, count=0;
+ (next != NULL) &&
+ !(found = (spark->gran_info >= next->gran_info));
+ prev = next, next = next->next, count++)
+ {}
+
+ } else { /* 'utQo' */
+ /* Priority sparking is disabled */
+
+ found = rtsFalse; /* to add it at the end */
+
+ }
+
+ if (found) {
+ /* next points to the first spark with a gran_info smaller than that
+ of spark; therefore, add spark before next into the spark queue */
+ spark->next = next;
+ if ( next == NULL ) {
+ pending_sparks_tl = spark;
+ } else {
+ next->prev = spark;
+ }
+ spark->prev = prev;
+ if ( prev == NULL ) {
+ pending_sparks_hd = spark;
+ } else {
+ prev->next = spark;
+ }
+ } else { /* (RtsFlags.GranFlags.DoPrioritySparking && !found) || !DoPrioritySparking */
+ /* add the spark at the end of the spark queue */
+ spark->next = NULL;
+ spark->prev = pending_sparks_tl;
+ if (pending_sparks_hd == NULL)
+ pending_sparks_hd = spark;
+ else
+ pending_sparks_tl->next = spark;
+ pending_sparks_tl = spark;
+ }
+ ++SparksAvail;
+
+ /* add costs for search in priority sparking */
+ if (RtsFlags.GranFlags.DoPrioritySparking) {
+ CurrentTime[CurrentProc] += count * RtsFlags.GranFlags.Costs.pri_spark_overhead;
+ }
+
+ IF_GRAN_DEBUG(checkSparkQ,
+ debugBelch("++ Spark stats after adding spark %p (node %p) to queue on PE %d",
+ spark, spark->node, CurrentProc);
+ print_sparkq_stats());
+
+# if defined(GRAN_CHECK)
+ if (RtsFlags.GranFlags.Debug.checkSparkQ) {
+ for (prev = NULL, next = pending_sparks_hd;
+ (next != NULL);
+ prev = next, next = next->next)
+ {}
+ if ( (prev!=NULL) && (prev!=pending_sparks_tl) )
+ debugBelch("SparkQ inconsistency after adding spark %p: (PE %u) pending_sparks_tl (%p) not end of queue (%p)\n",
+ spark,CurrentProc,
+ pending_sparks_tl, prev);
+ }
+# endif
+
+# if defined(GRAN_CHECK)
+ /* Check if the sparkq is still sorted. Just for testing, really! */
+ if ( RtsFlags.GranFlags.Debug.checkSparkQ &&
+ RtsFlags.GranFlags.Debug.pri ) {
+ rtsBool sorted = rtsTrue;
+ rtsSpark *prev, *next;
+
+ if (pending_sparks_hd == NULL ||
+ pending_sparks_hd->next == NULL ) {
+ /* just 1 elem => ok */
+ } else {
+ for (prev = pending_sparks_hd,
+ next = pending_sparks_hd->next;
+ (next != NULL) ;
+ prev = next, next = next->next) {
+ sorted = sorted &&
+ (prev->gran_info >= next->gran_info);
+ }
+ }
+ if (!sorted) {
+ debugBelch("ghuH: SPARKQ on PE %d is not sorted:\n",
+ CurrentProc);
+ print_sparkq(CurrentProc);
+ }
+ }
+# endif
+}
+
+nat
+spark_queue_len(proc)
+PEs proc;
+{
+ rtsSpark *prev, *spark; /* prev only for testing !! */
+ nat len;
+
+ for (len = 0, prev = NULL, spark = pending_sparks_hds[proc];
+ spark != NULL;
+ len++, prev = spark, spark = spark->next)
+ {}
+
+# if defined(GRAN_CHECK)
+ if ( RtsFlags.GranFlags.Debug.checkSparkQ )
+ if ( (prev!=NULL) && (prev!=pending_sparks_tls[proc]) )
+ debugBelch("ERROR in spark_queue_len: (PE %u) pending_sparks_tl (%p) not end of queue (%p)\n",
+ proc, pending_sparks_tls[proc], prev);
+# endif
+
+ return (len);
+}
+
+/*
+ Take spark out of the spark queue on PE p and nuke the spark. Adjusts
+ hd and tl pointers of the spark queue. Returns a pointer to the next
+ spark in the queue.
+*/
+rtsSpark *
+delete_from_sparkq (spark, p, dispose_too) /* unlink and dispose spark */
+rtsSpark *spark;
+PEs p;
+rtsBool dispose_too;
+{
+ rtsSpark *new_spark;
+
+ if (spark==NULL)
+ barf("delete_from_sparkq: trying to delete NULL spark\n");
+
+# if defined(GRAN_CHECK)
+ if ( RtsFlags.GranFlags.Debug.checkSparkQ ) {
+ debugBelch("## |%p:%p| (%p)<-spark=%p->(%p) <-(%p)\n",
+ pending_sparks_hd, pending_sparks_tl,
+ spark->prev, spark, spark->next,
+ (spark->next==NULL ? 0 : spark->next->prev));
+ }
+# endif
+
+ if (spark->prev==NULL) {
+ /* spark is first spark of queue => adjust hd pointer */
+ ASSERT(pending_sparks_hds[p]==spark);
+ pending_sparks_hds[p] = spark->next;
+ } else {
+ spark->prev->next = spark->next;
+ }
+ if (spark->next==NULL) {
+ ASSERT(pending_sparks_tls[p]==spark);
+    /* spark is last spark of queue => adjust tl pointer */
+ pending_sparks_tls[p] = spark->prev;
+ } else {
+ spark->next->prev = spark->prev;
+ }
+ new_spark = spark->next;
+
+# if defined(GRAN_CHECK)
+ if ( RtsFlags.GranFlags.Debug.checkSparkQ ) {
+ debugBelch("## |%p:%p| (%p)<-spark=%p->(%p) <-(%p); spark=%p will be deleted NOW \n",
+ pending_sparks_hd, pending_sparks_tl,
+ spark->prev, spark, spark->next,
+ (spark->next==NULL ? 0 : spark->next->prev), spark);
+ }
+# endif
+
+ if (dispose_too)
+ disposeSpark(spark);
+
+ return new_spark;
+}
+
+/* Mark all nodes pointed to by sparks in the spark queues (for GC) */
+void
+markSparkQueue(void)
+{
+ StgClosure *MarkRoot(StgClosure *root); // prototype
+ PEs p;
+ rtsSpark *sp;
+
+ for (p=0; p<RtsFlags.GranFlags.proc; p++)
+ for (sp=pending_sparks_hds[p]; sp!=NULL; sp=sp->next) {
+ ASSERT(sp->node!=NULL);
+ ASSERT(LOOKS_LIKE_GHC_INFO(sp->node->header.info));
+ // ToDo?: statistics gathering here (also for GUM!)
+ sp->node = (StgClosure *)MarkRoot(sp->node);
+ }
+ IF_DEBUG(gc,
+ debugBelch("@@ markSparkQueue: spark statistics at start of GC:");
+ print_sparkq_stats());
+}
+
+void
+print_spark(spark)
+rtsSpark *spark;
+{
+ char str[16];
+
+ if (spark==NULL) {
+ debugBelch("Spark: NIL\n");
+ return;
+ } else {
+ sprintf(str,
+ ((spark->node==NULL) ? "______" : "%#6lx"),
+ stgCast(StgPtr,spark->node));
+
+ debugBelch("Spark: Node %8s, Name %#6x, Global %5s, Creator %5x, Prev %6p, Next %6p\n",
+ str, spark->name,
+ ((spark->global)==rtsTrue?"True":"False"), spark->creator,
+ spark->prev, spark->next);
+ }
+}
+
+void
+print_sparkq(proc)
+PEs proc;
+// rtsSpark *hd;
+{
+ rtsSpark *x = pending_sparks_hds[proc];
+
+ debugBelch("Spark Queue of PE %d with root at %p:\n", proc, x);
+ for (; x!=(rtsSpark*)NULL; x=x->next) {
+ print_spark(x);
+ }
+}
+
+/*
+ Print a statistics of all spark queues.
+*/
+void
+print_sparkq_stats(void)
+{
+ PEs p;
+
+ debugBelch("SparkQs: [");
+ for (p=0; p<RtsFlags.GranFlags.proc; p++)
+ debugBelch(", PE %d: %d", p, spark_queue_len(p));
+ debugBelch("\n");
+}
+
+#endif
diff --git a/rts/Sparks.h b/rts/Sparks.h
new file mode 100644
index 0000000000..77d280bea8
--- /dev/null
+++ b/rts/Sparks.h
@@ -0,0 +1,104 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2000-2006
+ *
+ * Sparking support for GRAN, PAR and THREADED_RTS versions of the RTS.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef SPARKS_H
+#define SPARKS_H
+
+#if !defined(GRAN)
+StgInt newSpark (StgRegTable *reg, StgClosure *p);
+#endif
+
+#if defined(PARALLEL_HASKELL) || defined(THREADED_RTS)
+StgClosure * findSpark (Capability *cap);
+void initSparkPools (void);
+void markSparkQueue (evac_fn evac);
+void createSparkThread (Capability *cap, StgClosure *p);
+
+INLINE_HEADER void discardSparks (StgSparkPool *pool);
+INLINE_HEADER nat sparkPoolSize (StgSparkPool *pool);
+INLINE_HEADER rtsBool emptySparkPool (StgSparkPool *pool);
+
+INLINE_HEADER void discardSparksCap (Capability *cap);
+INLINE_HEADER nat sparkPoolSizeCap (Capability *cap);
+INLINE_HEADER rtsBool emptySparkPoolCap (Capability *cap);
+#endif
+
+#if defined(PARALLEL_HASKELL)
+StgTSO *activateSpark (rtsSpark spark) ;
+rtsBool add_to_spark_queue( StgClosure *closure, StgSparkPool *pool );
+void markSparkQueue( void );
+nat spark_queue_len( StgSparkPool *pool );
+void disposeSpark( StgClosure *spark );
+#endif
+
+#if defined(GRAN)
+void findLocalSpark (rtsEvent *event, rtsBool *found_res, rtsSparkQ *spark_res);
+rtsBool activateSpark (rtsEvent *event, rtsSparkQ spark);
+rtsSpark *newSpark(StgClosure *node, nat name, nat gran_info,
+ nat size_info, nat par_info, nat local);
+void add_to_spark_queue(rtsSpark *spark);
+rtsSpark *delete_from_sparkq (rtsSpark *spark, PEs p, rtsBool dispose_too);
+void disposeSpark(rtsSpark *spark);
+void disposeSparkQ(rtsSparkQ spark);
+void print_spark(rtsSpark *spark);
+void print_sparkq(PEs proc);
+void print_sparkq_stats(void);
+nat spark_queue_len(PEs proc);
+void markSparkQueue(void);
+#endif
+
+/* -----------------------------------------------------------------------------
+ * PRIVATE below here
+ * -------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL) || defined(THREADED_RTS)
+
+INLINE_HEADER rtsBool
+emptySparkPool (StgSparkPool *pool)
+{
+ return (pool->hd == pool->tl);
+}
+
+INLINE_HEADER rtsBool
+emptySparkPoolCap (Capability *cap)
+{ return emptySparkPool(&cap->r.rSparks); }
+
+INLINE_HEADER nat
+sparkPoolSize (StgSparkPool *pool)
+{
+ if (pool->hd <= pool->tl) {
+ return (pool->hd - pool->tl);
+ } else {
+ return (pool->lim - pool->hd + pool->tl - pool->base);
+ }
+}
+
+INLINE_HEADER nat
+sparkPoolSizeCap (Capability *cap)
+{ return sparkPoolSize(&cap->r.rSparks); }
+
+INLINE_HEADER void
+discardSparks (StgSparkPool *pool)
+{
+ pool->hd = pool->tl;
+}
+
+INLINE_HEADER void
+discardSparksCap (Capability *cap)
+{ return discardSparks(&cap->r.rSparks); }
+
+
+#elif defined(THREADED_RTS)
+
+INLINE_HEADER rtsBool
+emptySparkPoolCap (Capability *cap STG_UNUSED)
+{ return rtsTrue; }
+
+#endif
+
+#endif /* SPARKS_H */
diff --git a/rts/Stable.c b/rts/Stable.c
new file mode 100644
index 0000000000..a4db5cd749
--- /dev/null
+++ b/rts/Stable.c
@@ -0,0 +1,460 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2002
+ *
+ * Stable names and stable pointers.
+ *
+ * ---------------------------------------------------------------------------*/
+
+// Make static versions of inline functions in Stable.h:
+#define RTS_STABLE_C
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "Hash.h"
+#include "RtsUtils.h"
+#include "OSThreads.h"
+#include "Storage.h"
+#include "RtsAPI.h"
+#include "RtsFlags.h"
+#include "OSThreads.h"
+
+/* Comment from ADR's implementation in old RTS:
+
+  This file (together with @ghc/runtime/storage/PerformIO.lhc@ and a
+ small change in @HpOverflow.lc@) consists of the changes in the
+ runtime system required to implement "Stable Pointers". But we're
+ getting a bit ahead of ourselves --- what is a stable pointer and what
+ is it used for?
+
+ When Haskell calls C, it normally just passes over primitive integers,
+ floats, bools, strings, etc. This doesn't cause any problems at all
+ for garbage collection because the act of passing them makes a copy
+ from the heap, stack or wherever they are onto the C-world stack.
+ However, if we were to pass a heap object such as a (Haskell) @String@
+ and a garbage collection occured before we finished using it, we'd run
+ into problems since the heap object might have been moved or even
+ deleted.
+
+ So, if a C call is able to cause a garbage collection or we want to
+ store a pointer to a heap object between C calls, we must be careful
+ when passing heap objects. Our solution is to keep a table of all
+ objects we've given to the C-world and to make sure that the garbage
+ collector collects these objects --- updating the table as required to
+ make sure we can still find the object.
+
+
+ Of course, all this rather begs the question: why would we want to
+ pass a boxed value?
+
+ One very good reason is to preserve laziness across the language
+ interface. Rather than evaluating an integer or a string because it
+ {\em might\/} be required by the C function, we can wait until the C
+ function actually wants the value and then force an evaluation.
+
+ Another very good reason (the motivating reason!) is that the C code
+ might want to execute an object of sort $IO ()$ for the side-effects
+ it will produce. For example, this is used when interfacing to an X
+ widgets library to allow a direct implementation of callbacks.
+
+
+ The @makeStablePointer :: a -> IO (StablePtr a)@ function
+ converts a value into a stable pointer. It is part of the @PrimIO@
+ monad, because we want to be sure we don't allocate one twice by
+ accident, and then only free one of the copies.
+
+ \begin{verbatim}
+  makeStablePtr# :: a -> State# RealWorld -> (# State# RealWorld, a #)
+ freeStablePtr# :: StablePtr# a -> State# RealWorld -> State# RealWorld
+ deRefStablePtr# :: StablePtr# a -> State# RealWorld ->
+ (# State# RealWorld, a #)
+ \end{verbatim}
+
+ There may be additional functions on the C side to allow evaluation,
+ application, etc of a stable pointer.
+
+*/
+
+snEntry *stable_ptr_table = NULL;
+static snEntry *stable_ptr_free = NULL;
+
+static unsigned int SPT_size = 0;
+
+#ifdef THREADED_RTS
+static Mutex stable_mutex;
+#endif
+
+/* This hash table maps Haskell objects to stable names, so that every
+ * call to lookupStableName on a given object will return the same
+ * stable name.
+ *
+ * OLD COMMENTS about reference counting follow. The reference count
+ * in a stable name entry is now just a counter.
+ *
+ * Reference counting
+ * ------------------
+ * A plain stable name entry has a zero reference count, which means
+ * the entry will disappear when the object it points to is
+ * unreachable. For stable pointers, we need an entry that sticks
+ * around and keeps the object it points to alive, so each stable name
+ * entry has an associated reference count.
+ *
+ * A stable pointer has a weighted reference count N attached to it
+ * (actually in its upper 5 bits), which represents the weight
+ * 2^(N-1). The stable name entry keeps a 32-bit reference count, which
+ * represents any weight between 1 and 2^32 (represented as zero).
+ * When the weight is 2^32, the stable name table owns "all" of the
+ * stable pointers to this object, and the entry can be garbage
+ * collected if the object isn't reachable.
+ *
+ * A new stable pointer is given the weight log2(W/2), where W is the
+ * weight stored in the table entry. The new weight in the table is W
+ * - 2^log2(W/2).
+ *
+ * A stable pointer can be "split" into two stable pointers, by
+ * dividing the weight by 2 and giving each pointer half.
+ * When freeing a stable pointer, the weight of the pointer is added
+ * to the weight stored in the table entry.
+ * */
+
+static HashTable *addrToStableHash = NULL;
+
+#define INIT_SPT_SIZE 64
+
+STATIC_INLINE void
+initFreeList(snEntry *table, nat n, snEntry *free)
+{
+ snEntry *p;
+
+ for (p = table + n - 1; p >= table; p--) {
+ p->addr = (P_)free;
+ p->old = NULL;
+ p->ref = 0;
+ p->sn_obj = NULL;
+ free = p;
+ }
+ stable_ptr_free = table;
+}
+
+void
+initStablePtrTable(void)
+{
+ if (SPT_size > 0)
+ return;
+
+ SPT_size = INIT_SPT_SIZE;
+ stable_ptr_table = stgMallocBytes(SPT_size * sizeof(snEntry),
+ "initStablePtrTable");
+
+ /* we don't use index 0 in the stable name table, because that
+ * would conflict with the hash table lookup operations which
+ * return NULL if an entry isn't found in the hash table.
+ */
+ initFreeList(stable_ptr_table+1,INIT_SPT_SIZE-1,NULL);
+ addrToStableHash = allocHashTable();
+
+#ifdef THREADED_RTS
+ initMutex(&stable_mutex);
+#endif
+}
+
+/*
+ * get at the real stuff...remove indirections.
+ *
+ * ToDo: move to a better home.
+ */
+static
+StgClosure*
+removeIndirections(StgClosure* p)
+{
+ StgClosure* q = p;
+
+ while (get_itbl(q)->type == IND ||
+ get_itbl(q)->type == IND_STATIC ||
+ get_itbl(q)->type == IND_OLDGEN ||
+ get_itbl(q)->type == IND_PERM ||
+ get_itbl(q)->type == IND_OLDGEN_PERM ) {
+ q = ((StgInd *)q)->indirectee;
+ }
+ return q;
+}
+
+static StgWord
+lookupStableName_(StgPtr p)
+{
+ StgWord sn;
+ void* sn_tmp;
+
+ if (stable_ptr_free == NULL) {
+ enlargeStablePtrTable();
+ }
+
+ /* removing indirections increases the likelihood
+ * of finding a match in the stable name hash table.
+ */
+ p = (StgPtr)removeIndirections((StgClosure*)p);
+
+ sn_tmp = lookupHashTable(addrToStableHash,(W_)p);
+ sn = (StgWord)sn_tmp;
+
+ if (sn != 0) {
+ ASSERT(stable_ptr_table[sn].addr == p);
+ IF_DEBUG(stable,debugBelch("cached stable name %ld at %p\n",sn,p));
+ return sn;
+ } else {
+ sn = stable_ptr_free - stable_ptr_table;
+ stable_ptr_free = (snEntry*)(stable_ptr_free->addr);
+ stable_ptr_table[sn].ref = 0;
+ stable_ptr_table[sn].addr = p;
+ stable_ptr_table[sn].sn_obj = NULL;
+ /* IF_DEBUG(stable,debugBelch("new stable name %d at %p\n",sn,p)); */
+
+ /* add the new stable name to the hash table */
+ insertHashTable(addrToStableHash, (W_)p, (void *)sn);
+
+ return sn;
+ }
+}
+
+StgWord
+lookupStableName(StgPtr p)
+{
+ StgWord res;
+
+ initStablePtrTable();
+ ACQUIRE_LOCK(&stable_mutex);
+ res = lookupStableName_(p);
+ RELEASE_LOCK(&stable_mutex);
+ return res;
+}
+
+STATIC_INLINE void
+freeStableName(snEntry *sn)
+{
+ ASSERT(sn->sn_obj == NULL);
+ if (sn->addr != NULL) {
+ removeHashTable(addrToStableHash, (W_)sn->addr, NULL);
+ }
+ sn->addr = (P_)stable_ptr_free;
+ stable_ptr_free = sn;
+}
+
+StgStablePtr
+getStablePtr(StgPtr p)
+{
+ StgWord sn;
+
+ initStablePtrTable();
+ ACQUIRE_LOCK(&stable_mutex);
+ sn = lookupStableName_(p);
+ stable_ptr_table[sn].ref++;
+ RELEASE_LOCK(&stable_mutex);
+ return (StgStablePtr)(sn);
+}
+
+void
+freeStablePtr(StgStablePtr sp)
+{
+ snEntry *sn;
+
+ initStablePtrTable();
+ ACQUIRE_LOCK(&stable_mutex);
+
+ sn = &stable_ptr_table[(StgWord)sp];
+
+ ASSERT((StgWord)sp < SPT_size && sn->addr != NULL && sn->ref > 0);
+
+ sn->ref--;
+
+ // If this entry has no StableName attached, then just free it
+ // immediately. This is important; it might be a while before the
+ // next major GC which actually collects the entry.
+ if (sn->sn_obj == NULL && sn->ref == 0) {
+ freeStableName(sn);
+ }
+
+ RELEASE_LOCK(&stable_mutex);
+}
+
+void
+enlargeStablePtrTable(void)
+{
+ nat old_SPT_size = SPT_size;
+
+ // 2nd and subsequent times
+ SPT_size *= 2;
+ stable_ptr_table =
+ stgReallocBytes(stable_ptr_table,
+ SPT_size * sizeof(snEntry),
+ "enlargeStablePtrTable");
+
+ initFreeList(stable_ptr_table + old_SPT_size, old_SPT_size, NULL);
+}
+
+/* -----------------------------------------------------------------------------
+ * Treat stable pointers as roots for the garbage collector.
+ *
+ * A stable pointer is any stable name entry with a ref > 0. We'll
+ * take the opportunity to zero the "keep" flags at the same time.
+ * -------------------------------------------------------------------------- */
+
+void
+markStablePtrTable(evac_fn evac)
+{
+ snEntry *p, *end_stable_ptr_table;
+ StgPtr q;
+
+ end_stable_ptr_table = &stable_ptr_table[SPT_size];
+
+ // Mark all the stable *pointers* (not stable names).
+ // _starting_ at index 1; index 0 is unused.
+ for (p = stable_ptr_table+1; p < end_stable_ptr_table; p++) {
+ q = p->addr;
+
+ // Internal pointers are free slots. If q == NULL, it's a
+ // stable name where the object has been GC'd, but the
+ // StableName object (sn_obj) is still alive.
+ if (q && (q < (P_)stable_ptr_table || q >= (P_)end_stable_ptr_table)) {
+
+ // save the current addr away: we need to be able to tell
+ // whether the objects moved in order to be able to update
+ // the hash table later.
+ p->old = p->addr;
+
+ // if the ref is non-zero, treat addr as a root
+ if (p->ref != 0) {
+ evac((StgClosure **)&p->addr);
+ }
+ }
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Thread the stable pointer table for compacting GC.
+ *
+ * Here we must call the supplied evac function for each pointer into
+ * the heap from the stable pointer table, because the compacting
+ * collector may move the object it points to.
+ * -------------------------------------------------------------------------- */
+
+void
+threadStablePtrTable( evac_fn evac )
+{
+ snEntry *p, *end_stable_ptr_table;
+ StgPtr q;
+
+ end_stable_ptr_table = &stable_ptr_table[SPT_size];
+
+ for (p = stable_ptr_table+1; p < end_stable_ptr_table; p++) {
+
+ if (p->sn_obj != NULL) {
+ evac((StgClosure **)&p->sn_obj);
+ }
+
+ q = p->addr;
+ if (q && (q < (P_)stable_ptr_table || q >= (P_)end_stable_ptr_table)) {
+ evac((StgClosure **)&p->addr);
+ }
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Garbage collect any dead entries in the stable pointer table.
+ *
+ * A dead entry has:
+ *
+ * - a zero reference count
+ * - a dead sn_obj
+ *
+ * Both of these conditions must be true in order to re-use the stable
+ * name table entry. We can re-use stable name table entries for live
+ * heap objects, as long as the program has no StableName objects that
+ * refer to the entry.
+ * -------------------------------------------------------------------------- */
+
+void
+gcStablePtrTable( void )
+{
+ snEntry *p, *end_stable_ptr_table;
+ StgPtr q;
+
+ end_stable_ptr_table = &stable_ptr_table[SPT_size];
+
+ // NOTE: _starting_ at index 1; index 0 is unused.
+ for (p = stable_ptr_table + 1; p < end_stable_ptr_table; p++) {
+
+ // Update the pointer to the StableName object, if there is one
+ if (p->sn_obj != NULL) {
+ p->sn_obj = isAlive(p->sn_obj);
+ }
+
+ // Internal pointers are free slots. If q == NULL, it's a
+ // stable name where the object has been GC'd, but the
+ // StableName object (sn_obj) is still alive.
+ q = p->addr;
+ if (q && (q < (P_)stable_ptr_table || q >= (P_)end_stable_ptr_table)) {
+
+ // StableNames only:
+ if (p->ref == 0) {
+ if (p->sn_obj == NULL) {
+ // StableName object is dead
+ freeStableName(p);
+ IF_DEBUG(stable, debugBelch("GC'd Stable name %ld\n",
+ p - stable_ptr_table));
+ continue;
+
+ } else {
+ p->addr = (StgPtr)isAlive((StgClosure *)p->addr);
+ IF_DEBUG(stable, debugBelch("Stable name %ld still alive at %p, ref %ld\n", p - stable_ptr_table, p->addr, p->ref));
+ }
+ }
+ }
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Update the StablePtr/StableName hash table
+ *
+ * The boolean argument 'full' indicates that a major collection is
+ * being done, so we might as well throw away the hash table and build
+ * a new one. For a minor collection, we just re-hash the elements
+ * that changed.
+ * -------------------------------------------------------------------------- */
+
+void
+updateStablePtrTable(rtsBool full)
+{
+ snEntry *p, *end_stable_ptr_table;
+
+ if (full && addrToStableHash != NULL) {
+ freeHashTable(addrToStableHash,NULL);
+ addrToStableHash = allocHashTable();
+ }
+
+ end_stable_ptr_table = &stable_ptr_table[SPT_size];
+
+ // NOTE: _starting_ at index 1; index 0 is unused.
+ for (p = stable_ptr_table + 1; p < end_stable_ptr_table; p++) {
+
+ if (p->addr == NULL) {
+ if (p->old != NULL) {
+ // The target has been garbage collected. Remove its
+ // entry from the hash table.
+ removeHashTable(addrToStableHash, (W_)p->old, NULL);
+ p->old = NULL;
+ }
+ }
+ else if (p->addr < (P_)stable_ptr_table
+ || p->addr >= (P_)end_stable_ptr_table) {
+ // Target still alive, Re-hash this stable name
+ if (full) {
+ insertHashTable(addrToStableHash, (W_)p->addr,
+ (void *)(p - stable_ptr_table));
+ } else if (p->addr != p->old) {
+ removeHashTable(addrToStableHash, (W_)p->old, NULL);
+ insertHashTable(addrToStableHash, (W_)p->addr,
+ (void *)(p - stable_ptr_table));
+ }
+ }
+ }
+}
diff --git a/rts/Stats.c b/rts/Stats.c
new file mode 100644
index 0000000000..28d09bdbed
--- /dev/null
+++ b/rts/Stats.c
@@ -0,0 +1,632 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Statistics and timing-related functions.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "MBlock.h"
+#include "Schedule.h"
+#include "Stats.h"
+#include "ParTicky.h" /* ToDo: move into Rts.h */
+#include "Profiling.h"
+#include "Storage.h"
+#include "GetTime.h"
+
+/* huh? */
+#define BIG_STRING_LEN 512
+
+#define TICK_TO_DBL(t) ((double)(t) / TICKS_PER_SECOND)
+
+static Ticks ElapsedTimeStart = 0;
+
+static Ticks InitUserTime = 0;
+static Ticks InitElapsedTime = 0;
+static Ticks InitElapsedStamp = 0;
+
+static Ticks MutUserTime = 0;
+static Ticks MutElapsedTime = 0;
+static Ticks MutElapsedStamp = 0;
+
+static Ticks ExitUserTime = 0;
+static Ticks ExitElapsedTime = 0;
+
+static ullong GC_tot_alloc = 0;
+static ullong GC_tot_copied = 0;
+static ullong GC_tot_scavd_copied = 0;
+
+static Ticks GC_start_time = 0, GC_tot_time = 0; /* User GC Time */
+static Ticks GCe_start_time = 0, GCe_tot_time = 0; /* Elapsed GC time */
+
+#ifdef PROFILING
+static Ticks RP_start_time = 0, RP_tot_time = 0; /* retainer prof user time */
+static Ticks RPe_start_time = 0, RPe_tot_time = 0; /* retainer prof elap time */
+
+static Ticks HC_start_time, HC_tot_time = 0; // heap census prof user time
+static Ticks HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
+#endif
+
+#ifdef PROFILING
+#define PROF_VAL(x) (x)
+#else
+#define PROF_VAL(x) 0
+#endif
+
+static lnat MaxResidency = 0; // in words; for stats only
+static lnat AvgResidency = 0;
+static lnat ResidencySamples = 0; // for stats only
+
+static lnat GC_start_faults = 0, GC_end_faults = 0;
+
+static Ticks *GC_coll_times;
+
+static void statsPrintf( char *s, ... )
+ GNUC3_ATTRIBUTE(format (printf, 1, 2));
+
+static void statsFlush( void );
+static void statsClose( void );
+
+Ticks stat_getElapsedGCTime(void)
+{
+ return GCe_tot_time;
+}
+
+/* mut_user_time_during_GC() and mut_user_time()
+ *
+ * The former function can be used to get the current mutator time
+ * *during* a GC, i.e. between stat_startGC and stat_endGC. This is
+ * used in the heap profiler for accurately time stamping the heap
+ * sample.
+ *
+ * ATTENTION: mut_user_time_during_GC() relies on GC_start_time being
+ * defined in stat_startGC() - to minimise system calls,
+ * GC_start_time is, however, only defined when really needed (check
+ * stat_startGC() for details)
+ */
+double
+mut_user_time_during_GC( void )
+{
+ return TICK_TO_DBL(GC_start_time - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time));
+}
+
+double
+mut_user_time( void )
+{
+ Ticks user;
+ user = getProcessCPUTime();
+ return TICK_TO_DBL(user - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time));
+}
+
+#ifdef PROFILING
+/*
+ mut_user_time_during_RP() is similar to mut_user_time_during_GC();
+ it returns the MUT time during retainer profiling.
+ The same is for mut_user_time_during_HC();
+ */
+double
+mut_user_time_during_RP( void )
+{
+ return TICK_TO_DBL(RP_start_time - GC_tot_time - RP_tot_time - HC_tot_time);
+}
+
+double
+mut_user_time_during_heap_census( void )
+{
+ return TICK_TO_DBL(HC_start_time - GC_tot_time - RP_tot_time - HC_tot_time);
+}
+#endif /* PROFILING */
+
+void
+initStats(void)
+{
+ nat i;
+
+ if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
+ statsPrintf(" Alloc Copied Live GC GC TOT TOT Page Flts\n");
+ statsPrintf(" bytes bytes bytes user elap user elap\n");
+ }
+ GC_coll_times =
+ (Ticks *)stgMallocBytes(
+ sizeof(Ticks)*RtsFlags.GcFlags.generations,
+ "initStats");
+ for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
+ GC_coll_times[i] = 0;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Initialisation time...
+ -------------------------------------------------------------------------- */
+
+void
+stat_startInit(void)
+{
+ Ticks elapsed;
+
+ elapsed = getProcessElapsedTime();
+ ElapsedTimeStart = elapsed;
+}
+
+void
+stat_endInit(void)
+{
+ Ticks user, elapsed;
+
+ getProcessTimes(&user, &elapsed);
+
+ InitUserTime = user;
+ InitElapsedStamp = elapsed;
+ if (ElapsedTimeStart > elapsed) {
+ InitElapsedTime = 0;
+ } else {
+ InitElapsedTime = elapsed - ElapsedTimeStart;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ stat_startExit and stat_endExit
+
+ These two measure the time taken in shutdownHaskell().
+ -------------------------------------------------------------------------- */
+
+void
+stat_startExit(void)
+{
+ Ticks user, elapsed;
+
+ getProcessTimes(&user, &elapsed);
+
+ MutElapsedStamp = elapsed;
+ MutElapsedTime = elapsed - GCe_tot_time -
+ PROF_VAL(RPe_tot_time + HCe_tot_time) - InitElapsedStamp;
+ if (MutElapsedTime < 0) { MutElapsedTime = 0; } /* sometimes -0.00 */
+
+ MutUserTime = user - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime;
+ if (MutUserTime < 0) { MutUserTime = 0; }
+}
+
+void
+stat_endExit(void)
+{
+ Ticks user, elapsed;
+
+ getProcessTimes(&user, &elapsed);
+
+ ExitUserTime = user - MutUserTime - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime;
+ ExitElapsedTime = elapsed - MutElapsedStamp;
+ if (ExitUserTime < 0) {
+ ExitUserTime = 0;
+ }
+ if (ExitElapsedTime < 0) {
+ ExitElapsedTime = 0;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Called at the beginning of each GC
+ -------------------------------------------------------------------------- */
+
+static nat rub_bell = 0;
+
+/* initialise global variables needed during GC
+ *
+ * * GC_start_time is read in mut_user_time_during_GC(), which in turn is
+ * needed if either PROFILING or DEBUGing is enabled
+ */
+void
+stat_startGC(void)
+{
+ nat bell = RtsFlags.GcFlags.ringBell;
+
+ if (bell) {
+ if (bell > 1) {
+ debugBelch(" GC ");
+ rub_bell = 1;
+ } else {
+ debugBelch("\007");
+ }
+ }
+
+#if defined(PROFILING) || defined(DEBUG)
+ GC_start_time = getProcessCPUTime(); // needed in mut_user_time_during_GC()
+#endif
+
+ if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+#if !defined(PROFILING) && !defined(DEBUG)
+ GC_start_time = getProcessCPUTime();
+#endif
+ GCe_start_time = getProcessElapsedTime();
+ if (RtsFlags.GcFlags.giveStats) {
+ GC_start_faults = getPageFaults();
+ }
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Called at the end of each GC
+ -------------------------------------------------------------------------- */
+
+/* Accumulate the statistics for one completed collection: per-generation
+ * and total GC CPU/elapsed time, bytes allocated/copied, and — when the
+ * oldest generation was collected (a major GC) — a residency sample.
+ * With verbose stats (-S) it also prints one line per collection. */
+void
+stat_endGC (lnat alloc, lnat live, lnat copied,
+ lnat scavd_copied, lnat gen)
+{
+ if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+ Ticks time, etime, gc_time, gc_etime;
+
+ getProcessTimes(&time, &etime);
+ gc_time = time - GC_start_time;
+ gc_etime = etime - GCe_start_time;
+
+ if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) {
+ nat faults = getPageFaults();
+
+ /* NOTE(review): the two fault columns are (faults during this GC)
+ * and (GC_start_faults - GC_end_faults); the latter uses the
+ * previous GC's end count — confirm the intended semantics. */
+ statsPrintf("%9ld %9ld %9ld",
+ alloc*sizeof(W_), (copied+scavd_copied)*sizeof(W_),
+ live*sizeof(W_));
+ statsPrintf(" %5.2f %5.2f %7.2f %7.2f %4ld %4ld (Gen: %2ld)\n",
+ TICK_TO_DBL(gc_time),
+ TICK_TO_DBL(gc_etime),
+ TICK_TO_DBL(time),
+ TICK_TO_DBL(etime - ElapsedTimeStart),
+ faults - GC_start_faults,
+ GC_start_faults - GC_end_faults,
+ gen);
+
+ GC_end_faults = faults;
+ statsFlush();
+ }
+
+ GC_coll_times[gen] += gc_time;
+
+ GC_tot_copied += (ullong) copied;
+ GC_tot_scavd_copied += (ullong) scavd_copied;
+ GC_tot_alloc += (ullong) alloc;
+ GC_tot_time += gc_time;
+ GCe_tot_time += gc_etime;
+
+#if defined(THREADED_RTS)
+ {
+ /* charge this GC's time to the current Task as well, if any */
+ Task *task;
+ if ((task = myTask()) != NULL) {
+ task->gc_time += gc_time;
+ task->gc_etime += gc_etime;
+ }
+ }
+#endif
+
+ if (gen == RtsFlags.GcFlags.generations-1) { /* major GC? */
+ if (live > MaxResidency) {
+ MaxResidency = live;
+ }
+ ResidencySamples++;
+ AvgResidency += live;
+ }
+ }
+
+ /* erase the " GC " printed by stat_startGC, if any */
+ if (rub_bell) {
+ debugBelch("\b\b\b \b\b\b");
+ rub_bell = 0;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+   Called at the beginning of each Retainer Profiling
+   -------------------------------------------------------------------------- */
+#ifdef PROFILING
+/* Record the CPU and elapsed time at the start of a retainer-profiling
+ * pass; stat_endRP() accumulates the difference into RP_tot_time and
+ * RPe_tot_time. */
+void
+stat_startRP(void)
+{
+ Ticks user, elapsed;
+ getProcessTimes( &user, &elapsed );
+
+ RP_start_time = user;
+ RPe_start_time = elapsed;
+}
+#endif /* PROFILING */
+
+/* -----------------------------------------------------------------------------
+   Called at the end of each Retainer Profiling
+   -------------------------------------------------------------------------- */
+
+#ifdef PROFILING
+/* Accumulate the time spent in this retainer-profiling pass (since the
+ * matching stat_startRP) and write a summary line to the profile file. */
+void
+stat_endRP(
+ nat retainerGeneration,
+#ifdef DEBUG_RETAINER
+ nat maxCStackSize,
+ int maxStackSize,
+#endif
+ double averageNumVisit)
+{
+ Ticks user, elapsed;
+ getProcessTimes( &user, &elapsed );
+
+ RP_tot_time += user - RP_start_time;
+ RPe_tot_time += elapsed - RPe_start_time;
+
+ fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
+ retainerGeneration, mut_user_time_during_RP());
+#ifdef DEBUG_RETAINER
+ fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
+ /* NOTE(review): maxStackSize is declared int but printed with %u —
+ * harmless while non-negative, but the specifier/type mismatch is
+ * worth confirming. */
+ fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
+#endif
+ fprintf(prof_file, "\tAverage number of visits per object = %f\n", averageNumVisit);
+}
+#endif /* PROFILING */
+
+/* -----------------------------------------------------------------------------
+ Called at the beginning of each heap census
+ -------------------------------------------------------------------------- */
+#ifdef PROFILING
+/* Record the CPU and elapsed time at the start of a heap census;
+ * stat_endHeapCensus() accumulates the difference into HC_tot_time and
+ * HCe_tot_time. */
+void
+stat_startHeapCensus(void)
+{
+ Ticks user, elapsed;
+ getProcessTimes( &user, &elapsed );
+
+ HC_start_time = user;
+ HCe_start_time = elapsed;
+}
+#endif /* PROFILING */
+
+/* -----------------------------------------------------------------------------
+ Called at the end of each heap census
+ -------------------------------------------------------------------------- */
+#ifdef PROFILING
+/* Accumulate the time spent in this heap census (since the matching
+ * stat_startHeapCensus). */
+void
+stat_endHeapCensus(void)
+{
+ Ticks user, elapsed;
+ getProcessTimes( &user, &elapsed );
+
+ HC_tot_time += user - HC_start_time;
+ HCe_tot_time += elapsed - HCe_start_time;
+}
+#endif /* PROFILING */
+
+/* -----------------------------------------------------------------------------
+ Called at the end of execution
+
+ NOTE: number of allocations is not entirely accurate: it doesn't
+ take into account the few bytes at the end of the heap that
+ were left unused when the heap-check failed.
+ -------------------------------------------------------------------------- */
+
+/* Print the end-of-run statistics report, in one of three formats
+ * depending on RtsFlags.GcFlags.giveStats: verbose (-S) gets a final
+ * allocation line, summary (-s) gets the full multi-line report, and
+ * ONELINE gets the machine-readable "<<ghc: ...>>" line.  'alloc' is
+ * the allocation in the current (unfinished) nursery block, added to
+ * the running total. */
+void
+stat_exit(int alloc)
+{
+ if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+
+ char temp[BIG_STRING_LEN];
+ Ticks time;
+ Ticks etime;
+ nat g, total_collections = 0;
+
+ getProcessTimes( &time, &etime );
+ etime -= ElapsedTimeStart;
+
+ GC_tot_alloc += alloc;
+
+ /* Count total garbage collections */
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++)
+ total_collections += generations[g].collections;
+
+ /* avoid divide by zero if time is measured as 0.00 seconds -- SDM */
+ /* NOTE(review): Ticks compared against the double literal 0.0 — works
+ * via implicit conversion, but an integer 0 would be cleaner. */
+ if (time == 0.0) time = 1;
+ if (etime == 0.0) etime = 1;
+
+ if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
+ statsPrintf("%9ld %9.9s %9.9s", (lnat)alloc*sizeof(W_), "", "");
+ statsPrintf(" %5.2f %5.2f\n\n", 0.0, 0.0);
+ }
+
+ if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
+ ullong_format_string(GC_tot_alloc*sizeof(W_),
+ temp, rtsTrue/*commas*/);
+ statsPrintf("%11s bytes allocated in the heap\n", temp);
+
+ ullong_format_string(GC_tot_copied*sizeof(W_),
+ temp, rtsTrue/*commas*/);
+ statsPrintf("%11s bytes copied during GC (scavenged)\n", temp);
+
+ ullong_format_string(GC_tot_scavd_copied*sizeof(W_),
+ temp, rtsTrue/*commas*/);
+ statsPrintf("%11s bytes copied during GC (not scavenged)\n", temp);
+
+ if ( ResidencySamples > 0 ) {
+ ullong_format_string(MaxResidency*sizeof(W_),
+ temp, rtsTrue/*commas*/);
+ statsPrintf("%11s bytes maximum residency (%ld sample(s))\n",
+ temp, ResidencySamples);
+ }
+ statsPrintf("\n");
+
+ /* Print garbage collections in each gen */
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ statsPrintf("%11d collections in generation %d (%6.2fs)\n",
+ generations[g].collections, g,
+ TICK_TO_DBL(GC_coll_times[g]));
+ }
+
+ statsPrintf("\n%11ld Mb total memory in use\n\n",
+ mblocks_allocated * MBLOCK_SIZE / (1024 * 1024));
+
+#if defined(THREADED_RTS)
+ {
+ /* per-Task MUT/GC time breakdown */
+ nat i;
+ Task *task;
+ for (i = 0, task = all_tasks;
+ task != NULL;
+ i++, task = task->all_link) {
+ statsPrintf(" Task %2d %-8s : MUT time: %6.2fs (%6.2fs elapsed)\n"
+ " GC time: %6.2fs (%6.2fs elapsed)\n\n",
+ i,
+ (task->tso == NULL) ? "(worker)" : "(bound)",
+ TICK_TO_DBL(task->mut_time),
+ TICK_TO_DBL(task->mut_etime),
+ TICK_TO_DBL(task->gc_time),
+ TICK_TO_DBL(task->gc_etime));
+ }
+ }
+#endif
+
+ statsPrintf(" INIT time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(InitUserTime), TICK_TO_DBL(InitElapsedTime));
+ statsPrintf(" MUT time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(MutUserTime), TICK_TO_DBL(MutElapsedTime));
+ statsPrintf(" GC time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(GC_tot_time), TICK_TO_DBL(GCe_tot_time));
+#ifdef PROFILING
+ statsPrintf(" RP time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(RP_tot_time), TICK_TO_DBL(RPe_tot_time));
+ statsPrintf(" PROF time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(HC_tot_time), TICK_TO_DBL(HCe_tot_time));
+#endif
+ statsPrintf(" EXIT time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(ExitUserTime), TICK_TO_DBL(ExitElapsedTime));
+ statsPrintf(" Total time %6.2fs (%6.2fs elapsed)\n\n",
+ TICK_TO_DBL(time), TICK_TO_DBL(etime));
+ statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n",
+ TICK_TO_DBL(GC_tot_time)*100/TICK_TO_DBL(time),
+ TICK_TO_DBL(GCe_tot_time)*100/TICK_TO_DBL(etime));
+
+ /* Alloc rate = bytes allocated / MUT CPU time; guard the division
+ * against a zero mutator time. */
+ if (time - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time) == 0)
+ ullong_format_string(0, temp, rtsTrue/*commas*/);
+ else
+ ullong_format_string(
+ (ullong)((GC_tot_alloc*sizeof(W_))/
+ TICK_TO_DBL(time - GC_tot_time -
+ PROF_VAL(RP_tot_time + HC_tot_time))),
+ temp, rtsTrue/*commas*/);
+
+ statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp);
+
+ statsPrintf(" Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n",
+ TICK_TO_DBL(time - GC_tot_time -
+ PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime) * 100
+ / TICK_TO_DBL(time),
+ TICK_TO_DBL(time - GC_tot_time -
+ PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime) * 100
+ / TICK_TO_DBL(etime));
+ }
+
+ if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
+ /* print the long long separately to avoid bugginess on mingwin (2001-07-02, mingw-0.5) */
+ statsPrintf("<<ghc: %llu bytes, ", GC_tot_alloc*(ullong)sizeof(W_));
+ statsPrintf("%d GCs, %ld/%ld avg/max bytes residency (%ld samples), %luM in use, %.2f INIT (%.2f elapsed), %.2f MUT (%.2f elapsed), %.2f GC (%.2f elapsed) :ghc>>\n",
+ total_collections,
+ ResidencySamples == 0 ? 0 :
+ AvgResidency*sizeof(W_)/ResidencySamples,
+ MaxResidency*sizeof(W_),
+ ResidencySamples,
+ (unsigned long)(mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)),
+ TICK_TO_DBL(InitUserTime), TICK_TO_DBL(InitElapsedTime),
+ TICK_TO_DBL(MutUserTime), TICK_TO_DBL(MutElapsedTime),
+ TICK_TO_DBL(GC_tot_time), TICK_TO_DBL(GCe_tot_time));
+ }
+
+ statsFlush();
+ statsClose();
+ }
+}
+
+/* -----------------------------------------------------------------------------
+   statDescribeGens
+
+   Produce some detailed info on the state of the generational GC.
+   -------------------------------------------------------------------------- */
+#ifdef DEBUG
+/* Debug-only: dump a table describing every generation and step —
+ * mutable-list size, block counts, approximate live bytes, and the
+ * number of large objects. */
+void
+statDescribeGens(void)
+{
+ nat g, s, mut, lge;
+ lnat live;
+ bdescr *bd;
+ step *step;
+
+ debugBelch(
+" Gen Steps Max Mutable Step Blocks Live Large\n"
+" Blocks Closures Objects\n");
+
+ mut = 0;
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ /* size of this generation's mutable list, in words used */
+ for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
+ mut += bd->free - bd->start;
+ }
+
+ debugBelch("%8d %8d %8d %9d", g, generations[g].n_steps,
+ generations[g].max_blocks, mut);
+
+ for (s = 0; s < generations[g].n_steps; s++) {
+ step = &generations[g].steps[s];
+ live = 0;
+ for (bd = step->large_objects, lge = 0; bd; bd = bd->link) {
+ lge++;
+ }
+ live = step->n_large_blocks * BLOCK_SIZE;
+ bd = step->blocks;
+ // This live figure will be slightly less than the "live" figure
+ // given by +RTS -Sstderr, because we don't count the
+ // slop at the end of each block.
+ for (; bd; bd = bd->link) {
+ live += (bd->free - bd->start) * sizeof(W_);
+ }
+ if (s != 0) {
+ debugBelch("%36s","");
+ }
+ debugBelch("%6d %8d %8d %8d\n", s, step->n_blocks,
+ live, lge);
+ }
+ }
+ debugBelch("\n");
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ Stats available via a programmatic interface, so eg. GHCi can time
+ each compilation and expression evaluation.
+ -------------------------------------------------------------------------- */
+
+/* Programmatic interface: total bytes allocated so far
+ * (total_allocated is a word count, hence the sizeof(W_) scaling). */
+extern HsInt64 getAllocations( void )
+{ return (HsInt64)total_allocated * sizeof(W_); }
+
+/* -----------------------------------------------------------------------------
+ Dumping stuff in the stats file, or via the debug message interface
+ -------------------------------------------------------------------------- */
+
+/* printf-style output for the stats report: goes to the stats file if
+ * one is configured (RtsFlags.GcFlags.statsFile), otherwise to the
+ * debug-message channel via vdebugBelch. */
+static void
+statsPrintf( char *s, ... )
+{
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+ va_list ap;
+
+ va_start(ap,s);
+ if (sf == NULL) {
+ vdebugBelch(s,ap);
+ } else {
+ vfprintf(sf, s, ap);
+ }
+ va_end(ap);
+}
+
+/* Flush the stats file, if any (no-op when stats go to debugBelch). */
+static void
+statsFlush( void )
+{
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+ if (sf != NULL) {
+ fflush(sf);
+ }
+}
+
+/* Close the stats file, if any; called once at the end of stat_exit. */
+static void
+statsClose( void )
+{
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+ if (sf != NULL) {
+ fclose(sf);
+ }
+}
diff --git a/rts/Stats.h b/rts/Stats.h
new file mode 100644
index 0000000000..20bc0155ad
--- /dev/null
+++ b/rts/Stats.h
@@ -0,0 +1,56 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Statistics and timing-related functions.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef STATS_H
+#define STATS_H
+
+#include "GetTime.h"
+
+void stat_startInit(void);
+void stat_endInit(void);
+
+void stat_startGC(void);
+void stat_endGC (lnat alloc, lnat live,
+ lnat copied, lnat scavd_copied, lnat gen);
+
+#ifdef PROFILING
+void stat_startRP(void);
+void stat_endRP(nat,
+#ifdef DEBUG_RETAINER
+ nat, int,
+#endif
+ double);
+#endif /* PROFILING */
+
+#if defined(PROFILING) || defined(DEBUG)
+void stat_startHeapCensus(void);
+void stat_endHeapCensus(void);
+#endif
+
+void stat_startExit(void);
+void stat_endExit(void);
+
+void stat_exit(int alloc);
+void stat_workerStop(void);
+
+void initStats(void);
+
+double mut_user_time_during_GC(void);
+double mut_user_time(void);
+
+#ifdef PROFILING
+double mut_user_time_during_RP(void);
+double mut_user_time_during_heap_census(void);
+#endif /* PROFILING */
+
+void statDescribeGens( void );
+HsInt64 getAllocations( void );
+
+Ticks stat_getElapsedGCTime(void);
+
+#endif /* STATS_H */
diff --git a/rts/StgCRun.c b/rts/StgCRun.c
new file mode 100644
index 0000000000..c1afc16559
--- /dev/null
+++ b/rts/StgCRun.c
@@ -0,0 +1,897 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2003
+ *
+ * STG-to-C glue.
+ *
+ * To run an STG function from C land, call
+ *
+ * rv = StgRun(f,BaseReg);
+ *
+ * where "f" is the STG function to call, and BaseReg is the address of the
+ * RegTable for this run (we might have separate RegTables if we're running
+ * multiple threads on an SMP machine).
+ *
+ * In the end, "f" must JMP to StgReturn (defined below),
+ * passing the return-value "rv" in R1,
+ * to return to the caller of StgRun returning "rv" in
+ * the whatever way C returns a value.
+ *
+ * NOTE: StgRun/StgReturn do *NOT* load or store Hp or any
+ * other registers (other than saving the C callee-saves
+ * registers). Instead, the called function "f" must do that
+ * in STG land.
+ *
+ * GCC will have assumed that pushing/popping of C-stack frames is
+ * going on when it generated its code, and used stack space
+ * accordingly. However, we actually {\em post-process away} all
+ * such stack-framery (see \tr{ghc/driver/ghc-asm.lprl}). Things will
+ * be OK however, if we initially make sure there are
+ * @RESERVED_C_STACK_BYTES@ on the C-stack to begin with, for local
+ * variables.
+ *
+ * -------------------------------------------------------------------------- */
+
+#include "PosixSource.h"
+
+
+/*
+ * We define the following (unused) global register variables, because for
+ * some reason gcc generates sub-optimal code for StgRun() on the Alpha
+ * (unnecessarily saving extra registers on the stack) if we don't.
+ *
+ * Why do it at the top of this file, rather than near StgRun() below? Because
+ * gcc doesn't let us define global register variables after any function
+ * definition has been read. Any point after #include "Stg.h" would be too
+ * late.
+ *
+ * We define alpha_EXTRA_CAREFUL here to save $s6, $f8 and $f9 -- registers
+ * that we don't use but which are callee-save registers. The __divq() routine
+ * in libc.a clobbers $s6.
+ */
+#include "ghcconfig.h"
+#ifdef alpha_HOST_ARCH
+#define alpha_EXTRA_CAREFUL
+register long fake_ra __asm__("$26");
+register long fake_gp __asm__("$29");
+#ifdef alpha_EXTRA_CAREFUL
+register long fake_s6 __asm__("$15");
+register double fake_f8 __asm__("$f8");
+register double fake_f9 __asm__("$f9");
+#endif
+#endif
+
+/* include Stg.h first because we want real machine regs in here: we
+ * have to get the value of R1 back from Stg land to C land intact.
+ */
+#include "Stg.h"
+#include "Rts.h"
+#include "StgRun.h"
+#include "RtsFlags.h"
+#include "OSThreads.h"
+#include "Capability.h"
+
+#ifdef DEBUG
+#include "RtsUtils.h"
+#include "Printer.h"
+#endif
+
+#ifdef USE_MINIINTERPRETER
+
+/* -----------------------------------------------------------------------------
+ any architecture (using miniinterpreter)
+ -------------------------------------------------------------------------- */
+
+/* Miniinterpreter build: no assembly trampolining.  Each STG "jump" is
+ * an ordinary C call that returns the next continuation; loop until a
+ * continuation returns NULL (see StgReturn below), then the result is
+ * in R1. */
+StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg STG_UNUSED)
+{
+ while (f) {
+ IF_DEBUG(interpreter,
+ debugBelch("Jumping to ");
+ printPtr((P_)f); fflush(stdout);
+ debugBelch("\n");
+ );
+ f = (StgFunPtr) (f)();
+ }
+ return (StgRegTable *)R1.p;
+}
+
+/* Returning NULL terminates the miniinterpreter dispatch loop in StgRun. */
+StgFunPtr StgReturn(void)
+{
+ return 0;
+}
+
+#else /* !USE_MINIINTERPRETER */
+
+#ifdef LEADING_UNDERSCORE
+#define STG_RETURN "_StgReturn"
+#else
+#define STG_RETURN "StgReturn"
+#endif
+
+/* -----------------------------------------------------------------------------
+ x86 architecture
+ -------------------------------------------------------------------------- */
+
+#ifdef i386_HOST_ARCH
+
+#ifdef darwin_TARGET_OS
+#define STG_GLOBAL ".globl "
+#else
+#define STG_GLOBAL ".global "
+#endif
+
+/* x86 StgRun: save the C callee-saves registers (%ebx/%esi/%edi/%ebp)
+ * into the scratch words above the RESERVED_C_STACK_BYTES area, load
+ * BaseReg into %ebx, and jump to the STG function f.  STG code
+ * eventually jumps to the StgReturn label defined inside this asm,
+ * which copies R1 (%esi) into %eax and restores the saved registers.
+ * The local 'space' array provides both the reserved C stack and the
+ * four save slots. */
+StgRegTable *
+StgRun(StgFunPtr f, StgRegTable *basereg) {
+
+ unsigned char space[ RESERVED_C_STACK_BYTES + 4*sizeof(void *) ];
+ StgRegTable * r;
+
+ __asm__ volatile (
+ /*
+ * save callee-saves registers on behalf of the STG code.
+ */
+ "movl %%esp, %%eax\n\t"
+ "addl %4, %%eax\n\t"
+ "movl %%ebx,0(%%eax)\n\t"
+ "movl %%esi,4(%%eax)\n\t"
+ "movl %%edi,8(%%eax)\n\t"
+ "movl %%ebp,12(%%eax)\n\t"
+ /*
+ * Set BaseReg
+ */
+ "movl %3,%%ebx\n\t"
+ /*
+ * grab the function argument from the stack
+ */
+ "movl %2,%%eax\n\t"
+
+ /*
+ * Darwin note:
+ * The stack pointer has to be aligned to a multiple of 16 bytes at
+ * this point. This works out correctly with gcc 4.0.1, but it might
+ * break at any time in the future. TODO: Make this future-proof.
+ */
+
+ /*
+ * jump to it
+ */
+ "jmp *%%eax\n\t"
+
+ STG_GLOBAL STG_RETURN "\n"
+ STG_RETURN ":\n\t"
+
+ "movl %%esi, %%eax\n\t" /* Return value in R1 */
+
+ /*
+ * restore callee-saves registers. (Don't stomp on %%eax!)
+ */
+ "movl %%esp, %%edx\n\t"
+ "addl %4, %%edx\n\t"
+ "movl 0(%%edx),%%ebx\n\t" /* restore the registers saved above */
+ "movl 4(%%edx),%%esi\n\t"
+ "movl 8(%%edx),%%edi\n\t"
+ "movl 12(%%edx),%%ebp\n\t"
+
+ : "=&a" (r), "=m" (space)
+ : "m" (f), "m" (basereg), "i" (RESERVED_C_STACK_BYTES)
+ : "edx" /* stomps on %edx */
+ );
+
+ return r;
+}
+
+#endif
+
+/* ----------------------------------------------------------------------------
+ x86-64 is almost the same as plain x86.
+
+ I've done it using entirely inline assembler, because I couldn't
+ get gcc to generate the correct subtraction from %rsp by using
+ the local array variable trick. It didn't seem to reserve
+ enough space. Oh well, it's not much harder this way.
+
+ ------------------------------------------------------------------------- */
+
+#ifdef x86_64_HOST_ARCH
+
+extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+
+/* x86-64 StgRun, emitted entirely as top-level inline assembler from
+ * this dummy function (so gcc cannot interfere with the stack layout).
+ * Callee-saves rbx/rbp/r12-r15 are saved just below the reserved C
+ * stack area, BaseReg goes in %r13, and the return value comes back in
+ * R1 (%rbx).  See the long comment at the end for why the frame size
+ * includes an extra 8 bytes of alignment adjustment. */
+static void GNUC3_ATTRIBUTE(used)
+StgRunIsImplementedInAssembler(void)
+{
+ __asm__ volatile (
+ /*
+ * save callee-saves registers on behalf of the STG code.
+ */
+ ".globl StgRun\n"
+ "StgRun:\n\t"
+ "subq %0, %%rsp\n\t"
+ "movq %%rsp, %%rax\n\t"
+ "addq %0-48, %%rax\n\t"
+ "movq %%rbx,0(%%rax)\n\t"
+ "movq %%rbp,8(%%rax)\n\t"
+ "movq %%r12,16(%%rax)\n\t"
+ "movq %%r13,24(%%rax)\n\t"
+ "movq %%r14,32(%%rax)\n\t"
+ "movq %%r15,40(%%rax)\n\t"
+ /*
+ * Set BaseReg
+ */
+ "movq %%rsi,%%r13\n\t"
+ /*
+ * grab the function argument from the stack, and jump to it.
+ */
+ "movq %%rdi,%%rax\n\t"
+ "jmp *%%rax\n\t"
+
+ ".global " STG_RETURN "\n"
+ STG_RETURN ":\n\t"
+
+ "movq %%rbx, %%rax\n\t" /* Return value in R1 */
+
+ /*
+ * restore callee-saves registers. (Don't stomp on %%rax!)
+ */
+ "movq %%rsp, %%rdx\n\t"
+ "addq %0-48, %%rdx\n\t"
+ "movq 0(%%rdx),%%rbx\n\t" /* restore the registers saved above */
+ "movq 8(%%rdx),%%rbp\n\t"
+ "movq 16(%%rdx),%%r12\n\t"
+ "movq 24(%%rdx),%%r13\n\t"
+ "movq 32(%%rdx),%%r14\n\t"
+ "movq 40(%%rdx),%%r15\n\t"
+ "addq %0, %%rsp\n\t"
+ "retq"
+
+ : : "i"(RESERVED_C_STACK_BYTES+48+8 /*stack frame size*/));
+ /*
+ HACK alert!
+
+ The x86_64 ABI specifies that on a procedure call, %rsp is
+ aligned on a 16-byte boundary + 8. That is, the first
+ argument on the stack after the return address will be
+ 16-byte aligned.
+
+ Which should be fine: RESERVED_C_STACK_BYTES+48 is a multiple
+ of 16 bytes.
+
+ BUT... when we do a C-call from STG land, gcc likes to put the
+ stack alignment adjustment in the prolog. eg. if we're calling
+ a function with arguments in regs, gcc will insert 'subq $8,%rsp'
+ in the prolog, to keep %rsp aligned (the return address is 8
+ bytes, remember). The mangler throws away the prolog, so we
+ lose the stack alignment.
+
+ The hack is to add this extra 8 bytes to our %rsp adjustment
+ here, so that throughout STG code, %rsp is 16-byte aligned,
+ ready for a C-call.
+
+ A quick way to see if this is wrong is to compile this code:
+
+ main = System.Exit.exitWith ExitSuccess
+
+ And run it with +RTS -sstderr. The stats code in the RTS, in
+ particular statsPrintf(), relies on the stack alignment because
+ it saves the %xmm regs on the stack, so it'll fall over if the
+ stack isn't aligned, and calling exitWith from Haskell invokes
+ shutdownHaskellAndExit using a C call.
+
+ Future gcc releases will almost certainly break this hack...
+ */
+}
+
+#endif /* x86-64 */
+
+/* -----------------------------------------------------------------------------
+ Sparc architecture
+
+ --
+ OLD COMMENT from GHC-3.02:
+
+ We want tailjumps to be calls, because `call xxx' is the only Sparc
+ branch that allows an arbitrary label as a target. (Gcc's ``goto
+ *target'' construct ends up loading the label into a register and
+ then jumping, at the cost of two extra instructions for the 32-bit
+ load.)
+
+ When entering the threaded world, we stash our return address in a
+ known location so that \tr{%i7} is available as an extra
+ callee-saves register. Of course, we have to restore this when
+ coming out of the threaded world.
+
+ I hate this god-forsaken architecture. Since the top of the
+ reserved stack space is used for globals and the bottom is reserved
+ for outgoing arguments, we have to stick our return address
+ somewhere in the middle. Currently, I'm allowing 100 extra
+ outgoing arguments beyond the first 6. --JSM
+
+ Updated info (GHC 4.06): we don't appear to use %i7 any more, so
+ I'm not sure whether we still need to save it. Incidentally, what
+ does the last paragraph above mean when it says "the top of the
+ stack is used for globals"? What globals? --SDM
+
+ Updated info (GHC 4.08.2): not saving %i7 any more (see below).
+ -------------------------------------------------------------------------- */
+
+#ifdef sparc_HOST_ARCH
+
+/* SPARC StgRun: f() is entered by a plain C call; the asm fragment only
+ * defines the StgReturn label and declares %l0-%l7 clobbered (see the
+ * comment below for why).  The result is read back from R1 (R1.i).
+ * The old %i7-saving scheme is kept in the #if 0 blocks for reference. */
+StgRegTable *
+StgRun(StgFunPtr f, StgRegTable *basereg) {
+
+ unsigned char space[RESERVED_C_STACK_BYTES];
+#if 0
+ register void *i7 __asm__("%i7");
+ ((void **)(space))[100] = i7;
+#endif
+ f();
+ __asm__ volatile (
+ ".align 4\n"
+ ".global " STG_RETURN "\n"
+ STG_RETURN ":"
+ : : : "l0","l1","l2","l3","l4","l5","l6","l7");
+ /* we tell the C compiler that l0-l7 are clobbered on return to
+ * StgReturn, otherwise it tries to use these to save eg. the
+ * address of space[100] across the call. The correct thing
+ * to do would be to save all the callee-saves regs, but we
+ * can't be bothered to do that.
+ *
+ * The code that gcc generates for this little fragment is now
+ * terrible. We could do much better by coding it directly in
+ * assembler.
+ */
+#if 0
+ /* updated 4.08.2: we don't save %i7 in the middle of the reserved
+ * space any more, since gcc tries to save its address across the
+ * call to f(), this gets clobbered in STG land and we end up
+ * dereferencing a bogus pointer in StgReturn.
+ */
+ __asm__ volatile ("ld %1,%0"
+ : "=r" (i7) : "m" (((void **)(space))[100]));
+#endif
+ return (StgRegTable *)R1.i;
+}
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ alpha architecture
+
+ "The stack pointer (SP) must at all times denote an address that has octaword
+ alignment. (This restriction has the side effect that the in-memory portion
+ of the argument list, if any, will start on an octaword boundary.) Note that
+ the stack grows toward lower addresses. During a procedure invocation, SP
+ can never be set to a value that is higher than the value of SP at entry to
+ that procedure invocation.
+
+ "The contents of the stack, located above the portion of the argument list
+ (if any) that is passed in memory, belong to the calling procedure. Because
+ they are part of the calling procedure, they should not be read or written
+ by the called procedure, except as specified by indirect arguments or
+ language-controlled up-level references.
+
+ "The SP value might be used by the hardware when raising exceptions and
+ asynchronous interrupts. It must be assumed that the contents of the stack
+ below the current SP value and within the stack for the current thread are
+ continually and unpredictably modified, as specified in the _Alpha
+ Architecture Reference Manual_, and as a result of asynchronous software
+ actions."
+
+ -- Compaq Computer Corporation, Houston. Tru64 UNIX Calling Standard for
+ Alpha Systems, 5.1 edition, August 2000, section 3.2.1. http://www.
+ tru64unix.compaq.com/docs/base_doc/DOCUMENTATION/V51_PDF/ARH9MBTE.PDF
+ -------------------------------------------------------------------------- */
+
+#ifdef alpha_HOST_ARCH
+
+/* Alpha StgRun: spill all callee-save integer and FP registers (plus
+ * ra/gp, and s6/f8/f9 under alpha_EXTRA_CAREFUL) into volatile locals,
+ * drop the stack by RESERVED_C_STACK_BYTES, and jump through $27 (pv)
+ * to f.  The StgReturn label pops the stack again; the result is taken
+ * from $14 (real_s5) before the saved registers are restored. */
+StgRegTable *
+StgRun(StgFunPtr f, StgRegTable *basereg)
+{
+ register long real_ra __asm__("$26"); volatile long save_ra;
+ register long real_gp __asm__("$29"); volatile long save_gp;
+
+ register long real_s0 __asm__("$9" ); volatile long save_s0;
+ register long real_s1 __asm__("$10"); volatile long save_s1;
+ register long real_s2 __asm__("$11"); volatile long save_s2;
+ register long real_s3 __asm__("$12"); volatile long save_s3;
+ register long real_s4 __asm__("$13"); volatile long save_s4;
+ register long real_s5 __asm__("$14"); volatile long save_s5;
+#ifdef alpha_EXTRA_CAREFUL
+ register long real_s6 __asm__("$15"); volatile long save_s6;
+#endif
+
+ register double real_f2 __asm__("$f2"); volatile double save_f2;
+ register double real_f3 __asm__("$f3"); volatile double save_f3;
+ register double real_f4 __asm__("$f4"); volatile double save_f4;
+ register double real_f5 __asm__("$f5"); volatile double save_f5;
+ register double real_f6 __asm__("$f6"); volatile double save_f6;
+ register double real_f7 __asm__("$f7"); volatile double save_f7;
+#ifdef alpha_EXTRA_CAREFUL
+ register double real_f8 __asm__("$f8"); volatile double save_f8;
+ register double real_f9 __asm__("$f9"); volatile double save_f9;
+#endif
+
+ register StgFunPtr real_pv __asm__("$27");
+
+ StgRegTable * ret;
+
+ save_ra = real_ra;
+ save_gp = real_gp;
+
+ save_s0 = real_s0;
+ save_s1 = real_s1;
+ save_s2 = real_s2;
+ save_s3 = real_s3;
+ save_s4 = real_s4;
+ save_s5 = real_s5;
+#ifdef alpha_EXTRA_CAREFUL
+ save_s6 = real_s6;
+#endif
+
+ save_f2 = real_f2;
+ save_f3 = real_f3;
+ save_f4 = real_f4;
+ save_f5 = real_f5;
+ save_f6 = real_f6;
+ save_f7 = real_f7;
+#ifdef alpha_EXTRA_CAREFUL
+ save_f8 = real_f8;
+ save_f9 = real_f9;
+#endif
+
+ real_pv = f;
+
+ __asm__ volatile( "lda $30,-%0($30)" "\n"
+ "\t" "jmp ($27)" "\n"
+ "\t" ".align 3" "\n"
+ ".globl " STG_RETURN "\n"
+ STG_RETURN ":" "\n"
+ "\t" "lda $30,%0($30)" "\n"
+ : : "K" (RESERVED_C_STACK_BYTES));
+
+ /* R1 comes back in $14 */
+ ret = real_s5;
+
+ real_s0 = save_s0;
+ real_s1 = save_s1;
+ real_s2 = save_s2;
+ real_s3 = save_s3;
+ real_s4 = save_s4;
+ real_s5 = save_s5;
+#ifdef alpha_EXTRA_CAREFUL
+ real_s6 = save_s6;
+#endif
+
+ real_f2 = save_f2;
+ real_f3 = save_f3;
+ real_f4 = save_f4;
+ real_f5 = save_f5;
+ real_f6 = save_f6;
+ real_f7 = save_f7;
+#ifdef alpha_EXTRA_CAREFUL
+ real_f8 = save_f8;
+ real_f9 = save_f9;
+#endif
+
+ real_ra = save_ra;
+ real_gp = save_gp;
+
+ return ret;
+}
+
+#endif /* alpha_HOST_ARCH */
+
+/* -----------------------------------------------------------------------------
+ HP-PA architecture
+ -------------------------------------------------------------------------- */
+
+#ifdef hppa1_1_HOST_ARCH
+
+/* HP-PA StgRun: the first asm stores callee-save general registers
+ * r3-r18 and FP registers fr12-fr21 into 'space' before f() is called;
+ * the second asm defines the StgReturn entry, copies R1 (%r11) into the
+ * return value, and reloads all the saved registers. */
+StgRegTable *
+StgRun(StgFunPtr f, StgRegTable *basereg)
+{
+ StgChar space[RESERVED_C_STACK_BYTES+16*sizeof(long)+10*sizeof(double)];
+ StgRegTable * ret;
+
+ __asm__ volatile ("ldo %0(%%r30),%%r19\n"
+ "\tstw %%r3, 0(0,%%r19)\n"
+ "\tstw %%r4, 4(0,%%r19)\n"
+ "\tstw %%r5, 8(0,%%r19)\n"
+ "\tstw %%r6,12(0,%%r19)\n"
+ "\tstw %%r7,16(0,%%r19)\n"
+ "\tstw %%r8,20(0,%%r19)\n"
+ "\tstw %%r9,24(0,%%r19)\n"
+ "\tstw %%r10,28(0,%%r19)\n"
+ "\tstw %%r11,32(0,%%r19)\n"
+ "\tstw %%r12,36(0,%%r19)\n"
+ "\tstw %%r13,40(0,%%r19)\n"
+ "\tstw %%r14,44(0,%%r19)\n"
+ "\tstw %%r15,48(0,%%r19)\n"
+ "\tstw %%r16,52(0,%%r19)\n"
+ "\tstw %%r17,56(0,%%r19)\n"
+ "\tstw %%r18,60(0,%%r19)\n"
+ "\tldo 80(%%r19),%%r19\n"
+ "\tfstds %%fr12,-16(0,%%r19)\n"
+ "\tfstds %%fr13, -8(0,%%r19)\n"
+ "\tfstds %%fr14, 0(0,%%r19)\n"
+ "\tfstds %%fr15, 8(0,%%r19)\n"
+ "\tldo 32(%%r19),%%r19\n"
+ "\tfstds %%fr16,-16(0,%%r19)\n"
+ "\tfstds %%fr17, -8(0,%%r19)\n"
+ "\tfstds %%fr18, 0(0,%%r19)\n"
+ "\tfstds %%fr19, 8(0,%%r19)\n"
+ "\tldo 32(%%r19),%%r19\n"
+ "\tfstds %%fr20,-16(0,%%r19)\n"
+ "\tfstds %%fr21, -8(0,%%r19)\n" : :
+ "n" (-(116 * sizeof(long) + 10 * sizeof(double))) : "%r19"
+ );
+
+ f();
+
+ __asm__ volatile (".align 4\n"
+ "\t.EXPORT " STG_RETURN ",CODE\n"
+ "\t.EXPORT " STG_RETURN ",ENTRY,PRIV_LEV=3\n"
+ STG_RETURN "\n"
+ /* "\tldo %0(%%r3),%%r19\n" */
+ "\tldo %1(%%r30),%%r19\n"
+ "\tcopy %%r11, %0\n" /* save R1 */
+ "\tldw 0(0,%%r19),%%r3\n"
+ "\tldw 4(0,%%r19),%%r4\n"
+ "\tldw 8(0,%%r19),%%r5\n"
+ "\tldw 12(0,%%r19),%%r6\n"
+ "\tldw 16(0,%%r19),%%r7\n"
+ "\tldw 20(0,%%r19),%%r8\n"
+ "\tldw 24(0,%%r19),%%r9\n"
+ "\tldw 28(0,%%r19),%%r10\n"
+ "\tldw 32(0,%%r19),%%r11\n"
+ "\tldw 36(0,%%r19),%%r12\n"
+ "\tldw 40(0,%%r19),%%r13\n"
+ "\tldw 44(0,%%r19),%%r14\n"
+ "\tldw 48(0,%%r19),%%r15\n"
+ "\tldw 52(0,%%r19),%%r16\n"
+ "\tldw 56(0,%%r19),%%r17\n"
+ "\tldw 60(0,%%r19),%%r18\n"
+ "\tldo 80(%%r19),%%r19\n"
+ "\tfldds -16(0,%%r19),%%fr12\n"
+ "\tfldds -8(0,%%r19),%%fr13\n"
+ "\tfldds 0(0,%%r19),%%fr14\n"
+ "\tfldds 8(0,%%r19),%%fr15\n"
+ "\tldo 32(%%r19),%%r19\n"
+ "\tfldds -16(0,%%r19),%%fr16\n"
+ "\tfldds -8(0,%%r19),%%fr17\n"
+ "\tfldds 0(0,%%r19),%%fr18\n"
+ "\tfldds 8(0,%%r19),%%fr19\n"
+ "\tldo 32(%%r19),%%r19\n"
+ "\tfldds -16(0,%%r19),%%fr20\n"
+ "\tfldds -8(0,%%r19),%%fr21\n"
+ : "=r" (ret)
+ : "n" (-(116 * sizeof(long) + 10 * sizeof(double)))
+ : "%r19"
+ );
+
+ return ret;
+}
+
+#endif /* hppa1_1_HOST_ARCH */
+
+/* -----------------------------------------------------------------------------
+ PowerPC architecture
+
+ Everything is in assembler, so we don't have to deal with GCC...
+
+ -------------------------------------------------------------------------- */
+
+#ifdef powerpc_HOST_ARCH
+
+extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+
+#ifdef darwin_HOST_OS
+/* Darwin/PowerPC StgRun, written in assembler: saves r13-r31 with stmw
+ * (FP registers via the saveFP/restFP millicode helpers), puts BaseReg
+ * in r27, and branches to f through the count register.  StgReturn
+ * moves R1 (r14) into r3 and unwinds the frame. */
+void StgRunIsImplementedInAssembler(void)
+{
+#if HAVE_SUBSECTIONS_VIA_SYMBOLS
+ // if the toolchain supports deadstripping, we have to
+ // prevent it here (it tends to get confused here).
+ __asm__ volatile (".no_dead_strip _StgRunIsImplementedInAssembler");
+#endif
+ __asm__ volatile (
+ "\n.globl _StgRun\n"
+ "_StgRun:\n"
+ "\tmflr r0\n"
+ "\tbl saveFP # f14\n"
+ "\tstmw r13,-220(r1)\n"
+ "\tstwu r1,-%0(r1)\n"
+ "\tmr r27,r4\n" // BaseReg == r27
+ "\tmtctr r3\n"
+ "\tmr r12,r3\n"
+ "\tbctr\n"
+ ".globl _StgReturn\n"
+ "_StgReturn:\n"
+ "\tmr r3,r14\n"
+ "\tla r1,%0(r1)\n"
+ "\tlmw r13,-220(r1)\n"
+ "\tb restFP # f14\n"
+ : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
+}
+#else
+
+// This version is for PowerPC Linux.
+
+// Differences from the Darwin/Mac OS X version:
+// *) Different Assembler Syntax
+// *) Doesn't use Register Saving Helper Functions (although they exist somewhere)
+// *) We may not access positive stack offsets
+// (no "Red Zone" as in the Darwin ABI)
+// *) The Link Register is saved to a different offset in the caller's stack frame
+// (Linux: 4(r1), Darwin 8(r1))
+
+/* Linux/PowerPC StgRun: like the Darwin version above, but ELF
+ * assembler syntax, explicit stfd/lfd saves of f14-f31 instead of the
+ * saveFP/restFP helpers, and the link register saved at 4(r1) (see the
+ * ABI-difference notes above this function). */
+static void GNUC3_ATTRIBUTE(used)
+StgRunIsImplementedInAssembler(void)
+{
+ __asm__ volatile (
+ "\t.globl StgRun\n"
+ "\t.type StgRun,@function\n"
+ "StgRun:\n"
+ "\tmflr 0\n"
+ "\tstw 0,4(1)\n"
+ "\tmr 5,1\n"
+ "\tstwu 1,-%0(1)\n"
+ "\tstmw 13,-220(5)\n"
+ "\tstfd 14,-144(5)\n"
+ "\tstfd 15,-136(5)\n"
+ "\tstfd 16,-128(5)\n"
+ "\tstfd 17,-120(5)\n"
+ "\tstfd 18,-112(5)\n"
+ "\tstfd 19,-104(5)\n"
+ "\tstfd 20,-96(5)\n"
+ "\tstfd 21,-88(5)\n"
+ "\tstfd 22,-80(5)\n"
+ "\tstfd 23,-72(5)\n"
+ "\tstfd 24,-64(5)\n"
+ "\tstfd 25,-56(5)\n"
+ "\tstfd 26,-48(5)\n"
+ "\tstfd 27,-40(5)\n"
+ "\tstfd 28,-32(5)\n"
+ "\tstfd 29,-24(5)\n"
+ "\tstfd 30,-16(5)\n"
+ "\tstfd 31,-8(5)\n"
+ "\tmr 27,4\n" // BaseReg == r27
+ "\tmtctr 3\n"
+ "\tmr 12,3\n"
+ "\tbctr\n"
+ ".globl StgReturn\n"
+ "\t.type StgReturn,@function\n"
+ "StgReturn:\n"
+ "\tmr 3,14\n"
+ "\tla 5,%0(1)\n"
+ "\tlmw 13,-220(5)\n"
+ "\tlfd 14,-144(5)\n"
+ "\tlfd 15,-136(5)\n"
+ "\tlfd 16,-128(5)\n"
+ "\tlfd 17,-120(5)\n"
+ "\tlfd 18,-112(5)\n"
+ "\tlfd 19,-104(5)\n"
+ "\tlfd 20,-96(5)\n"
+ "\tlfd 21,-88(5)\n"
+ "\tlfd 22,-80(5)\n"
+ "\tlfd 23,-72(5)\n"
+ "\tlfd 24,-64(5)\n"
+ "\tlfd 25,-56(5)\n"
+ "\tlfd 26,-48(5)\n"
+ "\tlfd 27,-40(5)\n"
+ "\tlfd 28,-32(5)\n"
+ "\tlfd 29,-24(5)\n"
+ "\tlfd 30,-16(5)\n"
+ "\tlfd 31,-8(5)\n"
+ "\tmr 1,5\n"
+ "\tlwz 0,4(1)\n"
+ "\tmtlr 0\n"
+ "\tblr\n"
+ : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
+}
+#endif
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ PowerPC 64 architecture
+
+ Everything is in assembler, so we don't have to deal with GCC...
+
+ -------------------------------------------------------------------------- */
+
+#ifdef powerpc64_HOST_ARCH
+
+#ifdef linux_HOST_OS
+extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+
+static void GNUC3_ATTRIBUTE(used)
+StgRunIsImplementedInAssembler(void)
+{
+ // r0 volatile
+ // r1 stack pointer
+ // r2 toc - needs to be saved
+ // r3-r10 argument passing, volatile
+ // r11, r12 very volatile (not saved across cross-module calls)
+ // r13 thread local state (never modified, don't need to save)
+ // r14-r31 callee-save
+ /* 64-bit PowerPC ELF ABI: function symbols name 3-word function
+ * descriptors in the .opd section (entry address, TOC pointer, env),
+ * not code. The .quad directives below build those descriptors for
+ * StgRun/StgReturn; the actual code lives at .StgRun/.StgReturn.
+ * .StgRun saves LR at 16(r1), the TOC (r2), GPRs r14..r31 and FP regs
+ * f14..f31 into the new frame (addressed off r5 = old SP), then loads
+ * the callee's TOC and entry point from the descriptor in r3
+ * ("ld 2,8(3)" / "ld 3,0(3)") and branches via CTR.
+ * .StgReturn returns R1 (r14) in r3 and restores everything. */
+ __asm__ volatile (
+ ".section \".opd\",\"aw\"\n"
+ ".align 3\n"
+ ".globl StgRun\n"
+ "StgRun:\n"
+ "\t.quad\t.StgRun,.TOC.@tocbase,0\n"
+ "\t.size StgRun,24\n"
+ ".globl StgReturn\n"
+ "StgReturn:\n"
+ "\t.quad\t.StgReturn,.TOC.@tocbase,0\n"
+ "\t.size StgReturn,24\n"
+ ".previous\n"
+ ".globl .StgRun\n"
+ ".type .StgRun,@function\n"
+ ".StgRun:\n"
+ "\tmflr 0\n"
+ "\tmr 5, 1\n"
+ "\tstd 0, 16(1)\n"
+ "\tstdu 1, -%0(1)\n"
+ "\tstd 2, -296(5)\n"
+ "\tstd 14, -288(5)\n"
+ "\tstd 15, -280(5)\n"
+ "\tstd 16, -272(5)\n"
+ "\tstd 17, -264(5)\n"
+ "\tstd 18, -256(5)\n"
+ "\tstd 19, -248(5)\n"
+ "\tstd 20, -240(5)\n"
+ "\tstd 21, -232(5)\n"
+ "\tstd 22, -224(5)\n"
+ "\tstd 23, -216(5)\n"
+ "\tstd 24, -208(5)\n"
+ "\tstd 25, -200(5)\n"
+ "\tstd 26, -192(5)\n"
+ "\tstd 27, -184(5)\n"
+ "\tstd 28, -176(5)\n"
+ "\tstd 29, -168(5)\n"
+ "\tstd 30, -160(5)\n"
+ "\tstd 31, -152(5)\n"
+ "\tstfd 14, -144(5)\n"
+ "\tstfd 15, -136(5)\n"
+ "\tstfd 16, -128(5)\n"
+ "\tstfd 17, -120(5)\n"
+ "\tstfd 18, -112(5)\n"
+ "\tstfd 19, -104(5)\n"
+ "\tstfd 20, -96(5)\n"
+ "\tstfd 21, -88(5)\n"
+ "\tstfd 22, -80(5)\n"
+ "\tstfd 23, -72(5)\n"
+ "\tstfd 24, -64(5)\n"
+ "\tstfd 25, -56(5)\n"
+ "\tstfd 26, -48(5)\n"
+ "\tstfd 27, -40(5)\n"
+ "\tstfd 28, -32(5)\n"
+ "\tstfd 29, -24(5)\n"
+ "\tstfd 30, -16(5)\n"
+ "\tstfd 31, -8(5)\n"
+ "\tmr 27, 4\n" // BaseReg == r27
+ "\tld 2, 8(3)\n"
+ "\tld 3, 0(3)\n"
+ "\tmtctr 3\n"
+ "\tbctr\n"
+ ".globl .StgReturn\n"
+ ".type .StgReturn,@function\n"
+ ".StgReturn:\n"
+ "\tmr 3,14\n"
+ "\tla 5, %0(1)\n" // load address == addi r5, r1, %0
+ "\tld 2, -296(5)\n"
+ "\tld 14, -288(5)\n"
+ "\tld 15, -280(5)\n"
+ "\tld 16, -272(5)\n"
+ "\tld 17, -264(5)\n"
+ "\tld 18, -256(5)\n"
+ "\tld 19, -248(5)\n"
+ "\tld 20, -240(5)\n"
+ "\tld 21, -232(5)\n"
+ "\tld 22, -224(5)\n"
+ "\tld 23, -216(5)\n"
+ "\tld 24, -208(5)\n"
+ "\tld 25, -200(5)\n"
+ "\tld 26, -192(5)\n"
+ "\tld 27, -184(5)\n"
+ "\tld 28, -176(5)\n"
+ "\tld 29, -168(5)\n"
+ "\tld 30, -160(5)\n"
+ "\tld 31, -152(5)\n"
+ "\tlfd 14, -144(5)\n"
+ "\tlfd 15, -136(5)\n"
+ "\tlfd 16, -128(5)\n"
+ "\tlfd 17, -120(5)\n"
+ "\tlfd 18, -112(5)\n"
+ "\tlfd 19, -104(5)\n"
+ "\tlfd 20, -96(5)\n"
+ "\tlfd 21, -88(5)\n"
+ "\tlfd 22, -80(5)\n"
+ "\tlfd 23, -72(5)\n"
+ "\tlfd 24, -64(5)\n"
+ "\tlfd 25, -56(5)\n"
+ "\tlfd 26, -48(5)\n"
+ "\tlfd 27, -40(5)\n"
+ "\tlfd 28, -32(5)\n"
+ "\tlfd 29, -24(5)\n"
+ "\tlfd 30, -16(5)\n"
+ "\tlfd 31, -8(5)\n"
+ "\tmr 1, 5\n"
+ "\tld 0, 16(1)\n"
+ "\tmtlr 0\n"
+ "\tblr\n"
+ : : "i"(RESERVED_C_STACK_BYTES+304 /*stack frame size*/));
+}
+#else // linux_HOST_OS
+#error Only linux support for power64 right now.
+#endif
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ IA64 architecture
+
+ Again, in assembler - so we can fiddle with the register stack, and because
+ gcc doesn't handle asm-clobbered callee-saves correctly.
+
+ loc0 - loc15: preserved locals
+ loc16 - loc28: STG registers
+ loc29: saved ar.pfs
+ loc30: saved b0
+ loc31: saved gp (gcc 3.3 uses this slot)
+ -------------------------------------------------------------------------- */
+
+#ifdef ia64_HOST_ARCH
+
+/* the memory stack is rarely used, so 16K is excessive */
+#undef RESERVED_C_STACK_BYTES
+#define RESERVED_C_STACK_BYTES 1024
+
+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
+/* gcc 3.3+: leave an extra slot for gp saves */
+#define LOCALS 32
+#else
+#define LOCALS 31
+#endif
+
+static void GNUC3_ATTRIBUTE(used)
+StgRunIsImplementedInAssembler(void)
+{
+ /* IA64 StgRun/StgReturn. "alloc" creates a register frame with %1
+ * (LOCALS) locals and 8 outputs, saving ar.pfs in loc29; the target
+ * is an IA64 function descriptor in r32, from which we load the entry
+ * address (r18) and its gp. Only the callee-save FP regs f16..f21 are
+ * spilled/filled explicitly (two interleaved pointers r16/r17, 32-byte
+ * stride); the integer callee-saves live in the stacked locals, which
+ * the register stack engine preserves for us. b0 (return address) is
+ * kept in loc30. StgReturn places the STG return value (loc16) in r8.
+ */
+ __asm__ volatile(
+ ".global StgRun\n"
+ "StgRun:\n"
+ "\talloc loc29 = ar.pfs, 0, %1, 8, 0\n" /* setup register frame */
+ "\tld8 r18 = [r32],8\n" /* get procedure address */
+ "\tadds sp = -%0, sp ;;\n" /* setup stack */
+ "\tld8 gp = [r32]\n" /* get procedure GP */
+ "\tadds r16 = %0-(6*16), sp\n"
+ "\tadds r17 = %0-(5*16), sp ;;\n"
+ "\tstf.spill [r16] = f16,32\n" /* spill callee-saved fp regs */
+ "\tstf.spill [r17] = f17,32\n"
+ "\tmov b6 = r18 ;;\n" /* set target address */
+ "\tstf.spill [r16] = f18,32\n"
+ "\tstf.spill [r17] = f19,32\n"
+ "\tmov loc30 = b0 ;;\n" /* save return address */
+ "\tstf.spill [r16] = f20,32\n"
+ "\tstf.spill [r17] = f21,32\n"
+ "\tbr.few b6 ;;\n" /* branch to function */
+ ".global StgReturn\n"
+ "StgReturn:\n"
+ "\tmov r8 = loc16\n" /* return value in r8 */
+ "\tadds r16 = %0-(6*16), sp\n"
+ "\tadds r17 = %0-(5*16), sp ;;\n"
+ "\tldf.fill f16 = [r16],32\n" /* start restoring fp regs */
+ "\tldf.fill f17 = [r17],32\n"
+ "\tmov ar.pfs = loc29 ;;\n" /* restore register frame */
+ "\tldf.fill f18 = [r16],32\n"
+ "\tldf.fill f19 = [r17],32\n"
+ "\tmov b0 = loc30 ;;\n" /* restore return address */
+ "\tldf.fill f20 = [r16],32\n"
+ "\tldf.fill f21 = [r17],32\n"
+ "\tadds sp = %0, sp\n" /* restore stack */
+ "\tbr.ret.sptk.many b0 ;;\n" /* return */
+ : : "i"(RESERVED_C_STACK_BYTES + 6*16), "i"(LOCALS));
+}
+
+#endif
+
+#endif /* !USE_MINIINTERPRETER */
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
new file mode 100644
index 0000000000..70d08aeb0e
--- /dev/null
+++ b/rts/StgMiscClosures.cmm
@@ -0,0 +1,953 @@
+/* ----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Entry code for various built-in closure types.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * --------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/* ----------------------------------------------------------------------------
+ Support for the bytecode interpreter.
+ ------------------------------------------------------------------------- */
+
+/* 9 bits of return code for constructors created by the interpreter. */
+stg_interp_constr_entry
+{
+ /* R1 points at the constructor */
+ /* Direct (non-vectored) return: jump to the return address on the
+ * top of the stack. */
+ jump %ENTRY_CODE(Sp(0));
+}
+
+/* Vectored returns: a constructor with tag N returns through slot N-1
+ * of the return vector of the frame on top of the stack.
+ * NOTE(review): the tag -> slot mapping is inferred from the entry
+ * names; confirm against the byte-code generator. */
+stg_interp_constr1_entry { jump %RET_VEC(Sp(0),0); }
+stg_interp_constr2_entry { jump %RET_VEC(Sp(0),1); }
+stg_interp_constr3_entry { jump %RET_VEC(Sp(0),2); }
+stg_interp_constr4_entry { jump %RET_VEC(Sp(0),3); }
+stg_interp_constr5_entry { jump %RET_VEC(Sp(0),4); }
+stg_interp_constr6_entry { jump %RET_VEC(Sp(0),5); }
+stg_interp_constr7_entry { jump %RET_VEC(Sp(0),6); }
+stg_interp_constr8_entry { jump %RET_VEC(Sp(0),7); }
+
+/* Some info tables to be used when compiled code returns a value to
+ the interpreter, i.e. the interpreter pushes one of these onto the
+ stack before entering a value. What the code does is to
+ impedance-match the compiled return convention (in R1p/R1n/F1/D1 etc) to
+ the interpreter's convention (returned value is on top of stack),
+ and then cause the scheduler to enter the interpreter.
+
+ On entry, the stack (growing down) looks like this:
+
+ ptr to BCO holding return continuation
+ ptr to one of these info tables.
+
+ The info table code, both direct and vectored, must:
+ * push R1/F1/D1 on the stack, and its tag if necessary
+ * push the BCO (so it's now on the stack twice)
+ * Yield, ie, go to the scheduler.
+
+ Scheduler examines the t.o.s, discovers it is a BCO, and proceeds
+ directly to the bytecode interpreter. That pops the top element
+ (the BCO, containing the return continuation), and interprets it.
+ Net result: return continuation gets interpreted, with the
+ following stack:
+
+ ptr to this BCO
+ ptr to the info table just jumped thru
+ return value
+
+ which is just what we want -- the "standard" return layout for the
+ interpreter. Hurrah!
+
+ Don't ask me how unboxed tuple returns are supposed to work. We
+ haven't got a good story about that yet.
+*/
+
+/* Returned value is a lifted pointer in R1. The direct entry and all
+ * 8 vectored entries deliberately point at the same code (the repeated
+ * RET_LBL fields below): whatever the return convention, we push R1
+ * plus stg_enter_info and hand control back to the scheduler, which
+ * re-enters the interpreter. */
+INFO_TABLE_RET( stg_ctoi_R1p,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO,
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p))
+{
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_enter_info;
+ jump stg_yield_to_interpreter;
+}
+
+#if MAX_VECTORED_RTN != 8
+#error MAX_VECTORED_RTN has changed: please modify stg_ctoi_R1p too.
+#endif
+
+/*
+ * When the returned value is a pointer, but unlifted, in R1 ...
+ */
+INFO_TABLE_RET( stg_ctoi_R1unpt,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_gc_unpt_r1_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * When the returned value is a non-pointer in R1 ...
+ */
+INFO_TABLE_RET( stg_ctoi_R1n,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_gc_unbx_r1_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * When the returned value is in F1
+ */
+INFO_TABLE_RET( stg_ctoi_F1,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ Sp_adj(-2);
+ F_[Sp + WDS(1)] = F1;
+ Sp(0) = stg_gc_f1_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * When the returned value is in D1
+ */
+INFO_TABLE_RET( stg_ctoi_D1,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ Sp_adj(-1) - SIZEOF_DOUBLE;
+ D_[Sp + WDS(1)] = D1;
+ Sp(0) = stg_gc_d1_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * When the returned value is in L1
+ */
+INFO_TABLE_RET( stg_ctoi_L1,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ Sp_adj(-1) - 8;
+ L_[Sp + WDS(1)] = L1;
+ Sp(0) = stg_gc_l1_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * When the returned value is a void
+ */
+INFO_TABLE_RET( stg_ctoi_V,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ Sp_adj(-1);
+ Sp(0) = stg_gc_void_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * Dummy info table pushed on the top of the stack when the interpreter
+ * should apply the BCO on the stack to its arguments, also on the
+ * stack.
+ */
+INFO_TABLE_RET( stg_apply_interp,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ /* Just in case we end up in here... (we shouldn't) */
+ jump stg_yield_to_interpreter;
+}
+
+/* ----------------------------------------------------------------------------
+ Entry code for a BCO
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE_FUN( stg_BCO, 4, 0, BCO, "BCO", "BCO", ARG_BCO )
+{
+ /* entering a BCO means "apply it", same as a function */
+ /* push the BCO itself plus the stg_apply_interp frame, then let the
+ * scheduler hand control to the bytecode interpreter */
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_apply_interp_info;
+ jump stg_yield_to_interpreter;
+}
+
+/* ----------------------------------------------------------------------------
+ Info tables for indirections.
+
+ SPECIALISED INDIRECTIONS: we have a specialised indirection for each
+ kind of return (direct, vectored 0-7), so that we can avoid entering
+ the object when we know what kind of return it will do. The update
+ code (Updates.hc) updates objects with the appropriate kind of
+ indirection. We only do this for young-gen indirections.
+ ------------------------------------------------------------------------- */
+
+/* Generic indirection: entering it enters the indirectee. */
+INFO_TABLE(stg_IND,1,0,IND,"IND","IND")
+{
+ TICK_ENT_DYN_IND(); /* tick */
+ R1 = StgInd_indirectee(R1);
+ TICK_ENT_VIA_NODE();
+ jump %GET_ENTRY(R1);
+}
+
+/* Specialised indirection: follow the indirection and then RETURN
+ * directly through the given return convention (direct entry or a
+ * return-vector slot), instead of entering the indirectee. */
+#define IND_SPEC(label,ret) \
+INFO_TABLE(label,1,0,IND,"IND","IND") \
+{ \
+ TICK_ENT_DYN_IND(); /* tick */ \
+ R1 = StgInd_indirectee(R1); \
+ TICK_ENT_VIA_NODE(); \
+ jump ret; \
+}
+
+IND_SPEC(stg_IND_direct, %ENTRY_CODE(Sp(0)))
+IND_SPEC(stg_IND_0, %RET_VEC(Sp(0),0))
+IND_SPEC(stg_IND_1, %RET_VEC(Sp(0),1))
+IND_SPEC(stg_IND_2, %RET_VEC(Sp(0),2))
+IND_SPEC(stg_IND_3, %RET_VEC(Sp(0),3))
+IND_SPEC(stg_IND_4, %RET_VEC(Sp(0),4))
+IND_SPEC(stg_IND_5, %RET_VEC(Sp(0),5))
+IND_SPEC(stg_IND_6, %RET_VEC(Sp(0),6))
+IND_SPEC(stg_IND_7, %RET_VEC(Sp(0),7))
+
+/* Same as stg_IND but with the IND_STATIC closure type (and a static
+ * indirection tick). */
+INFO_TABLE(stg_IND_STATIC,1,0,IND_STATIC,"IND_STATIC","IND_STATIC")
+{
+ TICK_ENT_STATIC_IND(); /* tick */
+ R1 = StgInd_indirectee(R1);
+ TICK_ENT_VIA_NODE();
+ jump %GET_ENTRY(R1);
+}
+
+/* "Permanent" indirection, used by profiling/ticky builds: behaves
+ * like stg_IND but also enters the cost centre and does LDV/ticky
+ * book-keeping. */
+INFO_TABLE(stg_IND_PERM,1,0,IND_PERM,"IND_PERM","IND_PERM")
+{
+ /* Don't add INDs to granularity cost */
+
+ /* Don't: TICK_ENT_STATIC_IND(Node); for ticky-ticky; this ind is
+ here only to help profiling */
+
+#if defined(TICKY_TICKY) && !defined(PROFILING)
+ /* TICKY_TICKY && !PROFILING means PERM_IND *replaces* an IND, rather than
+ being extra */
+ TICK_ENT_PERM_IND();
+#endif
+
+ LDV_ENTER(R1);
+
+ /* Enter PAP cost centre */
+ ENTER_CCS_PAP_CL(R1);
+
+ /* For ticky-ticky, change the perm_ind to a normal ind on first
+ * entry, so the number of ent_perm_inds is the number of *thunks*
+ * entered again, not the number of subsequent entries.
+ *
+ * Since this screws up cost centres, we die if profiling and
+ * ticky_ticky are on at the same time. KSW 1999-01.
+ */
+#ifdef TICKY_TICKY
+# ifdef PROFILING
+# error Profiling and ticky-ticky do not mix at present!
+# endif /* PROFILING */
+ StgHeader_info(R1) = stg_IND_info;
+#endif /* TICKY_TICKY */
+
+ R1 = StgInd_indirectee(R1);
+
+#if defined(TICKY_TICKY) && !defined(PROFILING)
+ TICK_ENT_VIA_NODE();
+#endif
+
+ jump %GET_ENTRY(R1);
+}
+
+
+/* Old-generation indirection: code identical to stg_IND_STATIC, but a
+ * distinct closure type so the GC can treat old-gen indirections
+ * specially. */
+INFO_TABLE(stg_IND_OLDGEN,1,0,IND_OLDGEN,"IND_OLDGEN","IND_OLDGEN")
+{
+ TICK_ENT_STATIC_IND(); /* tick */
+ R1 = StgInd_indirectee(R1);
+ TICK_ENT_VIA_NODE();
+ jump %GET_ENTRY(R1);
+}
+
+/* Old-generation variant of stg_IND_PERM (see comments there). */
+INFO_TABLE(stg_IND_OLDGEN_PERM,1,0,IND_OLDGEN_PERM,"IND_OLDGEN_PERM","IND_OLDGEN_PERM")
+{
+ /* Don't: TICK_ENT_STATIC_IND(Node); for ticky-ticky;
+ this ind is here only to help profiling */
+
+#if defined(TICKY_TICKY) && !defined(PROFILING)
+ /* TICKY_TICKY && !PROFILING means PERM_IND *replaces* an IND,
+ rather than being extra */
+ TICK_ENT_PERM_IND(R1); /* tick */
+#endif
+
+ LDV_ENTER(R1);
+
+ /* Enter PAP cost centre -- lexical scoping only */
+ ENTER_CCS_PAP_CL(R1);
+
+ /* see comment in IND_PERM */
+#ifdef TICKY_TICKY
+# ifdef PROFILING
+# error Profiling and ticky-ticky do not mix at present!
+# endif /* PROFILING */
+ StgHeader_info(R1) = stg_IND_OLDGEN_info;
+#endif /* TICKY_TICKY */
+
+ R1 = StgInd_indirectee(R1);
+
+ TICK_ENT_VIA_NODE();
+ jump %GET_ENTRY(R1);
+}
+
+/* ----------------------------------------------------------------------------
+ Black holes.
+
+ Entering a black hole normally causes a cyclic data dependency, but
+ in the concurrent world, black holes are synchronization points,
+ and they are turned into blocking queues when there are threads
+ waiting for the evaluation of the closure to finish.
+ ------------------------------------------------------------------------- */
+
+/* Note: a BLACKHOLE must be big enough to be
+ * overwritten with an indirection/evacuee/catch. Thus we claim it
+ * has 1 non-pointer word of payload.
+ */
+INFO_TABLE(stg_BLACKHOLE,0,1,BLACKHOLE,"BLACKHOLE","BLACKHOLE")
+{
+#if defined(GRAN)
+ /* Before overwriting TSO_LINK */
+ STGCALL3(GranSimBlock,CurrentTSO,CurrentProc,(StgClosure *)R1 /*Node*/);
+#endif
+
+ TICK_ENT_BH();
+
+#ifdef THREADED_RTS
+ // foreign "C" debugBelch("BLACKHOLE entry\n");
+#endif
+
+ /* Actually this is not necessary because R1 is about to be destroyed. */
+ LDV_ENTER(R1);
+
+#if defined(THREADED_RTS)
+ /* sched_mutex protects the global blackhole_queue updated below;
+ * it is released in stg_block_blackhole_finally. */
+ foreign "C" ACQUIRE_LOCK(sched_mutex "ptr");
+ // released in stg_block_blackhole_finally
+#endif
+
+ /* Put ourselves on the blackhole queue */
+ StgTSO_link(CurrentTSO) = W_[blackhole_queue];
+ W_[blackhole_queue] = CurrentTSO;
+
+ /* jot down why and on what closure we are blocked */
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnBlackHole::I16;
+ StgTSO_block_info(CurrentTSO) = R1;
+
+ jump stg_block_blackhole;
+}
+
+#if defined(PAR) || defined(GRAN)
+
+/* Revertible black hole (parallel/GranSim builds only): blocks the
+ * current TSO on the closure's own blocking queue. */
+INFO_TABLE(stg_RBH,1,1,RBH,"RBH","RBH")
+{
+# if defined(GRAN)
+ /* mainly statistics gathering for GranSim simulation */
+ STGCALL3(GranSimBlock,CurrentTSO,CurrentProc,(StgClosure *)R1 /*Node*/);
+# endif
+
+ /* exactly the same as a BLACKHOLE_BQ_entry -- HWL */
+ /* Put ourselves on the blocking queue for this black hole */
+ TSO_link(CurrentTSO) = StgBlockingQueue_blocking_queue(R1);
+ StgBlockingQueue_blocking_queue(R1) = CurrentTSO;
+ /* jot down why and on what closure we are blocked */
+ TSO_why_blocked(CurrentTSO) = BlockedOnBlackHole::I16;
+ TSO_block_info(CurrentTSO) = R1;
+
+ /* PAR: dumping of event now done in blockThread -- HWL */
+
+ /* stg_gen_block is too heavyweight, use a specialised one */
+ jump stg_block_1;
+}
+
+/* RBH_Save_* closures appear to preserve the payload words clobbered
+ * when a closure is turned into an RBH (NOTE(review): inferred from the
+ * names and the ptr/nptr layouts 0/2, 1/1, 2/0 -- confirm). They must
+ * never be entered.
+ *
+ * Bug fix: stg_RBH_Save_1 and stg_RBH_Save_2 had a stray ';' after the
+ * INFO_TABLE(...) declaration, which detached the barf entry-code body
+ * from the info table (compare stg_RBH_Save_0, which was correct). */
+INFO_TABLE(stg_RBH_Save_0,0,2,CONSTR,"RBH_Save_0","RBH_Save_0")
+{ foreign "C" barf("RBH_Save_0 object entered!"); }
+
+INFO_TABLE(stg_RBH_Save_1,1,1,CONSTR,"RBH_Save_1","RBH_Save_1")
+{ foreign "C" barf("RBH_Save_1 object entered!"); }
+
+INFO_TABLE(stg_RBH_Save_2,2,0,CONSTR,"RBH_Save_2","RBH_Save_2")
+{ foreign "C" barf("RBH_Save_2 object entered!"); }
+
+#endif /* defined(PAR) || defined(GRAN) */
+
+/* identical to BLACKHOLEs except for the infotag */
+INFO_TABLE(stg_CAF_BLACKHOLE,0,1,CAF_BLACKHOLE,"CAF_BLACKHOLE","CAF_BLACKHOLE")
+{
+#if defined(GRAN)
+ /* mainly statistics gathering for GranSim simulation */
+ STGCALL3(GranSimBlock,CurrentTSO,CurrentProc,(StgClosure *)R1 /*Node*/);
+#endif
+
+ TICK_ENT_BH();
+ LDV_ENTER(R1);
+
+#if defined(THREADED_RTS)
+ // foreign "C" debugBelch("BLACKHOLE entry\n");
+#endif
+
+#if defined(THREADED_RTS)
+ /* protects blackhole_queue; released in stg_block_blackhole_finally */
+ foreign "C" ACQUIRE_LOCK(sched_mutex "ptr");
+ // released in stg_block_blackhole_finally
+#endif
+
+ /* Put ourselves on the blackhole queue */
+ StgTSO_link(CurrentTSO) = W_[blackhole_queue];
+ W_[blackhole_queue] = CurrentTSO;
+
+ /* jot down why and on what closure we are blocked */
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnBlackHole::I16;
+ StgTSO_block_info(CurrentTSO) = R1;
+
+ jump stg_block_blackhole;
+}
+
+#ifdef EAGER_BLACKHOLING
+/* "Speculatively evaluated" black holes (eager-blackholing builds).
+ * These should never be entered -- entering one is a bug. */
+INFO_TABLE(stg_SE_BLACKHOLE,0,1,SE_BLACKHOLE,"SE_BLACKHOLE","SE_BLACKHOLE")
+{ foreign "C" barf("SE_BLACKHOLE object entered!"); }
+
+INFO_TABLE(stg_SE_CAF_BLACKHOLE,0,1,SE_CAF_BLACKHOLE,"SE_CAF_BLACKHOLE","SE_CAF_BLACKHOLE")
+{ foreign "C" barf("SE_CAF_BLACKHOLE object entered!"); }
+#endif
+
+/* ----------------------------------------------------------------------------
+ Whiteholes are used for the "locked" state of a closure (see lockClosure())
+
+ The closure type is BLACKHOLE, just because we need a valid closure type
+ for sanity checking.
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_WHITEHOLE, 0,0, BLACKHOLE, "WHITEHOLE", "WHITEHOLE")
+{ foreign "C" barf("WHITEHOLE object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Some static info tables for things that don't get entered, and
+ therefore don't need entry code (i.e. boxed but unpointed objects)
+ NON_ENTERABLE_ENTRY_CODE now defined at the beginning of the file
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_TSO, 0,0,TSO, "TSO", "TSO")
+{ foreign "C" barf("TSO object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Evacuees are left behind by the garbage collector. Any attempt to enter
+ one is a real bug.
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_EVACUATED,1,0,EVACUATED,"EVACUATED","EVACUATED")
+{ foreign "C" barf("EVACUATED object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Weak pointers
+
+ Live weak pointers have a special closure type. Dead ones are just
+ nullary constructors (although they live on the heap - we overwrite
+ live weak pointers with dead ones).
+ ------------------------------------------------------------------------- */
+
+/* 0 ptrs / 4 non-ptrs: the payload is traversed specially by the GC,
+ * not as ordinary pointer fields -- NOTE(review): inferred from the
+ * 0-pointer layout; confirm against the GC's WEAK handling. */
+INFO_TABLE(stg_WEAK,0,4,WEAK,"WEAK","WEAK")
+{ foreign "C" barf("WEAK object entered!"); }
+
+/*
+ * It's important when turning an existing WEAK into a DEAD_WEAK
+ * (which is what finalizeWeak# does) that we don't lose the link
+ * field and break the linked list of weak pointers. Hence, we give
+ * DEAD_WEAK 4 non-pointer fields, the same as WEAK.
+ */
+INFO_TABLE_CONSTR(stg_DEAD_WEAK,0,4,0,CONSTR,"DEAD_WEAK","DEAD_WEAK")
+{ foreign "C" barf("DEAD_WEAK object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ NO_FINALIZER
+
+ This is a static nullary constructor (like []) that we use to mark an empty
+ finalizer in a weak pointer object.
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE_CONSTR(stg_NO_FINALIZER,0,0,0,CONSTR_NOCAF_STATIC,"NO_FINALIZER","NO_FINALIZER")
+{ foreign "C" barf("NO_FINALIZER object entered!"); }
+
+CLOSURE(stg_NO_FINALIZER_closure,stg_NO_FINALIZER);
+
+/* ----------------------------------------------------------------------------
+ Stable Names are unlifted too.
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_STABLE_NAME,0,1,STABLE_NAME,"STABLE_NAME","STABLE_NAME")
+{ foreign "C" barf("STABLE_NAME object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ MVars
+
+ There are two kinds of these: full and empty. We need an info table
+ and entry code for each type.
+ ------------------------------------------------------------------------- */
+
+/* Full and empty MVars share the MVAR closure type; they are
+ * distinguished only by which of these two info tables is installed. */
+INFO_TABLE(stg_FULL_MVAR,3,0,MVAR,"MVAR","MVAR")
+{ foreign "C" barf("FULL_MVAR object entered!"); }
+
+INFO_TABLE(stg_EMPTY_MVAR,3,0,MVAR,"MVAR","MVAR")
+{ foreign "C" barf("EMPTY_MVAR object entered!"); }
+
+/* -----------------------------------------------------------------------------
+ STM
+
+ Info tables for the STM heap objects; all are unpointed (never
+ entered), with static end-of-list sentinel closures below.
+ -------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_TVAR, 0, 0, TVAR, "TVAR", "TVAR")
+{ foreign "C" barf("TVAR object entered!"); }
+
+INFO_TABLE(stg_TVAR_WAIT_QUEUE, 0, 0, TVAR_WAIT_QUEUE, "TVAR_WAIT_QUEUE", "TVAR_WAIT_QUEUE")
+{ foreign "C" barf("TVAR_WAIT_QUEUE object entered!"); }
+
+INFO_TABLE(stg_TREC_CHUNK, 0, 0, TREC_CHUNK, "TREC_CHUNK", "TREC_CHUNK")
+{ foreign "C" barf("TREC_CHUNK object entered!"); }
+
+INFO_TABLE(stg_TREC_HEADER, 0, 0, TREC_HEADER, "TREC_HEADER", "TREC_HEADER")
+{ foreign "C" barf("TREC_HEADER object entered!"); }
+
+INFO_TABLE_CONSTR(stg_END_STM_WAIT_QUEUE,0,0,0,CONSTR_NOCAF_STATIC,"END_STM_WAIT_QUEUE","END_STM_WAIT_QUEUE")
+{ foreign "C" barf("END_STM_WAIT_QUEUE object entered!"); }
+
+INFO_TABLE_CONSTR(stg_END_STM_CHUNK_LIST,0,0,0,CONSTR_NOCAF_STATIC,"END_STM_CHUNK_LIST","END_STM_CHUNK_LIST")
+{ foreign "C" barf("END_STM_CHUNK_LIST object entered!"); }
+
+INFO_TABLE_CONSTR(stg_NO_TREC,0,0,0,CONSTR_NOCAF_STATIC,"NO_TREC","NO_TREC")
+{ foreign "C" barf("NO_TREC object entered!"); }
+
+CLOSURE(stg_END_STM_WAIT_QUEUE_closure,stg_END_STM_WAIT_QUEUE);
+
+CLOSURE(stg_END_STM_CHUNK_LIST_closure,stg_END_STM_CHUNK_LIST);
+
+CLOSURE(stg_NO_TREC_closure,stg_NO_TREC);
+
+/* ----------------------------------------------------------------------------
+ END_TSO_QUEUE
+
+ This is a static nullary constructor (like []) that we use to mark the
+ end of a linked TSO queue.
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE_CONSTR(stg_END_TSO_QUEUE,0,0,0,CONSTR_NOCAF_STATIC,"END_TSO_QUEUE","END_TSO_QUEUE")
+{ foreign "C" barf("END_TSO_QUEUE object entered!"); }
+
+CLOSURE(stg_END_TSO_QUEUE_closure,stg_END_TSO_QUEUE);
+
+/* ----------------------------------------------------------------------------
+ Exception lists
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE_CONSTR(stg_END_EXCEPTION_LIST,0,0,0,CONSTR_NOCAF_STATIC,"END_EXCEPTION_LIST","END_EXCEPTION_LIST")
+{ foreign "C" barf("END_EXCEPTION_LIST object entered!"); }
+
+CLOSURE(stg_END_EXCEPTION_LIST_closure,stg_END_EXCEPTION_LIST);
+
+INFO_TABLE(stg_EXCEPTION_CONS,1,1,CONSTR,"EXCEPTION_CONS","EXCEPTION_CONS")
+{ foreign "C" barf("EXCEPTION_CONS object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Arrays
+
+ These come in two basic flavours: arrays of data (StgArrWords) and arrays of
+ pointers (StgArrPtrs). They all have a similar layout:
+
+ ___________________________
+ | Info | No. of | data....
+ | Ptr | Words |
+ ---------------------------
+
+ These are *unpointed* objects: i.e. they cannot be entered.
+
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_ARR_WORDS, 0, 0, ARR_WORDS, "ARR_WORDS", "ARR_WORDS")
+{ foreign "C" barf("ARR_WORDS object entered!"); }
+
+/* CLEAN/DIRTY variants support the generational write barrier: DIRTY
+ * marks an object that may contain pointers into a younger generation.
+ * NOTE(review): inferred from the names; confirm against the GC. */
+INFO_TABLE(stg_MUT_ARR_PTRS_CLEAN, 0, 0, MUT_ARR_PTRS_CLEAN, "MUT_ARR_PTRS_CLEAN", "MUT_ARR_PTRS_CLEAN")
+{ foreign "C" barf("MUT_ARR_PTRS_CLEAN object entered!"); }
+
+INFO_TABLE(stg_MUT_ARR_PTRS_DIRTY, 0, 0, MUT_ARR_PTRS_DIRTY, "MUT_ARR_PTRS_DIRTY", "MUT_ARR_PTRS_DIRTY")
+{ foreign "C" barf("MUT_ARR_PTRS_DIRTY object entered!"); }
+
+INFO_TABLE(stg_MUT_ARR_PTRS_FROZEN, 0, 0, MUT_ARR_PTRS_FROZEN, "MUT_ARR_PTRS_FROZEN", "MUT_ARR_PTRS_FROZEN")
+{ foreign "C" barf("MUT_ARR_PTRS_FROZEN object entered!"); }
+
+INFO_TABLE(stg_MUT_ARR_PTRS_FROZEN0, 0, 0, MUT_ARR_PTRS_FROZEN0, "MUT_ARR_PTRS_FROZEN0", "MUT_ARR_PTRS_FROZEN0")
+{ foreign "C" barf("MUT_ARR_PTRS_FROZEN0 object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Mutable Variables
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_MUT_VAR_CLEAN, 1, 0, MUT_VAR_CLEAN, "MUT_VAR_CLEAN", "MUT_VAR_CLEAN")
+{ foreign "C" barf("MUT_VAR_CLEAN object entered!"); }
+INFO_TABLE(stg_MUT_VAR_DIRTY, 1, 0, MUT_VAR_DIRTY, "MUT_VAR_DIRTY", "MUT_VAR_DIRTY")
+{ foreign "C" barf("MUT_VAR_DIRTY object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Dummy return closure
+
+ Entering this closure will just return to the address on the top of the
+ stack. Useful for getting a thread in a canonical form where we can
+ just enter the top stack word to start the thread. (see deleteThread)
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE( stg_dummy_ret, 0, 0, CONSTR_NOCAF_STATIC, "DUMMY_RET", "DUMMY_RET")
+{
+ jump %ENTRY_CODE(Sp(0));
+}
+CLOSURE(stg_dummy_ret_closure,stg_dummy_ret);
+
+/* ----------------------------------------------------------------------------
+ CHARLIKE and INTLIKE closures.
+
+ These are static representations of Chars and small Ints, so that
+ we can remove dynamic Chars and Ints during garbage collection and
+ replace them with references to the static objects.
+ ------------------------------------------------------------------------- */
+
+#if defined(ENABLE_WIN32_DLL_SUPPORT)
+/*
+ * When sticking the RTS in a DLL, we delay populating the
+ * Charlike and Intlike tables until load-time, which is only
+ * when we've got the real addresses to the C# and I# closures.
+ *
+ */
+static INFO_TBL_CONST StgInfoTable czh_static_info;
+static INFO_TBL_CONST StgInfoTable izh_static_info;
+#define Char_hash_static_info czh_static_info
+#define Int_hash_static_info izh_static_info
+#else
+/* Non-DLL case: use the info tables of GHC.Base's C# and I#
+ * constructors directly (z-encoded names). */
+#define Char_hash_static_info GHCziBase_Czh_static
+#define Int_hash_static_info GHCziBase_Izh_static
+#endif
+
+
+/* Each entry is one static C#/I# closure: info pointer + payload n. */
+#define CHARLIKE_HDR(n) CLOSURE(Char_hash_static_info, n)
+#define INTLIKE_HDR(n) CLOSURE(Int_hash_static_info, n)
+
+/* put these in the *data* section, since the garbage collector relies
+ * on the fact that static closures live in the data section.
+ */
+
+/* end the name with _closure, to convince the mangler this is a closure */
+
+/* One static C# closure for every character code 0..255; the GC can
+ * replace a dynamic C# whose payload is in this range with a pointer
+ * into this table. */
+section "data" {
+ stg_CHARLIKE_closure:
+ CHARLIKE_HDR(0)
+ CHARLIKE_HDR(1)
+ CHARLIKE_HDR(2)
+ CHARLIKE_HDR(3)
+ CHARLIKE_HDR(4)
+ CHARLIKE_HDR(5)
+ CHARLIKE_HDR(6)
+ CHARLIKE_HDR(7)
+ CHARLIKE_HDR(8)
+ CHARLIKE_HDR(9)
+ CHARLIKE_HDR(10)
+ CHARLIKE_HDR(11)
+ CHARLIKE_HDR(12)
+ CHARLIKE_HDR(13)
+ CHARLIKE_HDR(14)
+ CHARLIKE_HDR(15)
+ CHARLIKE_HDR(16)
+ CHARLIKE_HDR(17)
+ CHARLIKE_HDR(18)
+ CHARLIKE_HDR(19)
+ CHARLIKE_HDR(20)
+ CHARLIKE_HDR(21)
+ CHARLIKE_HDR(22)
+ CHARLIKE_HDR(23)
+ CHARLIKE_HDR(24)
+ CHARLIKE_HDR(25)
+ CHARLIKE_HDR(26)
+ CHARLIKE_HDR(27)
+ CHARLIKE_HDR(28)
+ CHARLIKE_HDR(29)
+ CHARLIKE_HDR(30)
+ CHARLIKE_HDR(31)
+ CHARLIKE_HDR(32)
+ CHARLIKE_HDR(33)
+ CHARLIKE_HDR(34)
+ CHARLIKE_HDR(35)
+ CHARLIKE_HDR(36)
+ CHARLIKE_HDR(37)
+ CHARLIKE_HDR(38)
+ CHARLIKE_HDR(39)
+ CHARLIKE_HDR(40)
+ CHARLIKE_HDR(41)
+ CHARLIKE_HDR(42)
+ CHARLIKE_HDR(43)
+ CHARLIKE_HDR(44)
+ CHARLIKE_HDR(45)
+ CHARLIKE_HDR(46)
+ CHARLIKE_HDR(47)
+ CHARLIKE_HDR(48)
+ CHARLIKE_HDR(49)
+ CHARLIKE_HDR(50)
+ CHARLIKE_HDR(51)
+ CHARLIKE_HDR(52)
+ CHARLIKE_HDR(53)
+ CHARLIKE_HDR(54)
+ CHARLIKE_HDR(55)
+ CHARLIKE_HDR(56)
+ CHARLIKE_HDR(57)
+ CHARLIKE_HDR(58)
+ CHARLIKE_HDR(59)
+ CHARLIKE_HDR(60)
+ CHARLIKE_HDR(61)
+ CHARLIKE_HDR(62)
+ CHARLIKE_HDR(63)
+ CHARLIKE_HDR(64)
+ CHARLIKE_HDR(65)
+ CHARLIKE_HDR(66)
+ CHARLIKE_HDR(67)
+ CHARLIKE_HDR(68)
+ CHARLIKE_HDR(69)
+ CHARLIKE_HDR(70)
+ CHARLIKE_HDR(71)
+ CHARLIKE_HDR(72)
+ CHARLIKE_HDR(73)
+ CHARLIKE_HDR(74)
+ CHARLIKE_HDR(75)
+ CHARLIKE_HDR(76)
+ CHARLIKE_HDR(77)
+ CHARLIKE_HDR(78)
+ CHARLIKE_HDR(79)
+ CHARLIKE_HDR(80)
+ CHARLIKE_HDR(81)
+ CHARLIKE_HDR(82)
+ CHARLIKE_HDR(83)
+ CHARLIKE_HDR(84)
+ CHARLIKE_HDR(85)
+ CHARLIKE_HDR(86)
+ CHARLIKE_HDR(87)
+ CHARLIKE_HDR(88)
+ CHARLIKE_HDR(89)
+ CHARLIKE_HDR(90)
+ CHARLIKE_HDR(91)
+ CHARLIKE_HDR(92)
+ CHARLIKE_HDR(93)
+ CHARLIKE_HDR(94)
+ CHARLIKE_HDR(95)
+ CHARLIKE_HDR(96)
+ CHARLIKE_HDR(97)
+ CHARLIKE_HDR(98)
+ CHARLIKE_HDR(99)
+ CHARLIKE_HDR(100)
+ CHARLIKE_HDR(101)
+ CHARLIKE_HDR(102)
+ CHARLIKE_HDR(103)
+ CHARLIKE_HDR(104)
+ CHARLIKE_HDR(105)
+ CHARLIKE_HDR(106)
+ CHARLIKE_HDR(107)
+ CHARLIKE_HDR(108)
+ CHARLIKE_HDR(109)
+ CHARLIKE_HDR(110)
+ CHARLIKE_HDR(111)
+ CHARLIKE_HDR(112)
+ CHARLIKE_HDR(113)
+ CHARLIKE_HDR(114)
+ CHARLIKE_HDR(115)
+ CHARLIKE_HDR(116)
+ CHARLIKE_HDR(117)
+ CHARLIKE_HDR(118)
+ CHARLIKE_HDR(119)
+ CHARLIKE_HDR(120)
+ CHARLIKE_HDR(121)
+ CHARLIKE_HDR(122)
+ CHARLIKE_HDR(123)
+ CHARLIKE_HDR(124)
+ CHARLIKE_HDR(125)
+ CHARLIKE_HDR(126)
+ CHARLIKE_HDR(127)
+ CHARLIKE_HDR(128)
+ CHARLIKE_HDR(129)
+ CHARLIKE_HDR(130)
+ CHARLIKE_HDR(131)
+ CHARLIKE_HDR(132)
+ CHARLIKE_HDR(133)
+ CHARLIKE_HDR(134)
+ CHARLIKE_HDR(135)
+ CHARLIKE_HDR(136)
+ CHARLIKE_HDR(137)
+ CHARLIKE_HDR(138)
+ CHARLIKE_HDR(139)
+ CHARLIKE_HDR(140)
+ CHARLIKE_HDR(141)
+ CHARLIKE_HDR(142)
+ CHARLIKE_HDR(143)
+ CHARLIKE_HDR(144)
+ CHARLIKE_HDR(145)
+ CHARLIKE_HDR(146)
+ CHARLIKE_HDR(147)
+ CHARLIKE_HDR(148)
+ CHARLIKE_HDR(149)
+ CHARLIKE_HDR(150)
+ CHARLIKE_HDR(151)
+ CHARLIKE_HDR(152)
+ CHARLIKE_HDR(153)
+ CHARLIKE_HDR(154)
+ CHARLIKE_HDR(155)
+ CHARLIKE_HDR(156)
+ CHARLIKE_HDR(157)
+ CHARLIKE_HDR(158)
+ CHARLIKE_HDR(159)
+ CHARLIKE_HDR(160)
+ CHARLIKE_HDR(161)
+ CHARLIKE_HDR(162)
+ CHARLIKE_HDR(163)
+ CHARLIKE_HDR(164)
+ CHARLIKE_HDR(165)
+ CHARLIKE_HDR(166)
+ CHARLIKE_HDR(167)
+ CHARLIKE_HDR(168)
+ CHARLIKE_HDR(169)
+ CHARLIKE_HDR(170)
+ CHARLIKE_HDR(171)
+ CHARLIKE_HDR(172)
+ CHARLIKE_HDR(173)
+ CHARLIKE_HDR(174)
+ CHARLIKE_HDR(175)
+ CHARLIKE_HDR(176)
+ CHARLIKE_HDR(177)
+ CHARLIKE_HDR(178)
+ CHARLIKE_HDR(179)
+ CHARLIKE_HDR(180)
+ CHARLIKE_HDR(181)
+ CHARLIKE_HDR(182)
+ CHARLIKE_HDR(183)
+ CHARLIKE_HDR(184)
+ CHARLIKE_HDR(185)
+ CHARLIKE_HDR(186)
+ CHARLIKE_HDR(187)
+ CHARLIKE_HDR(188)
+ CHARLIKE_HDR(189)
+ CHARLIKE_HDR(190)
+ CHARLIKE_HDR(191)
+ CHARLIKE_HDR(192)
+ CHARLIKE_HDR(193)
+ CHARLIKE_HDR(194)
+ CHARLIKE_HDR(195)
+ CHARLIKE_HDR(196)
+ CHARLIKE_HDR(197)
+ CHARLIKE_HDR(198)
+ CHARLIKE_HDR(199)
+ CHARLIKE_HDR(200)
+ CHARLIKE_HDR(201)
+ CHARLIKE_HDR(202)
+ CHARLIKE_HDR(203)
+ CHARLIKE_HDR(204)
+ CHARLIKE_HDR(205)
+ CHARLIKE_HDR(206)
+ CHARLIKE_HDR(207)
+ CHARLIKE_HDR(208)
+ CHARLIKE_HDR(209)
+ CHARLIKE_HDR(210)
+ CHARLIKE_HDR(211)
+ CHARLIKE_HDR(212)
+ CHARLIKE_HDR(213)
+ CHARLIKE_HDR(214)
+ CHARLIKE_HDR(215)
+ CHARLIKE_HDR(216)
+ CHARLIKE_HDR(217)
+ CHARLIKE_HDR(218)
+ CHARLIKE_HDR(219)
+ CHARLIKE_HDR(220)
+ CHARLIKE_HDR(221)
+ CHARLIKE_HDR(222)
+ CHARLIKE_HDR(223)
+ CHARLIKE_HDR(224)
+ CHARLIKE_HDR(225)
+ CHARLIKE_HDR(226)
+ CHARLIKE_HDR(227)
+ CHARLIKE_HDR(228)
+ CHARLIKE_HDR(229)
+ CHARLIKE_HDR(230)
+ CHARLIKE_HDR(231)
+ CHARLIKE_HDR(232)
+ CHARLIKE_HDR(233)
+ CHARLIKE_HDR(234)
+ CHARLIKE_HDR(235)
+ CHARLIKE_HDR(236)
+ CHARLIKE_HDR(237)
+ CHARLIKE_HDR(238)
+ CHARLIKE_HDR(239)
+ CHARLIKE_HDR(240)
+ CHARLIKE_HDR(241)
+ CHARLIKE_HDR(242)
+ CHARLIKE_HDR(243)
+ CHARLIKE_HDR(244)
+ CHARLIKE_HDR(245)
+ CHARLIKE_HDR(246)
+ CHARLIKE_HDR(247)
+ CHARLIKE_HDR(248)
+ CHARLIKE_HDR(249)
+ CHARLIKE_HDR(250)
+ CHARLIKE_HDR(251)
+ CHARLIKE_HDR(252)
+ CHARLIKE_HDR(253)
+ CHARLIKE_HDR(254)
+ CHARLIKE_HDR(255)
+}
+
+/* One static I# closure for every small Int in [-16 .. 16]; the GC can
+ * replace a dynamic I# whose payload is in this range with a pointer
+ * into this table. */
+section "data" {
+ stg_INTLIKE_closure:
+ INTLIKE_HDR(-16) /* MIN_INTLIKE == -16 */
+ INTLIKE_HDR(-15)
+ INTLIKE_HDR(-14)
+ INTLIKE_HDR(-13)
+ INTLIKE_HDR(-12)
+ INTLIKE_HDR(-11)
+ INTLIKE_HDR(-10)
+ INTLIKE_HDR(-9)
+ INTLIKE_HDR(-8)
+ INTLIKE_HDR(-7)
+ INTLIKE_HDR(-6)
+ INTLIKE_HDR(-5)
+ INTLIKE_HDR(-4)
+ INTLIKE_HDR(-3)
+ INTLIKE_HDR(-2)
+ INTLIKE_HDR(-1)
+ INTLIKE_HDR(0)
+ INTLIKE_HDR(1)
+ INTLIKE_HDR(2)
+ INTLIKE_HDR(3)
+ INTLIKE_HDR(4)
+ INTLIKE_HDR(5)
+ INTLIKE_HDR(6)
+ INTLIKE_HDR(7)
+ INTLIKE_HDR(8)
+ INTLIKE_HDR(9)
+ INTLIKE_HDR(10)
+ INTLIKE_HDR(11)
+ INTLIKE_HDR(12)
+ INTLIKE_HDR(13)
+ INTLIKE_HDR(14)
+ INTLIKE_HDR(15)
+ INTLIKE_HDR(16) /* MAX_INTLIKE == 16 */
+}
diff --git a/rts/StgPrimFloat.c b/rts/StgPrimFloat.c
new file mode 100644
index 0000000000..5bd6aebb1c
--- /dev/null
+++ b/rts/StgPrimFloat.c
@@ -0,0 +1,491 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2000
+ *
+ * Miscellaneous support for floating-point primitives
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+
+#include <math.h>
+
+/*
+ * Encoding and decoding Doubles. Code based on the HBC code
+ * (lib/fltcode.c).
+ */
+
+#ifdef _SHORT_LIMB
+#define SIZEOF_LIMB_T SIZEOF_UNSIGNED_INT
+#else
+#ifdef _LONG_LONG_LIMB
+#define SIZEOF_LIMB_T SIZEOF_UNSIGNED_LONG_LONG
+#else
+#define SIZEOF_LIMB_T SIZEOF_UNSIGNED_LONG
+#endif
+#endif
+
+#if SIZEOF_LIMB_T == 4
+#define GMP_BASE 4294967296.0
+#elif SIZEOF_LIMB_T == 8
+#define GMP_BASE 18446744073709551616.0
+#else
+#error Cannot cope with SIZEOF_LIMB_T -- please add definition of GMP_BASE
+#endif
+
+#define DNBIGIT ((SIZEOF_DOUBLE+SIZEOF_LIMB_T-1)/SIZEOF_LIMB_T)
+#define FNBIGIT ((SIZEOF_FLOAT +SIZEOF_LIMB_T-1)/SIZEOF_LIMB_T)
+
+#if IEEE_FLOATING_POINT
+#define MY_DMINEXP ((DBL_MIN_EXP) - (DBL_MANT_DIG) - 1)
+/* DMINEXP is defined in values.h on Linux (for example) */
+#define DHIGHBIT 0x00100000
+#define DMSBIT 0x80000000
+
+#define MY_FMINEXP ((FLT_MIN_EXP) - (FLT_MANT_DIG) - 1)
+#define FHIGHBIT 0x00800000
+#define FMSBIT 0x80000000
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define L 1
+#define H 0
+#else
+#define L 0
+#define H 1
+#endif
+
+#define __abs(a) (( (a) >= 0 ) ? (a) : (-(a)))
+
+/* Reassemble a Double from GMP limbs: |size| limbs in 'ba' form the
+ * mantissa (a negative size encodes a negative result), which is then
+ * scaled by 2^e. */
+StgDouble
+__encodeDouble (I_ size, StgByteArray ba, I_ e) /* result = s * 2^e */
+{
+    StgDouble r;
+    const mp_limb_t *const arr = (const mp_limb_t *)ba;
+    I_ i;
+
+    /* Convert MP_INT to a double; knows a lot about internal rep! */
+    for(r = 0.0, i = __abs(size)-1; i >= 0; i--)
+	r = (r * GMP_BASE) + arr[i];    /* most significant limb first */
+
+    /* Now raise to the exponent */
+    if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */
+	r = ldexp(r, e);
+
+    /* sign is encoded in the size */
+    if (size < 0)
+	r = -r;
+
+    return r;
+}
+
+/* Special version for small Integers: the mantissa fits in a single
+   machine word 'j', so no limb array is needed.  Result = j * 2^e. */
+StgDouble
+__int_encodeDouble (I_ j, I_ e)
+{
+    StgDouble r;
+
+    r = (StgDouble)__abs(j);
+
+    /* Now raise to the exponent */
+    if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */
+	r = ldexp(r, e);
+
+    /* sign is encoded in the size */
+    if (j < 0)
+	r = -r;
+
+    return r;
+}
+
+/* Float analogue of __encodeDouble: rebuild a Float from |size| GMP
+ * limbs in 'ba' (negative size => negative result), scaled by 2^e. */
+StgFloat
+__encodeFloat (I_ size, StgByteArray ba, I_ e) /* result = s * 2^e */
+{
+    StgFloat r;
+    const mp_limb_t *arr = (const mp_limb_t *)ba;
+    I_ i;
+
+    /* Convert MP_INT to a float; knows a lot about internal rep! */
+    for(r = 0.0, i = __abs(size)-1; i >= 0; i--)
+	r = (r * GMP_BASE) + arr[i];    /* most significant limb first */
+
+    /* Now raise to the exponent */
+    if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */
+	r = ldexp(r, e);
+
+    /* sign is encoded in the size */
+    if (size < 0)
+	r = -r;
+
+    return r;
+}
+
+/* Special version for small Integers: the mantissa fits in a single
+   machine word 'j'.  Result = j * 2^e. */
+StgFloat
+__int_encodeFloat (I_ j, I_ e)
+{
+    StgFloat r;
+
+    r = (StgFloat)__abs(j);
+
+    /* Now raise to the exponent */
+    if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */
+	r = ldexp(r, e);
+
+    /* sign is encoded in the size */
+    if (j < 0)
+	r = -r;
+
+    return r;
+}
+
+/* This only supports IEEE floating point */
+
+/* Split an IEEE double into GMP mantissa limbs and a binary exponent,
+ * such that dbl == mantissa * 2^exp.  The implicit (hidden) bit is made
+ * explicit, denormals are normalised first, and the sign is encoded as
+ * a negative _mp_size.  Limbs are written straight into man->_mp_d,
+ * which the caller must have sized appropriately (see comment below
+ * about the incoming MP_INT). */
+void
+__decodeDouble (MP_INT *man, I_ *exp, StgDouble dbl)
+{
+    /* Do some bit fiddling on IEEE */
+    unsigned int low, high; /* assuming 32 bit ints */
+    int sign, iexp;
+    union { double d; unsigned int i[2]; } u; /* assuming 32 bit ints, 64 bit double */
+
+    ASSERT(sizeof(unsigned int ) == 4 );
+    ASSERT(sizeof(dbl ) == SIZEOF_DOUBLE);
+    ASSERT(sizeof(man->_mp_d[0]) == SIZEOF_LIMB_T);
+    ASSERT(DNBIGIT*SIZEOF_LIMB_T >= SIZEOF_DOUBLE);
+
+    u.d = dbl; /* grab chunks of the double */
+    low = u.i[L];
+    high = u.i[H];
+
+    /* we know the MP_INT* passed in has size zero, so we realloc
+	no matter what.
+    */
+    man->_mp_alloc = DNBIGIT;
+
+    if (low == 0 && (high & ~DMSBIT) == 0) {    /* +/- zero */
+	man->_mp_size = 0;
+	*exp = 0L;
+    } else {
+	man->_mp_size = DNBIGIT;
+	iexp = ((high >> 20) & 0x7ff) + MY_DMINEXP;   /* unbias the exponent */
+	sign = high;    /* sign bit is the top bit of 'high' */
+
+	high &= DHIGHBIT-1;
+	if (iexp != MY_DMINEXP)	/* don't add hidden bit to denorms */
+	    high |= DHIGHBIT;
+	else {
+	    iexp++;
+	    /* A denorm, normalize the mantissa */
+	    while (! (high & DHIGHBIT)) {
+		high <<= 1;
+		if (low & DMSBIT)   /* shift the top bit of 'low' into 'high' */
+		    high++;
+		low <<= 1;
+		iexp--;
+	    }
+	}
+	*exp = (I_) iexp;
+#if DNBIGIT == 2
+	man->_mp_d[0] = (mp_limb_t)low;
+	man->_mp_d[1] = (mp_limb_t)high;
+#else
+#if DNBIGIT == 1
+	man->_mp_d[0] = ((mp_limb_t)high) << 32 | (mp_limb_t)low;
+#else
+#error Cannot cope with DNBIGIT
+#endif
+#endif
+	if (sign < 0)
+	    man->_mp_size = -man->_mp_size;
+    }
+}
+
+/* Float analogue of __decodeDouble: split an IEEE single into a GMP
+ * mantissa and a binary exponent (flt == mantissa * 2^exp).  The whole
+ * float fits in one int, so there is no 'low' word here. */
+void
+__decodeFloat (MP_INT *man, I_ *exp, StgFloat flt)
+{
+    /* Do some bit fiddling on IEEE */
+    int high, sign; /* assuming 32 bit ints */
+    union { float f; int i; } u; /* assuming 32 bit float and int */
+
+    ASSERT(sizeof(int ) == 4 );
+    ASSERT(sizeof(flt ) == SIZEOF_FLOAT );
+    ASSERT(sizeof(man->_mp_d[0]) == SIZEOF_LIMB_T);
+    ASSERT(FNBIGIT*SIZEOF_LIMB_T >= SIZEOF_FLOAT );
+
+    u.f = flt; /* grab the float */
+    high = u.i;
+
+    /* we know the MP_INT* passed in has size zero, so we realloc
+	no matter what.
+    */
+    man->_mp_alloc = FNBIGIT;
+
+    if ((high & ~FMSBIT) == 0) {    /* +/- zero */
+	man->_mp_size = 0;
+	*exp = 0;
+    } else {
+	man->_mp_size = FNBIGIT;
+	*exp = ((high >> 23) & 0xff) + MY_FMINEXP;   /* unbias the exponent */
+	sign = high;    /* sign bit is the top bit of 'high' */
+
+	high &= FHIGHBIT-1;
+	if (*exp != MY_FMINEXP)	/* don't add hidden bit to denorms */
+	    high |= FHIGHBIT;
+	else {
+	    (*exp)++;
+	    /* A denorm, normalize the mantissa */
+	    while (! (high & FHIGHBIT)) {
+		high <<= 1;
+		(*exp)--;
+	    }
+	}
+#if FNBIGIT == 1
+	man->_mp_d[0] = (mp_limb_t)high;
+#else
+#error Cannot cope with FNBIGIT
+#endif
+	if (sign < 0)
+	    man->_mp_size = -man->_mp_size;
+    }
+}
+
+/* Convenient union types for checking the layout of IEEE 754 types -
+ based on defs in GNU libc <ieee754.h>
+*/
+
+/* Bit-field overlay of an IEEE single: 'ieee' gives the plain
+   sign/exponent/mantissa split; 'ieee_nan' exposes the top mantissa bit
+   separately so the quiet/signalling NaN distinction is visible. */
+union stg_ieee754_flt
+{
+    float f;
+    struct {
+
+#if WORDS_BIGENDIAN
+	unsigned int negative:1;
+	unsigned int exponent:8;
+	unsigned int mantissa:23;
+#else
+	unsigned int mantissa:23;
+	unsigned int exponent:8;
+	unsigned int negative:1;
+#endif
+    } ieee;
+    struct {
+
+#if WORDS_BIGENDIAN
+	unsigned int negative:1;
+	unsigned int exponent:8;
+	unsigned int quiet_nan:1;
+	unsigned int mantissa:22;
+#else
+	unsigned int mantissa:22;
+	unsigned int quiet_nan:1;
+	unsigned int exponent:8;
+	unsigned int negative:1;
+#endif
+    } ieee_nan;
+};
+
+/*
+
+ To recap, here's the representation of a double precision
+ IEEE floating point number:
+
+ sign 63 sign bit (0==positive, 1==negative)
+ exponent 62-52 exponent (biased by 1023)
+ fraction 51-0 fraction (bits to right of binary point)
+*/
+
+/* Bit-field overlay of an IEEE double; mantissa is split across two
+   words (mantissa0 = high 20 bits, mantissa1 = low 32 bits). */
+union stg_ieee754_dbl
+{
+    double d;
+    struct {
+
+#if WORDS_BIGENDIAN
+	unsigned int negative:1;
+	unsigned int exponent:11;
+	unsigned int mantissa0:20;
+	unsigned int mantissa1:32;
+#else
+	unsigned int mantissa1:32;
+	unsigned int mantissa0:20;
+	unsigned int exponent:11;
+	unsigned int negative:1;
+#endif
+    } ieee;
+    /* This format makes it easier to see if a NaN is a signalling NaN. */
+    struct {
+
+#if WORDS_BIGENDIAN
+	unsigned int negative:1;
+	unsigned int exponent:11;
+	unsigned int quiet_nan:1;
+	unsigned int mantissa0:19;
+	unsigned int mantissa1:32;
+#else
+	unsigned int mantissa1:32;
+	unsigned int mantissa0:19;
+	unsigned int quiet_nan:1;
+	unsigned int exponent:11;
+	unsigned int negative:1;
+#endif
+    } ieee_nan;
+};
+
+/*
+ * Predicates for testing for extreme IEEE fp values. Used
+ * by the bytecode evaluator and the Prelude.
+ *
+ */
+
+/* In case you don't support IEEE, you'll just get dummy defs. */
+#ifdef IEEE_FLOATING_POINT
+
+/* True iff 'd' is a NaN: exponent all ones, mantissa non-zero. */
+StgInt
+isDoubleNaN(StgDouble d)
+{
+  union stg_ieee754_dbl u;
+
+  u.d = d;
+
+  return (
+    u.ieee.exponent  == 2047 /* 2^11 - 1 */ &&  /* Is the exponent all ones? */
+    (u.ieee.mantissa0 != 0 || u.ieee.mantissa1 != 0)
+    	/* and the mantissa non-zero? */
+    );
+}
+
+/* True iff 'd' is +Inf or -Inf. */
+StgInt
+isDoubleInfinite(StgDouble d)
+{
+    union stg_ieee754_dbl u;
+
+    u.d = d;
+
+    /* Inf iff exponent is all ones, mantissa all zeros */
+    return (
+        u.ieee.exponent  == 2047 /* 2^11 - 1 */ &&
+	u.ieee.mantissa0 == 0 		        &&
+	u.ieee.mantissa1 == 0
+      );
+}
+
+/* True iff 'd' is a denormal (subnormal) number. */
+StgInt
+isDoubleDenormalized(StgDouble d)
+{
+    union stg_ieee754_dbl u;
+
+    u.d = d;
+
+    /* A (single/double/quad) precision floating point number
+       is denormalised iff:
+        - exponent is zero
+	- mantissa is non-zero.
+        - (don't care about setting of sign bit.)
+
+    */
+    return (
+  	u.ieee.exponent == 0 &&
+	(u.ieee.mantissa0 != 0 ||
+	 u.ieee.mantissa1 != 0)
+      );
+
+}
+
+/* True iff 'd' is exactly -0.0 (distinguishable from +0.0 only by the
+   sign bit). */
+StgInt
+isDoubleNegativeZero(StgDouble d)
+{
+    union stg_ieee754_dbl u;
+
+    u.d = d;
+    /* sign (bit 63) set (only) => negative zero */
+
+    return (
+	u.ieee.negative  == 1 &&
+	u.ieee.exponent  == 0 &&
+	u.ieee.mantissa0 == 0 &&
+	u.ieee.mantissa1 == 0);
+}
+
+/* Same tests, this time for StgFloats. */
+
+/*
+ To recap, here's the representation of a single precision
+ IEEE floating point number:
+
+ sign 31 sign bit (0 == positive, 1 == negative)
+ exponent 30-23 exponent (biased by 127)
+ fraction 22-0 fraction (bits to right of binary point)
+*/
+
+
+/* True iff 'f' is a NaN: exponent all ones, mantissa non-zero. */
+StgInt
+isFloatNaN(StgFloat f)
+{
+    union stg_ieee754_flt u;
+    u.f = f;
+
+   /* Floating point NaN iff exponent is all ones, mantissa is
+      non-zero (but see below.) */
+   return (
+   	u.ieee.exponent == 255 /* 2^8 - 1 */ &&
+	u.ieee.mantissa != 0);
+}
+
+/* True iff 'f' is +Inf or -Inf. */
+StgInt
+isFloatInfinite(StgFloat f)
+{
+    union stg_ieee754_flt u;
+    u.f = f;
+
+    /* A float is Inf iff exponent is max (all ones),
+       and mantissa is min(all zeros.) */
+    return (
+    	u.ieee.exponent == 255 /* 2^8 - 1 */ &&
+	u.ieee.mantissa == 0);
+}
+
+/* True iff 'f' is a denormal (subnormal) number. */
+StgInt
+isFloatDenormalized(StgFloat f)
+{
+    union stg_ieee754_flt u;
+    u.f = f;
+
+    /* A (single/double/quad) precision floating point number
+       is denormalised iff:
+        - exponent is zero
+	- mantissa is non-zero.
+        - (don't care about setting of sign bit.)
+
+    */
+    return (
+    	u.ieee.exponent == 0 &&
+	u.ieee.mantissa != 0);
+}
+
+/* True iff 'f' is exactly -0.0f. */
+StgInt
+isFloatNegativeZero(StgFloat f)
+{
+    union stg_ieee754_flt u;
+    u.f = f;
+
+    /* sign (bit 31) set (only) => negative zero */
+    return (
+    	u.ieee.negative &&
+	u.ieee.exponent == 0 &&
+	u.ieee.mantissa == 0);
+}
+
+#else /* ! IEEE_FLOATING_POINT */
+
+/* Dummy definitions of predicates - they all return false */
+/* (Deliberately K&R-style definitions; only compiled on non-IEEE
+   platforms.) */
+StgInt isDoubleNaN(d) StgDouble d; { return 0; }
+StgInt isDoubleInfinite(d) StgDouble d; { return 0; }
+StgInt isDoubleDenormalized(d) StgDouble d; { return 0; }
+StgInt isDoubleNegativeZero(d) StgDouble d; { return 0; }
+StgInt isFloatNaN(f) StgFloat f; { return 0; }
+StgInt isFloatInfinite(f) StgFloat f; { return 0; }
+StgInt isFloatDenormalized(f) StgFloat f; { return 0; }
+StgInt isFloatNegativeZero(f) StgFloat f; { return 0; }
+
+#endif /* ! IEEE_FLOATING_POINT */
diff --git a/rts/StgRun.h b/rts/StgRun.h
new file mode 100644
index 0000000000..da376b4971
--- /dev/null
+++ b/rts/StgRun.h
@@ -0,0 +1,16 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Tiny assembler 'layer' between the C and STG worlds.
+ *
+ ---------------------------------------------------------------------------- */
+
+#ifndef STGRUN_H
+#define STGRUN_H
+
+/* StgRun: enter the STG world, running code starting at 'f' with
+   register table 'basereg'; returns the register table when control
+   comes back to the scheduler. */
+extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+
+/* StgReturn: the label Cmm code jumps to in order to leave the STG
+   world (see e.g. StgStartup.cmm). */
+RTS_FUN(StgReturn);
+
+#endif /* STGRUN_H */
diff --git a/rts/StgStartup.cmm b/rts/StgStartup.cmm
new file mode 100644
index 0000000000..2f2a759c81
--- /dev/null
+++ b/rts/StgStartup.cmm
@@ -0,0 +1,218 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Code for starting, stopping and restarting threads.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/*
+ * This module contains the two entry points and the final exit point
+ * to/from the Haskell world. We can enter either by:
+ *
+ * a) returning to the address on the top of the stack, or
+ * b) entering the closure on the top of the stack
+ *
+ * the function stg_stop_thread_entry is the final exit for a
+ * thread: it is the last return address on the stack. It returns
+ * to the scheduler marking the thread as finished.
+ */
+
+/* Sanity-check the STG registers on entry to the STG world: the main
+   registers must be non-zero, Sp must leave room for the reserved
+   stack words, and Hp must not have passed HpLim. */
+#define CHECK_SENSIBLE_REGS() \
+    ASSERT(Hp != 0);   \
+    ASSERT(Sp != 0);   \
+    ASSERT(SpLim != 0);   \
+    ASSERT(HpLim != 0);   \
+    ASSERT(SpLim - WDS(RESERVED_STACK_WORDS) <= Sp); \
+    ASSERT(HpLim >= Hp);
+
+/* -----------------------------------------------------------------------------
+ Returning from the STG world.
+
+ This is a polymorphic return address, meaning that any old constructor
+ can be returned, we don't care (actually, it's probably going to be
+ an IOok constructor, which will indirect through the vector table
+ slot 0).
+ -------------------------------------------------------------------------- */
+
+#if defined(PROFILING)
+#define STOP_THREAD_BITMAP 3
+#define STOP_THREAD_WORDS 2
+#else
+#define STOP_THREAD_BITMAP 0
+#define STOP_THREAD_WORDS 0
+#endif
+
+/* A polymorphic return address, where all the vector slots point to the
+ direct entry point. */
+INFO_TABLE_RET( stg_stop_thread, STOP_THREAD_WORDS, STOP_THREAD_BITMAP,
+		STOP_FRAME,
+		RET_LBL(stg_stop_thread),
+		RET_LBL(stg_stop_thread),
+		RET_LBL(stg_stop_thread),
+		RET_LBL(stg_stop_thread),
+		RET_LBL(stg_stop_thread),
+		RET_LBL(stg_stop_thread),
+		RET_LBL(stg_stop_thread),
+		RET_LBL(stg_stop_thread) )
+{
+    /*
+       The final exit.
+
+       The top-top-level closures (e.g., "main") are of type "IO a".
+       When entered, they perform an IO action and return an 'a' in R1.
+
+       We save R1 on top of the stack where the scheduler can find it,
+       tidy up the registers and return to the scheduler.
+
+       We leave the stack looking like this:
+
+       		+----------------+
+                | -------------------> return value
+       		+----------------+
+       		| stg_enter_info |
+       		+----------------+
+
+       The stg_enter_info is just a dummy info table so that the
+       garbage collector can understand the stack (there must always
+       be an info table on top of the stack).
+    */
+
+    /* overwrite the stop frame with the 2-word return layout above */
+    Sp = Sp + SIZEOF_StgStopFrame - WDS(2);
+    Sp(1) = R1;
+    Sp(0) = stg_enter_info;
+
+    StgTSO_what_next(CurrentTSO) = ThreadComplete::I16;
+
+    SAVE_THREAD_STATE();
+
+    /* The return code goes in BaseReg->rRet, and BaseReg is returned in R1 */
+    StgRegTable_rRet(BaseReg) = ThreadFinished;
+    R1 = BaseReg;
+
+    jump StgReturn;
+}
+
+/* -----------------------------------------------------------------------------
+ Start a thread from the scheduler by returning to the address on
+ the top of the stack. This is used for all entries to STG code
+ from C land.
+
+ On the way back, we (usually) pass through stg_returnToSched which saves
+ the thread's state away nicely.
+ -------------------------------------------------------------------------- */
+
+/* Enter the STG world: restore the thread's registers and return to
+   the frame on top of its stack. */
+stg_returnToStackTop
+{
+    LOAD_THREAD_STATE();
+    CHECK_SENSIBLE_REGS();
+    jump %ENTRY_CODE(Sp(0));
+}
+
+/* Leave the STG world and return to the scheduler, first saving the
+   thread's state and notifying threadPaused(). */
+stg_returnToSched
+{
+    SAVE_THREAD_STATE();
+    foreign "C" threadPaused(MyCapability() "ptr", CurrentTSO);
+    jump StgReturn;
+}
+
+// A variant of stg_returnToSched that doesn't call threadPaused() on the
+// current thread.  This is used for switching from compiled execution to the
+// interpreter, where calling threadPaused() on every switch would be too
+// expensive.
+stg_returnToSchedNotPaused
+{
+    SAVE_THREAD_STATE();
+    jump StgReturn;
+}
+
+// A variant of stg_returnToSched, but instead of returning directly to the
+// scheduler, we jump to the code fragment pointed to by R2.  This lets us
+// perform some final actions after making the thread safe, such as unlocking
+// the MVar on which we are about to block in SMP mode.
+stg_returnToSchedButFirst
+{
+    SAVE_THREAD_STATE();
+    foreign "C" threadPaused(MyCapability() "ptr", CurrentTSO);
+    jump R2;    // final action supplied by the caller
+}
+
+/* -----------------------------------------------------------------------------
+ Strict IO application - performing an IO action and entering its result.
+
+ rts_evalIO() lets you perform Haskell IO actions from outside of
+ Haskell-land, returning back to you their result. Want this result
+ to be evaluated to WHNF by that time, so that we can easily get at
+ the int/char/whatever using the various get{Ty} functions provided
+ by the RTS API.
+
+ forceIO takes care of this, performing the IO action and entering the
+   result that comes back.
+ ------------------------------------------------------------------------- */
+
+/* The return convention for an IO action differs according to whether
+   R1 is a real machine register: if it isn't, the returned value is on
+   the stack and must be popped before entering it. */
+INFO_TABLE_RET( stg_forceIO, 0/*size*/, 0/*bitmap*/, RET_SMALL)
+
+#ifdef REG_R1
+{
+    Sp_adj(1);
+    ENTER();    /* evaluate the result to WHNF */
+}
+#else
+{
+    R1 = Sp(0);
+    Sp_adj(2);
+    ENTER();    /* evaluate the result to WHNF */
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ Non-strict IO application.
+
+ This stack frame works like stg_forceIO_info except that it
+ doesn't evaluate the return value. We need the layer because the
+ return convention for an IO action differs depending on whether R1
+ is a register or not.
+ ------------------------------------------------------------------------- */
+
+/* Like stg_forceIO, but returns the value without evaluating it; the
+   frame exists only to normalise the REG_R1 / no-REG_R1 return
+   conventions. */
+INFO_TABLE_RET( stg_noforceIO, 0/*size*/, 0/*bitmap*/, RET_SMALL )
+
+#ifdef REG_R1
+{
+    Sp_adj(1);
+    jump %ENTRY_CODE(Sp(0));
+}
+#else
+{
+    R1 = Sp(0);
+    Sp_adj(2);
+    jump %ENTRY_CODE(Sp(0));
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ Special STG entry points for module registration.
+ -------------------------------------------------------------------------- */
+
+/* End of the module-initialisation chain: return to the RTS. */
+stg_init_finish
+{
+  jump StgReturn;
+}
+
+/* On entry to stg_init:
+ *    init_stack[0] = &stg_init_ret;
+ *    init_stack[1] = __stginit_Something;
+ */
+stg_init
+{
+  W_ next;
+  Sp = W_[BaseReg + OFFSET_StgRegTable_rSp];
+  next = W_[Sp];    /* pop the next init-chain entry point */
+  Sp_adj(1);
+  jump next;
+}
diff --git a/rts/StgStdThunks.cmm b/rts/StgStdThunks.cmm
new file mode 100644
index 0000000000..342a6eb164
--- /dev/null
+++ b/rts/StgStdThunks.cmm
@@ -0,0 +1,274 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The University of Glasgow, 1998-2004
+ *
+ * Canned "Standard Form" Thunks
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/* -----------------------------------------------------------------------------
+ The code for a thunk that simply extracts a field from a
+ single-constructor datatype depends only on the offset of the field
+ to be selected.
+
+ Here we define some canned "selector" thunks that do just that; any
+ selector thunk appearing in a program will refer to one of these
+ instead of being compiled independently.
+
+ The garbage collector spots selector thunks and reduces them if
+ possible, in order to avoid space leaks resulting from lazy pattern
+ matching.
+ -------------------------------------------------------------------------- */
+
+#define WITHUPD_FRAME_SIZE (SIZEOF_StgUpdateFrame + SIZEOF_StgHeader)
+#define NOUPD_FRAME_SIZE (SIZEOF_StgHeader)
+
+#ifdef PROFILING
+#define SAVE_CCCS(fs) StgHeader_ccs(Sp-fs) = W_[CCCS]
+#define GET_SAVED_CCCS W_[CCCS] = StgHeader_ccs(Sp)
+#define RET_BITMAP 3
+#define RET_FRAMESIZE 2
+#else
+#define SAVE_CCCS(fs) /* empty */
+#define GET_SAVED_CCCS /* empty */
+#define RET_BITMAP 0
+#define RET_FRAMESIZE 0
+#endif
+
+/* SELECTOR_CODE_UPD(offset): generates the return frame and the entry
+   code for the standard *updatable* selector thunk that evaluates its
+   single free variable and then extracts payload field <offset> from
+   the resulting constructor.  An update frame is pushed so the thunk
+   is overwritten with the selected value. */
+#define SELECTOR_CODE_UPD(offset) \
+  INFO_TABLE_RET(stg_sel_ret_##offset##_upd, RET_FRAMESIZE, RET_BITMAP, RET_SMALL)	\
+  {											\
+      R1 = StgClosure_payload(R1,offset);						\
+      GET_SAVED_CCCS;									\
+      Sp = Sp + SIZEOF_StgHeader;							\
+      ENTER();										\
+  }											\
+											\
+  INFO_TABLE_SELECTOR(stg_sel_##offset##_upd, offset, THUNK_SELECTOR, "stg_sel_upd", "stg_sel_upd")	\
+  {											\
+      TICK_ENT_DYN_THK();								\
+      STK_CHK_NP(WITHUPD_FRAME_SIZE);							\
+      UPD_BH_UPDATABLE();								\
+      LDV_ENTER(R1);									\
+      PUSH_UPD_FRAME(Sp - SIZEOF_StgUpdateFrame, R1);					\
+      ENTER_CCS_THUNK(R1);								\
+      SAVE_CCCS(WITHUPD_FRAME_SIZE);							\
+      W_[Sp-WITHUPD_FRAME_SIZE] = stg_sel_ret_##offset##_upd_info;			\
+      R1 = StgThunk_payload(R1,0);							\
+      Sp = Sp - WITHUPD_FRAME_SIZE;							\
+      jump %GET_ENTRY(R1);								\
+  }
+  /* NOTE: no need to ENTER() here, we know the closure cannot evaluate to a function,
+     because we're going to do a field selection on the result. */
+
+SELECTOR_CODE_UPD(0)
+SELECTOR_CODE_UPD(1)
+SELECTOR_CODE_UPD(2)
+SELECTOR_CODE_UPD(3)
+SELECTOR_CODE_UPD(4)
+SELECTOR_CODE_UPD(5)
+SELECTOR_CODE_UPD(6)
+SELECTOR_CODE_UPD(7)
+SELECTOR_CODE_UPD(8)
+SELECTOR_CODE_UPD(9)
+SELECTOR_CODE_UPD(10)
+SELECTOR_CODE_UPD(11)
+SELECTOR_CODE_UPD(12)
+SELECTOR_CODE_UPD(13)
+SELECTOR_CODE_UPD(14)
+SELECTOR_CODE_UPD(15)
+
+/* SELECTOR_CODE_NOUPD(offset): the single-entry (non-updatable)
+   variant of SELECTOR_CODE_UPD: no update frame is pushed, so only a
+   bare StgHeader-sized return frame is needed. */
+#define SELECTOR_CODE_NOUPD(offset) \
+  INFO_TABLE_RET(stg_sel_ret_##offset##_noupd, RET_FRAMESIZE, RET_BITMAP, RET_SMALL)	\
+  {											\
+      R1 = StgClosure_payload(R1,offset);						\
+      GET_SAVED_CCCS;									\
+      Sp = Sp + SIZEOF_StgHeader;							\
+      jump %GET_ENTRY(R1);								\
+  }											\
+											\
+  INFO_TABLE_SELECTOR(stg_sel_##offset##_noupd, offset, THUNK_SELECTOR, "stg_sel_noupd", "stg_sel_noupd")\
+  {											\
+      TICK_ENT_DYN_THK();								\
+      STK_CHK_NP(NOUPD_FRAME_SIZE);							\
+      UPD_BH_SINGLE_ENTRY();								\
+      LDV_ENTER(R1);									\
+      TICK_UPDF_OMITTED();								\
+      ENTER_CCS_THUNK(R1);								\
+      SAVE_CCCS(NOUPD_FRAME_SIZE);							\
+      W_[Sp-NOUPD_FRAME_SIZE] = stg_sel_ret_##offset##_noupd_info;			\
+      R1 = StgThunk_payload(R1,0);							\
+      Sp = Sp - NOUPD_FRAME_SIZE;							\
+      jump %GET_ENTRY(R1);								\
+  }
+
+SELECTOR_CODE_NOUPD(0)
+SELECTOR_CODE_NOUPD(1)
+SELECTOR_CODE_NOUPD(2)
+SELECTOR_CODE_NOUPD(3)
+SELECTOR_CODE_NOUPD(4)
+SELECTOR_CODE_NOUPD(5)
+SELECTOR_CODE_NOUPD(6)
+SELECTOR_CODE_NOUPD(7)
+SELECTOR_CODE_NOUPD(8)
+SELECTOR_CODE_NOUPD(9)
+SELECTOR_CODE_NOUPD(10)
+SELECTOR_CODE_NOUPD(11)
+SELECTOR_CODE_NOUPD(12)
+SELECTOR_CODE_NOUPD(13)
+SELECTOR_CODE_NOUPD(14)
+SELECTOR_CODE_NOUPD(15)
+
+/* -----------------------------------------------------------------------------
+ Apply thunks
+
+ An apply thunk is a thunk of the form
+
+ let z = [x1...xn] \u x1...xn
+ in ...
+
+ We pre-compile some of these because the code is always the same.
+
+ These have to be independent of the update frame size, so the code
+ works when profiling etc.
+ -------------------------------------------------------------------------- */
+
+/* stg_ap_1_upd_info is a bit redundant, but there appears to be a bug
+ * in the compiler that means stg_ap_1 is generated occasionally (ToDo)
+ */
+
+/* z = [f] \u -> f : push an update frame and apply f to zero args. */
+INFO_TABLE(stg_ap_1_upd,1,1,THUNK_1_0,"stg_ap_1_upd_info","stg_ap_1_upd_info")
+{
+  TICK_ENT_DYN_THK();
+  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(1));
+  UPD_BH_UPDATABLE();
+  LDV_ENTER(R1);
+  ENTER_CCS_THUNK(R1);
+  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+  R1 = StgThunk_payload(R1,0);
+  Sp = Sp - SIZEOF_StgUpdateFrame;
+  jump stg_ap_0_fast;
+}
+
+/* z = [f,x] \u -> f x : push update frame and x, then apply f to one
+   pointer argument via stg_ap_p. */
+INFO_TABLE(stg_ap_2_upd,2,0,THUNK_2_0,"stg_ap_2_upd_info","stg_ap_2_upd_info")
+{
+  TICK_ENT_DYN_THK();
+  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(2));
+  UPD_BH_UPDATABLE();
+  LDV_ENTER(R1);
+  ENTER_CCS_THUNK(R1);
+  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,1);
+  R1 = StgThunk_payload(R1,0);
+  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(1);
+  Sp_adj(-1); // for stg_ap_*_ret
+  TICK_UNKNOWN_CALL();
+  TICK_SLOW_CALL_p();
+  jump RET_LBL(stg_ap_p);
+}
+
+/* z = [f,x1,x2] \u -> f x1 x2 : two pointer args, via stg_ap_pp. */
+INFO_TABLE(stg_ap_3_upd,3,0,THUNK,"stg_ap_3_upd_info","stg_ap_3_upd_info")
+{
+  TICK_ENT_DYN_THK();
+  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(3));
+  UPD_BH_UPDATABLE();
+  LDV_ENTER(R1);
+  ENTER_CCS_THUNK(R1);
+  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,2);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,1);
+  R1 = StgThunk_payload(R1,0);
+  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(2);
+  Sp_adj(-1); // for stg_ap_*_ret
+  TICK_UNKNOWN_CALL();
+  TICK_SLOW_CALL_pp();
+  jump RET_LBL(stg_ap_pp);
+}
+
+/* z = [f,x1..x3] \u -> f x1 x2 x3 : three pointer args, via stg_ap_ppp. */
+INFO_TABLE(stg_ap_4_upd,4,0,THUNK,"stg_ap_4_upd_info","stg_ap_4_upd_info")
+{
+  TICK_ENT_DYN_THK();
+  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(4));
+  UPD_BH_UPDATABLE();
+  LDV_ENTER(R1);
+  ENTER_CCS_THUNK(R1);
+  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,3);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,2);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,1);
+  R1 = StgThunk_payload(R1,0);
+  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(3);
+  Sp_adj(-1); // for stg_ap_*_ret
+  TICK_UNKNOWN_CALL();
+  TICK_SLOW_CALL_ppp();
+  jump RET_LBL(stg_ap_ppp);
+}
+
+/* z = [f,x1..x4] \u -> f x1 x2 x3 x4 : four pointer args, via stg_ap_pppp. */
+INFO_TABLE(stg_ap_5_upd,5,0,THUNK,"stg_ap_5_upd_info","stg_ap_5_upd_info")
+{
+  TICK_ENT_DYN_THK();
+  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(5));
+  UPD_BH_UPDATABLE();
+  LDV_ENTER(R1);
+  ENTER_CCS_THUNK(R1);
+  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,4);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,3);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,2);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(4)] = StgThunk_payload(R1,1);
+  R1 = StgThunk_payload(R1,0);
+  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(4);
+  Sp_adj(-1); // for stg_ap_*_ret
+  TICK_UNKNOWN_CALL();
+  TICK_SLOW_CALL_pppp();
+  jump RET_LBL(stg_ap_pppp);
+}
+
+/* z = [f,x1..x5] \u -> f x1 .. x5 : five pointer args, via stg_ap_ppppp. */
+INFO_TABLE(stg_ap_6_upd,6,0,THUNK,"stg_ap_6_upd_info","stg_ap_6_upd_info")
+{
+  TICK_ENT_DYN_THK();
+  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(6));
+  UPD_BH_UPDATABLE();
+  LDV_ENTER(R1);
+  ENTER_CCS_THUNK(R1);
+  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,5);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,4);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,3);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(4)] = StgThunk_payload(R1,2);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(5)] = StgThunk_payload(R1,1);
+  R1 = StgThunk_payload(R1,0);
+  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(5);
+  Sp_adj(-1); // for stg_ap_*_ret
+  TICK_UNKNOWN_CALL();
+  TICK_SLOW_CALL_ppppp();
+  jump RET_LBL(stg_ap_ppppp);
+}
+
+/* z = [f,x1..x6] \u -> f x1 .. x6 : six pointer args, via stg_ap_pppppp. */
+INFO_TABLE(stg_ap_7_upd,7,0,THUNK,"stg_ap_7_upd_info","stg_ap_7_upd_info")
+{
+  TICK_ENT_DYN_THK();
+  STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(7));
+  UPD_BH_UPDATABLE();
+  LDV_ENTER(R1);
+  ENTER_CCS_THUNK(R1);
+  PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,6);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,5);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,4);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(4)] = StgThunk_payload(R1,3);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(5)] = StgThunk_payload(R1,2);
+  W_[Sp-SIZEOF_StgUpdateFrame-WDS(6)] = StgThunk_payload(R1,1);
+  R1 = StgThunk_payload(R1,0);
+  Sp = Sp - SIZEOF_StgUpdateFrame - WDS(6);
+  Sp_adj(-1); // for stg_ap_*_ret
+  TICK_UNKNOWN_CALL();
+  TICK_SLOW_CALL_pppppp();
+  jump RET_LBL(stg_ap_pppppp);
+}
diff --git a/rts/Storage.c b/rts/Storage.c
new file mode 100644
index 0000000000..974be45f10
--- /dev/null
+++ b/rts/Storage.c
@@ -0,0 +1,1137 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Storage manager front end
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "Stats.h"
+#include "Hooks.h"
+#include "BlockAlloc.h"
+#include "MBlock.h"
+#include "Weak.h"
+#include "Sanity.h"
+#include "Arena.h"
+#include "OSThreads.h"
+#include "Capability.h"
+#include "Storage.h"
+#include "Schedule.h"
+#include "RetainerProfile.h" // for counting memory blocks (memInventory)
+
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * All these globals require sm_mutex to access in THREADED_RTS mode.
+ */
+StgClosure *caf_list = NULL;
+StgClosure *revertible_caf_list = NULL;
+rtsBool keepCAFs;
+
+bdescr *small_alloc_list; /* allocate()d small objects */
+bdescr *pinned_object_block; /* allocate pinned objects into this block */
+nat alloc_blocks; /* number of allocate()d blocks since GC */
+nat alloc_blocks_lim; /* approximate limit on alloc_blocks */
+
+StgPtr alloc_Hp = NULL; /* next free byte in small_alloc_list */
+StgPtr alloc_HpLim = NULL; /* end of block at small_alloc_list */
+
+generation *generations = NULL; /* all the generations */
+generation *g0 = NULL; /* generation 0, for convenience */
+generation *oldest_gen = NULL; /* oldest generation, for convenience */
+step *g0s0 = NULL; /* generation 0, step 0, for convenience */
+
+ullong total_allocated = 0; /* total memory allocated during run */
+
+nat n_nurseries = 0; /* == RtsFlags.ParFlags.nNodes, convenience */
+step *nurseries = NULL; /* array of nurseries, >1 only if THREADED_RTS */
+
+#ifdef THREADED_RTS
+/*
+ * Storage manager mutex: protects all the above state from
+ * simultaneous access by two STG threads.
+ */
+Mutex sm_mutex;
+/*
+ * This mutex is used by atomicModifyMutVar# only
+ */
+Mutex atomic_modify_mutvar_mutex;
+#endif
+
+
+/*
+ * Forward references
+ */
+static void *stgAllocForGMP (size_t size_in_bytes);
+static void *stgReallocForGMP (void *ptr, size_t old_size, size_t new_size);
+static void stgDeallocForGMP (void *ptr, size_t size);
+
+// Initialise one step structure to an empty state: record its step
+// number (s) and a back-pointer to its generation (g); every block list
+// and allocation pointer starts out NULL/0.
+static void
+initStep (step *stp, int g, int s)
+{
+ stp->no = s;
+ stp->blocks = NULL;
+ stp->n_blocks = 0;
+ stp->old_blocks = NULL;
+ stp->n_old_blocks = 0;
+ stp->gen = &generations[g];
+ stp->gen_no = g;
+ stp->hp = NULL;
+ stp->hpLim = NULL;
+ stp->hp_bd = NULL;
+ stp->scavd_hp = NULL;
+ stp->scavd_hpLim = NULL;
+ stp->scan = NULL;
+ stp->scan_bd = NULL;
+ stp->large_objects = NULL;
+ stp->n_large_blocks = 0;
+ stp->new_large_objects = NULL;
+ stp->scavenged_large_objects = NULL;
+ stp->n_scavenged_large_blocks = 0;
+ stp->is_compacted = 0;
+ stp->bitmap = NULL;
+}
+
+/* Initialise the storage manager: sanity-check the GC flags, start the
+ * block allocator, build the generation/step structures and the
+ * nurseries, and register our custom allocation functions with GMP.
+ * Idempotent: a second call returns immediately (multi-init protection).
+ */
+void
+initStorage( void )
+{
+ nat g, s;
+ generation *gen;
+
+ if (generations != NULL) {
+ // multi-init protection
+ return;
+ }
+
+ /* Sanity check to make sure the LOOKS_LIKE_ macros appear to be
+ * doing something reasonable.
+ */
+ ASSERT(LOOKS_LIKE_INFO_PTR(&stg_BLACKHOLE_info));
+ ASSERT(LOOKS_LIKE_CLOSURE_PTR(&stg_dummy_ret_closure));
+ ASSERT(!HEAP_ALLOCED(&stg_dummy_ret_closure));
+
+ // If -H is larger than -M, clamp the heap-size suggestion to -M.
+ if (RtsFlags.GcFlags.maxHeapSize != 0 &&
+ RtsFlags.GcFlags.heapSizeSuggestion >
+ RtsFlags.GcFlags.maxHeapSize) {
+ RtsFlags.GcFlags.maxHeapSize = RtsFlags.GcFlags.heapSizeSuggestion;
+ }
+
+ if (RtsFlags.GcFlags.maxHeapSize != 0 &&
+ RtsFlags.GcFlags.minAllocAreaSize >
+ RtsFlags.GcFlags.maxHeapSize) {
+ errorBelch("maximum heap size (-M) is smaller than minimum alloc area size (-A)");
+ exit(1);
+ }
+
+ initBlockAllocator();
+
+#if defined(THREADED_RTS)
+ initMutex(&sm_mutex);
+ initMutex(&atomic_modify_mutvar_mutex);
+#endif
+
+ ACQUIRE_SM_LOCK;
+
+ /* allocate generation info array */
+ generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations
+ * sizeof(struct generation_),
+ "initStorage: gens");
+
+ /* Initialise all generations */
+ for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ gen = &generations[g];
+ gen->no = g;
+ gen->mut_list = allocBlock();
+ gen->collections = 0;
+ gen->failed_promotions = 0;
+ gen->max_blocks = 0;
+ }
+
+ /* A couple of convenience pointers */
+ g0 = &generations[0];
+ oldest_gen = &generations[RtsFlags.GcFlags.generations-1];
+
+ /* Allocate step structures in each generation */
+ if (RtsFlags.GcFlags.generations > 1) {
+ /* Only for multiple-generations */
+
+ /* Oldest generation: one step */
+ oldest_gen->n_steps = 1;
+ oldest_gen->steps =
+ stgMallocBytes(1 * sizeof(struct step_), "initStorage: last step");
+
+ /* set up all except the oldest generation with 2 steps */
+ for(g = 0; g < RtsFlags.GcFlags.generations-1; g++) {
+ generations[g].n_steps = RtsFlags.GcFlags.steps;
+ generations[g].steps =
+ stgMallocBytes (RtsFlags.GcFlags.steps * sizeof(struct step_),
+ "initStorage: steps");
+ }
+
+ } else {
+ /* single generation, i.e. a two-space collector */
+ g0->n_steps = 1;
+ g0->steps = stgMallocBytes (sizeof(struct step_), "initStorage: steps");
+ }
+
+#ifdef THREADED_RTS
+ n_nurseries = n_capabilities;
+ nurseries = stgMallocBytes (n_nurseries * sizeof(struct step_),
+ "initStorage: nurseries");
+#else
+ n_nurseries = 1;
+ nurseries = g0->steps; // just share nurseries[0] with g0s0
+#endif
+
+ /* Initialise all steps */
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ initStep(&generations[g].steps[s], g, s);
+ }
+ }
+
+#ifdef THREADED_RTS
+ for (s = 0; s < n_nurseries; s++) {
+ initStep(&nurseries[s], 0, s);
+ }
+#endif
+
+ /* Set up the destination pointers in each younger gen. step */
+ for (g = 0; g < RtsFlags.GcFlags.generations-1; g++) {
+ for (s = 0; s < generations[g].n_steps-1; s++) {
+ generations[g].steps[s].to = &generations[g].steps[s+1];
+ }
+ generations[g].steps[s].to = &generations[g+1].steps[0];
+ }
+ // oldest generation promotes to itself
+ oldest_gen->steps[0].to = &oldest_gen->steps[0];
+
+#ifdef THREADED_RTS
+ for (s = 0; s < n_nurseries; s++) {
+ nurseries[s].to = generations[0].steps[0].to;
+ }
+#endif
+
+ /* The oldest generation has one step. */
+ if (RtsFlags.GcFlags.compact) {
+ if (RtsFlags.GcFlags.generations == 1) {
+ errorBelch("WARNING: compaction is incompatible with -G1; disabled");
+ } else {
+ oldest_gen->steps[0].is_compacted = 1;
+ }
+ }
+
+#ifdef THREADED_RTS
+ if (RtsFlags.GcFlags.generations == 1) {
+ errorBelch("-G1 is incompatible with -threaded");
+ stg_exit(EXIT_FAILURE);
+ }
+#endif
+
+ /* generation 0 is special: that's the nursery */
+ generations[0].max_blocks = 0;
+
+ /* G0S0: the allocation area. Policy: keep the allocation area
+ * small to begin with, even if we have a large suggested heap
+ * size. Reason: we're going to do a major collection first, and we
+ * don't want it to be a big one. This vague idea is borne out by
+ * rigorous experimental evidence.
+ */
+ g0s0 = &generations[0].steps[0];
+
+ allocNurseries();
+
+ weak_ptr_list = NULL;
+ caf_list = NULL;
+ revertible_caf_list = NULL;
+
+ /* initialise the allocate() interface */
+ small_alloc_list = NULL;
+ alloc_blocks = 0;
+ alloc_blocks_lim = RtsFlags.GcFlags.minAllocAreaSize;
+
+ /* Tell GNU multi-precision pkg about our custom alloc functions */
+ mp_set_memory_functions(stgAllocForGMP, stgReallocForGMP, stgDeallocForGMP);
+
+ IF_DEBUG(gc, statDescribeGens());
+
+ RELEASE_SM_LOCK;
+}
+
+// RTS shutdown: report final allocation statistics (flushes the last
+// allocation count through calcAllocated()).
+void
+exitStorage (void)
+{
+ stat_exit(calcAllocated());
+}
+
+// Return all megablocks held by the storage manager to the OS.
+void
+freeStorage (void)
+{
+ freeAllMBlocks();
+}
+
+/* -----------------------------------------------------------------------------
+ CAF management.
+
+ The entry code for every CAF does the following:
+
+ - builds a CAF_BLACKHOLE in the heap
+ - pushes an update frame pointing to the CAF_BLACKHOLE
+ - invokes UPD_CAF(), which:
+ - calls newCaf, below
+ - updates the CAF with a static indirection to the CAF_BLACKHOLE
+
+ Why do we build a BLACKHOLE in the heap rather than just updating
+ the thunk directly? It's so that we only need one kind of update
+ frame - otherwise we'd need a static version of the update frame too.
+
+ newCaf() does the following:
+
+ - it puts the CAF on the oldest generation's mut-once list.
+ This is so that we can treat the CAF as a root when collecting
+ younger generations.
+
+ For GHCI, we have additional requirements when dealing with CAFs:
+
+ - we must *retain* all dynamically-loaded CAFs ever entered,
+ just in case we need them again.
+ - we must be able to *revert* CAFs that have been evaluated, to
+ their pre-evaluated form.
+
+ To do this, we use an additional CAF list. When newCaf() is
+ called on a dynamically-loaded CAF, we add it to the CAF list
+ instead of the old-generation mutable list, and save away its
+ old info pointer (in caf->saved_info) for later reversion.
+
+ To revert all the CAFs, we traverse the CAF list and reset the
+ info pointer to caf->saved_info, then throw away the CAF list.
+ (see GC.c:revertCAFs()).
+
+ -- SDM 29/1/01
+
+ -------------------------------------------------------------------------- */
+
+// Register a newly-entered CAF with the storage manager; see the CAF
+// management notes above.  With keepCAFs set (GHCi + dynamic libraries)
+// the CAF is retained forever on caf_list; otherwise it goes on the
+// oldest generation's mutable list so younger collections treat it as
+// a root.
+void
+newCAF(StgClosure* caf)
+{
+ ACQUIRE_SM_LOCK;
+
+ if(keepCAFs)
+ {
+ // HACK:
+ // If we are in GHCi _and_ we are using dynamic libraries,
+ // then we can't redirect newCAF calls to newDynCAF (see below),
+ // so we make newCAF behave almost like newDynCAF.
+ // The dynamic libraries might be used by both the interpreted
+ // program and GHCi itself, so they must not be reverted.
+ // This also means that in GHCi with dynamic libraries, CAFs are not
+ // garbage collected. If this turns out to be a problem, we could
+ // do another hack here and do an address range test on caf to figure
+ // out whether it is from a dynamic library.
+ ((StgIndStatic *)caf)->saved_info = (StgInfoTable *)caf->header.info;
+ ((StgIndStatic *)caf)->static_link = caf_list;
+ caf_list = caf;
+ }
+ else
+ {
+ /* Put this CAF on the mutable list for the old generation.
+ * This is a HACK - the IND_STATIC closure doesn't really have
+ * a mut_link field, but we pretend it has - in fact we re-use
+ * the STATIC_LINK field for the time being, because when we
+ * come to do a major GC we won't need the mut_link field
+ * any more and can use it as a STATIC_LINK.
+ */
+ ((StgIndStatic *)caf)->saved_info = NULL;
+ recordMutableGen(caf, oldest_gen);
+ }
+
+ RELEASE_SM_LOCK;
+
+#ifdef PAR
+ /* If we are PAR or DIST then we never forget a CAF */
+ { globalAddr *newGA;
+ //debugBelch("<##> Globalising CAF %08x %s",caf,info_type(caf));
+ newGA=makeGlobal(caf,rtsTrue); /*given full weight*/
+ ASSERT(newGA);
+ }
+#endif /* PAR */
+}
+
+// An alternate version of newCaf which is used for dynamically loaded
+// object code in GHCi. In this case we want to retain *all* CAFs in
+// the object code, because they might be demanded at any time from an
+// expression evaluated on the command line.
+// Also, GHCi might want to revert CAFs, so we add these to the
+// revertible_caf_list.
+//
+// The linker hackily arranges that references to newCaf from dynamic
+// code end up pointing to newDynCAF.
+// Retain a dynamically-loaded CAF on revertible_caf_list, saving its
+// original info pointer so GHCi can later revert it (see comment above).
+void
+newDynCAF(StgClosure *caf)
+{
+ ACQUIRE_SM_LOCK;
+
+ ((StgIndStatic *)caf)->saved_info = (StgInfoTable *)caf->header.info;
+ ((StgIndStatic *)caf)->static_link = revertible_caf_list;
+ revertible_caf_list = caf;
+
+ RELEASE_SM_LOCK;
+}
+
+/* -----------------------------------------------------------------------------
+ Nursery management.
+ -------------------------------------------------------------------------- */
+
+// Extend a (doubly-linked) nursery block list with 'blocks' fresh blocks,
+// consing them on in front of 'tail'.  Returns the new head of the list.
+// NOTE(review): if blocks == 0 and tail == NULL, the final
+// 'tail->u.back = NULL' would dereference NULL; callers appear to always
+// request at least one block — confirm.
+static bdescr *
+allocNursery (step *stp, bdescr *tail, nat blocks)
+{
+ bdescr *bd;
+ nat i;
+
+ // Allocate a nursery: we allocate fresh blocks one at a time and
+ // cons them on to the front of the list, not forgetting to update
+ // the back pointer on the tail of the list to point to the new block.
+ for (i=0; i < blocks; i++) {
+ // @LDV profiling
+ /*
+ processNursery() in LdvProfile.c assumes that every block group in
+ the nursery contains only a single block. So, if a block group is
+ given multiple blocks, change processNursery() accordingly.
+ */
+ bd = allocBlock();
+ bd->link = tail;
+ // double-link the nursery: we might need to insert blocks
+ if (tail != NULL) {
+ tail->u.back = bd;
+ }
+ bd->step = stp;
+ bd->gen_no = 0;
+ bd->flags = 0;
+ bd->free = bd->start;
+ tail = bd;
+ }
+ tail->u.back = NULL;
+ return tail;
+}
+
+// Point each Capability at its nursery (one nursery per capability in
+// THREADED_RTS; the single nursery otherwise) and reset its current
+// allocation block.
+static void
+assignNurseriesToCapabilities (void)
+{
+#ifdef THREADED_RTS
+ nat i;
+
+ for (i = 0; i < n_nurseries; i++) {
+ capabilities[i].r.rNursery = &nurseries[i];
+ capabilities[i].r.rCurrentNursery = nurseries[i].blocks;
+ capabilities[i].r.rCurrentAlloc = NULL;
+ }
+#else /* THREADED_RTS */
+ MainCapability.r.rNursery = &nurseries[0];
+ MainCapability.r.rCurrentNursery = nurseries[0].blocks;
+ MainCapability.r.rCurrentAlloc = NULL;
+#endif
+}
+
+// Allocate minAllocAreaSize blocks for each nursery and attach the
+// nurseries to their capabilities.
+void
+allocNurseries( void )
+{
+ nat i;
+
+ for (i = 0; i < n_nurseries; i++) {
+ nurseries[i].blocks =
+ allocNursery(&nurseries[i], NULL,
+ RtsFlags.GcFlags.minAllocAreaSize);
+ nurseries[i].n_blocks = RtsFlags.GcFlags.minAllocAreaSize;
+ nurseries[i].old_blocks = NULL;
+ nurseries[i].n_old_blocks = 0;
+ /* hp, hpLim, hp_bd, to_space etc. aren't used in the nursery */
+ }
+ assignNurseriesToCapabilities();
+}
+
+// Reset every nursery block's free pointer back to the start of the
+// block (making the nursery empty again), then re-attach nurseries to
+// capabilities.  In sanity-checking builds the block contents are
+// scrubbed with 0xaa to catch stale references.
+void
+resetNurseries( void )
+{
+ nat i;
+ bdescr *bd;
+ step *stp;
+
+ for (i = 0; i < n_nurseries; i++) {
+ stp = &nurseries[i];
+ for (bd = stp->blocks; bd; bd = bd->link) {
+ bd->free = bd->start;
+ ASSERT(bd->gen_no == 0);
+ ASSERT(bd->step == stp);
+ IF_DEBUG(sanity,memset(bd->start, 0xaa, BLOCK_SIZE));
+ }
+ }
+ assignNurseriesToCapabilities();
+}
+
+// Total number of blocks across all nurseries (sum of n_blocks).
+lnat
+countNurseryBlocks (void)
+{
+ nat i;
+ lnat blocks = 0;
+
+ for (i = 0; i < n_nurseries; i++) {
+ blocks += nurseries[i].n_blocks;
+ }
+ return blocks;
+}
+
+// Grow or shrink one nursery to exactly 'blocks' blocks.  Growing
+// prepends fresh blocks; shrinking frees block groups from the front
+// (a freed large group may overshoot, in which case we top back up).
+static void
+resizeNursery ( step *stp, nat blocks )
+{
+ bdescr *bd;
+ nat nursery_blocks;
+
+ nursery_blocks = stp->n_blocks;
+ if (nursery_blocks == blocks) return;
+
+ if (nursery_blocks < blocks) {
+ IF_DEBUG(gc, debugBelch("Increasing size of nursery to %d blocks\n",
+ blocks));
+ stp->blocks = allocNursery(stp, stp->blocks, blocks-nursery_blocks);
+ }
+ else {
+ bdescr *next_bd;
+
+ IF_DEBUG(gc, debugBelch("Decreasing size of nursery to %d blocks\n",
+ blocks));
+
+ bd = stp->blocks;
+ while (nursery_blocks > blocks) {
+ next_bd = bd->link;
+ next_bd->u.back = NULL;
+ nursery_blocks -= bd->blocks; // might be a large block
+ freeGroup(bd);
+ bd = next_bd;
+ }
+ stp->blocks = bd;
+ // might have gone just under, by freeing a large block, so make
+ // up the difference.
+ if (nursery_blocks < blocks) {
+ stp->blocks = allocNursery(stp, stp->blocks, blocks-nursery_blocks);
+ }
+ }
+
+ stp->n_blocks = blocks;
+ ASSERT(countBlocks(stp->blocks) == stp->n_blocks);
+}
+
+//
+// Resize each of the nurseries to the specified size.
+//
+// Resize every nursery to the same per-nursery block count.
+void
+resizeNurseriesFixed (nat blocks)
+{
+ nat i;
+ for (i = 0; i < n_nurseries; i++) {
+ resizeNursery(&nurseries[i], blocks);
+ }
+}
+
+//
+// Resize the nurseries to the total specified size.
+//
+// Distribute a total of 'blocks' blocks evenly over the nurseries.
+// Note: integer division — up to n_nurseries-1 blocks of the total
+// remain unused.
+void
+resizeNurseries (nat blocks)
+{
+ // If there are multiple nurseries, then we just divide the number
+ // of available blocks between them.
+ resizeNurseriesFixed(blocks / n_nurseries);
+}
+
+/* -----------------------------------------------------------------------------
+ The allocate() interface
+
+ allocate(n) always succeeds, and returns a chunk of memory n words
+ long. n can be larger than the size of a block if necessary, in
+ which case a contiguous block group will be allocated.
+ -------------------------------------------------------------------------- */
+
+// Allocate n words under sm_mutex.  Large requests (>= the large-object
+// threshold) get a fresh block group chained onto g0s0->large_objects;
+// small requests bump-allocate from the current small_alloc_list block,
+// opening a new block when the current one cannot hold n words.
+StgPtr
+allocate( nat n )
+{
+ bdescr *bd;
+ StgPtr p;
+
+ ACQUIRE_SM_LOCK;
+
+ TICK_ALLOC_HEAP_NOCTR(n);
+ CCS_ALLOC(CCCS,n);
+
+ /* big allocation (>LARGE_OBJECT_THRESHOLD) */
+ /* ToDo: allocate directly into generation 1 */
+ if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
+ nat req_blocks = (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
+ bd = allocGroup(req_blocks);
+ dbl_link_onto(bd, &g0s0->large_objects);
+ g0s0->n_large_blocks += req_blocks;
+ bd->gen_no = 0;
+ bd->step = g0s0;
+ bd->flags = BF_LARGE;
+ bd->free = bd->start + n;
+ alloc_blocks += req_blocks;
+ RELEASE_SM_LOCK;
+ return bd->start;
+
+ /* small allocation (<LARGE_OBJECT_THRESHOLD) */
+ } else if (small_alloc_list == NULL || alloc_Hp + n > alloc_HpLim) {
+ if (small_alloc_list) {
+ small_alloc_list->free = alloc_Hp;
+ }
+ bd = allocBlock();
+ bd->link = small_alloc_list;
+ small_alloc_list = bd;
+ bd->gen_no = 0;
+ bd->step = g0s0;
+ bd->flags = 0;
+ alloc_Hp = bd->start;
+ alloc_HpLim = bd->start + BLOCK_SIZE_W;
+ alloc_blocks++;
+ }
+
+ p = alloc_Hp;
+ alloc_Hp += n;
+ RELEASE_SM_LOCK;
+ return p;
+}
+
+// Amount allocated via allocate() since the last GC, minus the unused
+// slack in the current small-alloc and pinned-object blocks.
+// NOTE(review): despite the name, this is computed in words
+// (BLOCK_SIZE_W units), not bytes — confirm against callers.
+lnat
+allocated_bytes( void )
+{
+ lnat allocated;
+
+ allocated = alloc_blocks * BLOCK_SIZE_W - (alloc_HpLim - alloc_Hp);
+ if (pinned_object_block != NULL) {
+ allocated -= (pinned_object_block->start + BLOCK_SIZE_W) -
+ pinned_object_block->free;
+ }
+
+ return allocated;
+}
+
+// Flush the cached bump pointer (alloc_Hp) back into the head block's
+// descriptor so the block's free pointer is consistent again.
+void
+tidyAllocateLists (void)
+{
+ if (small_alloc_list != NULL) {
+ ASSERT(alloc_Hp >= small_alloc_list->start &&
+ alloc_Hp <= small_alloc_list->start + BLOCK_SIZE);
+ small_alloc_list->free = alloc_Hp;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ allocateLocal()
+
+ This allocates memory in the current thread - it is intended for
+ use primarily from STG-land where we have a Capability. It is
+ better than allocate() because it doesn't require taking the
+ sm_mutex lock in the common case.
+
+ Memory is allocated directly from the nursery if possible (but not
+ from the current nursery block, so as not to interfere with
+ Hp/HpLim).
+ -------------------------------------------------------------------------- */
+
+// Allocate n words from the given Capability's nursery, avoiding
+// sm_mutex in the common case.  Large objects and fresh-block
+// allocation still take the lock; see the comment block above for the
+// full rationale.
+StgPtr
+allocateLocal (Capability *cap, nat n)
+{
+ bdescr *bd;
+ StgPtr p;
+
+ TICK_ALLOC_HEAP_NOCTR(n);
+ CCS_ALLOC(CCCS,n);
+
+ /* big allocation (>LARGE_OBJECT_THRESHOLD) */
+ /* ToDo: allocate directly into generation 1 */
+ if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
+ nat req_blocks = (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
+ ACQUIRE_SM_LOCK;
+ bd = allocGroup(req_blocks);
+ dbl_link_onto(bd, &g0s0->large_objects);
+ g0s0->n_large_blocks += req_blocks;
+ bd->gen_no = 0;
+ bd->step = g0s0;
+ bd->flags = BF_LARGE;
+ bd->free = bd->start + n;
+ alloc_blocks += req_blocks;
+ RELEASE_SM_LOCK;
+ return bd->start;
+
+ /* small allocation (<LARGE_OBJECT_THRESHOLD) */
+ } else {
+
+ bd = cap->r.rCurrentAlloc;
+ if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
+
+ // The CurrentAlloc block is full, we need to find another
+ // one. First, we try taking the next block from the
+ // nursery:
+ bd = cap->r.rCurrentNursery->link;
+
+ if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
+ // The nursery is empty, or the next block is already
+ // full: allocate a fresh block (we can't fail here).
+ ACQUIRE_SM_LOCK;
+ bd = allocBlock();
+ cap->r.rNursery->n_blocks++;
+ RELEASE_SM_LOCK;
+ bd->gen_no = 0;
+ bd->step = cap->r.rNursery;
+ bd->flags = 0;
+ } else {
+ // we have a block in the nursery: take it and put
+ // it at the *front* of the nursery list, and use it
+ // to allocate() from.
+ cap->r.rCurrentNursery->link = bd->link;
+ if (bd->link != NULL) {
+ bd->link->u.back = cap->r.rCurrentNursery;
+ }
+ }
+ dbl_link_onto(bd, &cap->r.rNursery->blocks);
+ cap->r.rCurrentAlloc = bd;
+ IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery));
+ }
+ }
+ p = bd->free;
+ bd->free += n;
+ return p;
+}
+
+/* ---------------------------------------------------------------------------
+ Allocate a fixed/pinned object.
+
+ We allocate small pinned objects into a single block, allocating a
+ new block when the current one overflows. The block is chained
+ onto the large_object_list of generation 0 step 0.
+
+ NOTE: The GC can't in general handle pinned objects. This
+ interface is only safe to use for ByteArrays, which have no
+ pointers and don't require scavenging. It works because the
+ block's descriptor has the BF_LARGE flag set, so the block is
+ treated as a large object and chained onto various lists, rather
+ than the individual objects being copied. However, when it comes
+ to scavenge the block, the GC will only scavenge the first object.
+ The reason is that the GC can't linearly scan a block of pinned
+ objects at the moment (doing so would require using the
+ mostly-copying techniques). But since we're restricting ourselves
+ to pinned ByteArrays, not scavenging is ok.
+
+ This function is called by newPinnedByteArray# which immediately
+ fills the allocated memory with a MutableByteArray#.
+ ------------------------------------------------------------------------- */
+
+// Allocate n words of pinned (non-moving) memory; see the long NOTE
+// above for why this is only safe for pointer-free ByteArrays.  Small
+// requests are bump-allocated from pinned_object_block (BF_PINNED |
+// BF_LARGE); large requests fall through to allocate(), which already
+// yields non-moving storage.
+StgPtr
+allocatePinned( nat n )
+{
+ StgPtr p;
+ bdescr *bd = pinned_object_block;
+
+ // If the request is for a large object, then allocate()
+ // will give us a pinned object anyway.
+ if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
+ return allocate(n);
+ }
+
+ ACQUIRE_SM_LOCK;
+
+ TICK_ALLOC_HEAP_NOCTR(n);
+ CCS_ALLOC(CCCS,n);
+
+ // we always return 8-byte aligned memory. bd->free must be
+ // 8-byte aligned to begin with, so we just round up n to
+ // the nearest multiple of 8 bytes.
+ if (sizeof(StgWord) == 4) {
+ n = (n+1) & ~1;
+ }
+
+ // If we don't have a block of pinned objects yet, or the current
+ // one isn't large enough to hold the new object, allocate a new one.
+ if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) {
+ pinned_object_block = bd = allocBlock();
+ dbl_link_onto(bd, &g0s0->large_objects);
+ bd->gen_no = 0;
+ bd->step = g0s0;
+ bd->flags = BF_PINNED | BF_LARGE;
+ bd->free = bd->start;
+ alloc_blocks++;
+ }
+
+ p = bd->free;
+ bd->free += n;
+ RELEASE_SM_LOCK;
+ return p;
+}
+
+/* -----------------------------------------------------------------------------
+ This is the write barrier for MUT_VARs, a.k.a. IORefs. A
+ MUT_VAR_CLEAN object is not on the mutable list; a MUT_VAR_DIRTY
+ is. When written to, a MUT_VAR_CLEAN turns into a MUT_VAR_DIRTY
+ and is put on the mutable list.
+ -------------------------------------------------------------------------- */
+
+// Write barrier for MUT_VARs (see comment above): flip CLEAN -> DIRTY,
+// and record the variable on the capability's mutable list only when it
+// lives outside generation 0 (bd->gen_no > 0).
+void
+dirty_MUT_VAR(StgRegTable *reg, StgClosure *p)
+{
+ Capability *cap = regTableToCapability(reg);
+ bdescr *bd;
+ if (p->header.info == &stg_MUT_VAR_CLEAN_info) {
+ p->header.info = &stg_MUT_VAR_DIRTY_info;
+ bd = Bdescr((StgPtr)p);
+ if (bd->gen_no > 0) recordMutableCap(p,cap,bd->gen_no);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Allocation functions for GMP.
+
+ These all use the allocate() interface - we can't have any garbage
+ collection going on during a gmp operation, so we use allocate()
+ which always succeeds. The gmp operations which might need to
+ allocate will ask the storage manager (via doYouWantToGC()) whether
+ a garbage collection is required, in case we get into a loop doing
+ only allocate() style allocation.
+ -------------------------------------------------------------------------- */
+
+// GMP allocation hook: wrap the request in a heap-allocated StgArrWords
+// and hand GMP a pointer to its payload, so GMP limbs live on the GC'd
+// heap.
+static void *
+stgAllocForGMP (size_t size_in_bytes)
+{
+ StgArrWords* arr;
+ nat data_size_in_words, total_size_in_words;
+
+ /* round up to a whole number of words */
+ /* NOTE(review): the conventional round-up is (bytes + sizeof(W_) - 1);
+ the "+ 1" here can over-allocate one extra word (harmless) — confirm
+ this is intentional. */
+ data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
+ total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
+
+ /* allocate and fill it in. */
+#if defined(THREADED_RTS)
+ arr = (StgArrWords *)allocateLocal(myTask()->cap, total_size_in_words);
+#else
+ arr = (StgArrWords *)allocateLocal(&MainCapability, total_size_in_words);
+#endif
+ SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
+
+ /* and return a ptr to the goods inside the array */
+ return arr->payload;
+}
+
+// GMP realloc hook: allocate a fresh array and copy the old contents
+// byte by byte; the old storage is simply left for the GC to reclaim.
+static void *
+stgReallocForGMP (void *ptr, size_t old_size, size_t new_size)
+{
+ void *new_stuff_ptr = stgAllocForGMP(new_size);
+ nat i = 0;
+ char *p = (char *) ptr;
+ char *q = (char *) new_stuff_ptr;
+
+ for (; i < old_size; i++, p++, q++) {
+ *q = *p;
+ }
+
+ return(new_stuff_ptr);
+}
+
+// GMP free hook: intentionally a no-op — the storage is heap-allocated
+// and reclaimed by the garbage collector.
+static void
+stgDeallocForGMP (void *ptr STG_UNUSED,
+ size_t size STG_UNUSED)
+{
+ /* easy for us: the garbage collector does the dealloc'n */
+}
+
+/* -----------------------------------------------------------------------------
+ * Stats and stuff
+ * -------------------------------------------------------------------------- */
+
+/* -----------------------------------------------------------------------------
+ * calcAllocated()
+ *
+ * Approximate how much we've allocated: number of blocks in the
+ * nursery + blocks allocated via allocate() - unused nusery blocks.
+ * This leaves a little slop at the end of each block, and doesn't
+ * take into account large objects (ToDo).
+ * -------------------------------------------------------------------------- */
+
+// Approximate words allocated since the last GC (see header comment
+// above): allocate()d words plus the nursery, minus blocks not yet
+// reached and the slack in each current nursery block.  Also
+// accumulates into total_allocated.
+// NOTE(review): 'allocated' is nat but the function returns lnat —
+// could truncate on platforms where lnat is wider; confirm.
+lnat
+calcAllocated( void )
+{
+ nat allocated;
+ bdescr *bd;
+
+ allocated = allocated_bytes();
+ allocated += countNurseryBlocks() * BLOCK_SIZE_W;
+
+ {
+#ifdef THREADED_RTS
+ nat i;
+ for (i = 0; i < n_nurseries; i++) {
+ Capability *cap;
+ for ( bd = capabilities[i].r.rCurrentNursery->link;
+ bd != NULL; bd = bd->link ) {
+ allocated -= BLOCK_SIZE_W;
+ }
+ cap = &capabilities[i];
+ if (cap->r.rCurrentNursery->free <
+ cap->r.rCurrentNursery->start + BLOCK_SIZE_W) {
+ allocated -= (cap->r.rCurrentNursery->start + BLOCK_SIZE_W)
+ - cap->r.rCurrentNursery->free;
+ }
+ }
+#else
+ bdescr *current_nursery = MainCapability.r.rCurrentNursery;
+
+ for ( bd = current_nursery->link; bd != NULL; bd = bd->link ) {
+ allocated -= BLOCK_SIZE_W;
+ }
+ if (current_nursery->free < current_nursery->start + BLOCK_SIZE_W) {
+ allocated -= (current_nursery->start + BLOCK_SIZE_W)
+ - current_nursery->free;
+ }
+#endif
+ }
+
+ total_allocated += allocated;
+ return allocated;
+}
+
+/* Approximate the amount of live data in the heap. To be called just
+ * after garbage collection (see GarbageCollect()).
+ */
+// Approximate live words after a GC.  In the single-generation case the
+// whole of g0s0 (minus slop in the last block) counts; otherwise sum
+// every step except g0s0 (the allocation area is skipped).
+extern lnat
+calcLive(void)
+{
+ nat g, s;
+ lnat live = 0;
+ step *stp;
+
+ if (RtsFlags.GcFlags.generations == 1) {
+ live = (g0s0->n_blocks - 1) * BLOCK_SIZE_W +
+ ((lnat)g0s0->hp_bd->free - (lnat)g0s0->hp_bd->start) / sizeof(W_);
+ return live;
+ }
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ /* approximate amount of live data (doesn't take into account slop
+ * at end of each block).
+ */
+ if (g == 0 && s == 0) {
+ continue;
+ }
+ stp = &generations[g].steps[s];
+ live += (stp->n_large_blocks + stp->n_blocks - 1) * BLOCK_SIZE_W;
+ if (stp->hp_bd != NULL) {
+ live += ((lnat)stp->hp_bd->free - (lnat)stp->hp_bd->start)
+ / sizeof(W_);
+ }
+ if (stp->scavd_hp != NULL) {
+ live -= (P_)(BLOCK_ROUND_UP(stp->scavd_hp)) - stp->scavd_hp;
+ }
+ }
+ }
+ return live;
+}
+
+/* Approximate the number of blocks that will be needed at the next
+ * garbage collection.
+ *
+ * Assume: all data currently live will remain live. Steps that will
+ * be collected next time will therefore need twice as many blocks
+ * since all the data will be copied.
+ */
+// Estimate blocks needed at the next GC (see header comment above):
+// steps over their generation's limit that are not compacted will be
+// copied, so they need twice their current block count.
+extern lnat
+calcNeeded(void)
+{
+ lnat needed = 0;
+ nat g, s;
+ step *stp;
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ if (g == 0 && s == 0) { continue; }
+ stp = &generations[g].steps[s];
+ if (generations[g].steps[0].n_blocks +
+ generations[g].steps[0].n_large_blocks
+ > generations[g].max_blocks
+ && stp->is_compacted == 0) {
+ needed += 2 * stp->n_blocks;
+ } else {
+ needed += stp->n_blocks;
+ }
+ }
+ }
+ return needed;
+}
+
+/* -----------------------------------------------------------------------------
+ Debugging
+
+ memInventory() checks for memory leaks by counting up all the
+ blocks we know about and comparing that to the number of blocks
+ allegedly floating around in the system.
+ -------------------------------------------------------------------------- */
+
+#ifdef DEBUG
+
+// Count all blocks owned by one step: live blocks, old (from-space)
+// blocks, and large-object groups (with a correction for megablock
+// groups — see inline comment).
+static lnat
+stepBlocks (step *stp)
+{
+ lnat total_blocks;
+ bdescr *bd;
+
+ total_blocks = stp->n_blocks;
+ total_blocks += stp->n_old_blocks;
+ for (bd = stp->large_objects; bd; bd = bd->link) {
+ total_blocks += bd->blocks;
+ /* hack for megablock groups: they have an extra block or two in
+ the second and subsequent megablocks where the block
+ descriptors would normally go.
+ */
+ if (bd->blocks > BLOCKS_PER_MBLOCK) {
+ total_blocks -= (MBLOCK_SIZE / BLOCK_SIZE - BLOCKS_PER_MBLOCK)
+ * (bd->blocks/(MBLOCK_SIZE/BLOCK_SIZE));
+ }
+ }
+ return total_blocks;
+}
+
+// DEBUG leak check: walk every block list the storage manager knows
+// about (mutable lists, steps, nurseries, small_alloc_list, retainer
+// stack, arenas, free list) and assert the total matches
+// mblocks_allocated * BLOCKS_PER_MBLOCK.
+void
+memInventory(void)
+{
+ nat g, s, i;
+ step *stp;
+ bdescr *bd;
+ lnat total_blocks = 0, free_blocks = 0;
+
+ /* count the blocks we current have */
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (i = 0; i < n_capabilities; i++) {
+ for (bd = capabilities[i].mut_lists[g]; bd != NULL; bd = bd->link) {
+ total_blocks += bd->blocks;
+ }
+ }
+ for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
+ total_blocks += bd->blocks;
+ }
+ for (s = 0; s < generations[g].n_steps; s++) {
+ if (g==0 && s==0) continue;
+ stp = &generations[g].steps[s];
+ total_blocks += stepBlocks(stp);
+ }
+ }
+
+ for (i = 0; i < n_nurseries; i++) {
+ total_blocks += stepBlocks(&nurseries[i]);
+ }
+#ifdef THREADED_RTS
+ // We put pinned object blocks in g0s0, so better count blocks there too.
+ total_blocks += stepBlocks(g0s0);
+#endif
+
+ /* any blocks held by allocate() */
+ for (bd = small_alloc_list; bd; bd = bd->link) {
+ total_blocks += bd->blocks;
+ }
+
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
+ total_blocks += retainerStackBlocks();
+ }
+#endif
+
+ // count the blocks allocated by the arena allocator
+ total_blocks += arenaBlocks();
+
+ /* count the blocks on the free list */
+ free_blocks = countFreeList();
+
+ if (total_blocks + free_blocks != mblocks_allocated *
+ BLOCKS_PER_MBLOCK) {
+ debugBelch("Blocks: %ld live + %ld free = %ld total (%ld around)\n",
+ total_blocks, free_blocks, total_blocks + free_blocks,
+ mblocks_allocated * BLOCKS_PER_MBLOCK);
+ }
+
+ ASSERT(total_blocks + free_blocks == mblocks_allocated * BLOCKS_PER_MBLOCK);
+}
+
+
+// Number of blocks in a chain of block descriptors (sums bd->blocks,
+// so block groups are counted at their full size).
+nat
+countBlocks(bdescr *bd)
+{
+ nat n;
+ for (n=0; bd != NULL; bd=bd->link) {
+ n += bd->blocks;
+ }
+ return n;
+}
+
+/* Full heap sanity check. */
+void
+checkSanity( void )
+{
+ nat g, s;
+
+ if (RtsFlags.GcFlags.generations == 1) {
+ checkHeap(g0s0->blocks);
+ checkChain(g0s0->large_objects);
+ } else {
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ if (g == 0 && s == 0) { continue; }
+ ASSERT(countBlocks(generations[g].steps[s].blocks)
+ == generations[g].steps[s].n_blocks);
+ ASSERT(countBlocks(generations[g].steps[s].large_objects)
+ == generations[g].steps[s].n_large_blocks);
+ checkHeap(generations[g].steps[s].blocks);
+ checkChain(generations[g].steps[s].large_objects);
+ if (g > 0) {
+ checkMutableList(generations[g].mut_list, g);
+ }
+ }
+ }
+
+ for (s = 0; s < n_nurseries; s++) {
+ ASSERT(countBlocks(nurseries[s].blocks)
+ == nurseries[s].n_blocks);
+ ASSERT(countBlocks(nurseries[s].large_objects)
+ == nurseries[s].n_large_blocks);
+ }
+
+ checkFreeListSanity();
+ }
+}
+
+/* Nursery sanity check */
+// DEBUG: verify a nursery's doubly-linked structure (each block's
+// u.back points at its predecessor) and that the block count matches
+// n_blocks.
+void
+checkNurserySanity( step *stp )
+{
+ bdescr *bd, *prev;
+ nat blocks = 0;
+
+ prev = NULL;
+ for (bd = stp->blocks; bd != NULL; bd = bd->link) {
+ ASSERT(bd->u.back == prev);
+ prev = bd;
+ blocks += bd->blocks;
+ }
+ ASSERT(blocks == stp->n_blocks);
+}
+
+// handy function for use in gdb, because Bdescr() is inlined.
+extern bdescr *_bdescr( StgPtr p );
+
+// Out-of-line wrapper around the (normally inlined) Bdescr() macro so
+// it can be called from gdb.
+bdescr *
+_bdescr( StgPtr p )
+{
+ return Bdescr(p);
+}
+
+#endif
diff --git a/rts/Task.c b/rts/Task.c
new file mode 100644
index 0000000000..7366480094
--- /dev/null
+++ b/rts/Task.c
@@ -0,0 +1,315 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2001-2005
+ *
+ * The task manager subsystem. Tasks execute STG code, with this
+ * module providing the API which the Scheduler uses to control their
+ * creation and destruction.
+ *
+ * -------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "OSThreads.h"
+#include "Task.h"
+#include "Capability.h"
+#include "Stats.h"
+#include "RtsFlags.h"
+#include "Schedule.h"
+#include "Hash.h"
+
+#if HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+
+// Task lists and global counters.
+// Locks required: sched_mutex.
+Task *all_tasks = NULL;
+static Task *task_free_list = NULL; // singly-linked
+static nat taskCount;
+#define DEFAULT_MAX_WORKERS 64
+static nat maxWorkers; // we won't create more workers than this
+static nat tasksRunning;
+static nat workerCount;
+
+/* -----------------------------------------------------------------------------
+ * Remembering the current thread's Task
+ * -------------------------------------------------------------------------- */
+
+// A thread-local-storage key that we can use to get access to the
+// current thread's Task structure.
+#if defined(THREADED_RTS)
+ThreadLocalKey currentTaskKey;
+#else
+Task *my_task;
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Rest of the Task API
+ * -------------------------------------------------------------------------- */
+
+// One-shot initialisation of the task manager: zero the global
+// counters and, in the threaded RTS, create the thread-local-storage
+// key used by myTask()/setMyTask().  Safe to call more than once;
+// subsequent calls are no-ops.
+void
+initTaskManager (void)
+{
+    static int initialized = 0;
+
+    if (!initialized) {
+	taskCount = 0;
+	workerCount = 0;
+	tasksRunning = 0;
+	maxWorkers = DEFAULT_MAX_WORKERS;
+	initialized = 1;
+#if defined(THREADED_RTS)
+	newThreadLocalKey(&currentTaskKey);
+#endif
+    }
+}
+
+
+// Shut down the task manager.  Currently this only emits a debug
+// message; no Task structures are freed and running tasks are not
+// stopped here.
+void
+stopTaskManager (void)
+{
+    IF_DEBUG(scheduler, sched_belch("stopping task manager, %d tasks still running", tasksRunning));
+}
+
+
+// Allocate and initialise a fresh Task structure and link it onto
+// the global all_tasks list.  Callers (newBoundTask, startWorkerTask)
+// hold sched_mutex, which protects all_tasks and the counters.
+static Task*
+newTask (void)
+{
+#if defined(THREADED_RTS)
+    Ticks currentElapsedTime, currentUserTime;
+#endif
+    Task *task;
+
+    task = stgMallocBytes(sizeof(Task), "newTask");
+
+    task->cap = NULL;
+    task->stopped = rtsFalse;
+    task->suspended_tso = NULL;
+    task->tso = NULL;
+    task->stat = NoStatus;
+    task->ret = NULL;
+
+#if defined(THREADED_RTS)
+    initCondition(&task->cond);
+    initMutex(&task->lock);
+    task->wakeup = rtsFalse;
+#endif
+
+#if defined(THREADED_RTS)
+    // Start the per-Task time-accounting clocks.
+    currentUserTime = getThreadCPUTime();
+    currentElapsedTime = getProcessElapsedTime();
+    task->mut_time = 0.0;
+    task->mut_etime = 0.0;
+    task->gc_time = 0.0;
+    task->gc_etime = 0.0;
+    task->muttimestart = currentUserTime;
+    task->elapsedtimestart = currentElapsedTime;
+#endif
+
+    task->prev = NULL;
+    task->next = NULL;
+    task->return_link = NULL;
+
+    // Push onto the global list of all tasks (never removed).
+    task->all_link = all_tasks;
+    all_tasks = task;
+
+    taskCount++;
+    // NOTE(review): workerCount is incremented here for *every* new
+    // Task, bound or worker, and incremented again in
+    // startWorkerTask() for workers -- so workers appear to count
+    // twice against the maxWorkers check, and bound tasks count as
+    // workers.  Confirm whether this is intended.
+    workerCount++;
+
+    return task;
+}
+
+// Create (or recycle from the free list) a Task for a bound thread:
+// an external OS thread entering the RTS.  Associates the Task with
+// the calling OS thread via taskEnter().
+// Requires: sched_mutex (asserted below).
+Task *
+newBoundTask (void)
+{
+    Task *task;
+
+    ASSERT_LOCK_HELD(&sched_mutex);
+    if (task_free_list == NULL) {
+	task = newTask();
+    } else {
+	// Recycle a stopped Task; re-initialise the fields that
+	// matter for a fresh bound call-in.
+	task = task_free_list;
+	task_free_list = task->next;
+	task->next = NULL;
+	task->prev = NULL;
+	task->stopped = rtsFalse;
+    }
+#if defined(THREADED_RTS)
+    task->id = osThreadId();
+#endif
+    // A recycled Task must have relinquished its Capability.
+    ASSERT(task->cap == NULL);
+
+    tasksRunning++;
+
+    // Make this the current OS thread's Task (saving any previous one
+    // on the prev_stack chain).
+    taskEnter(task);
+
+    IF_DEBUG(scheduler,sched_belch("new task (taskCount: %d)", taskCount););
+    return task;
+}
+
+// The current bound task is exiting the RTS: mark it stopped, pop it
+// off the current thread's Task stack (restoring the previous Task,
+// if any), and return the structure to the free list for reuse.
+void
+boundTaskExiting (Task *task)
+{
+    task->stopped = rtsTrue;
+    task->cap = NULL;
+
+#if defined(THREADED_RTS)
+    // Only the owning OS thread may exit its own bound Task.
+    ASSERT(osThreadId() == task->id);
+#endif
+    ASSERT(myTask() == task);
+    // Restore the previous Task for this OS thread (NULL if this was
+    // the outermost call-in).
+    setMyTask(task->prev_stack);
+
+    tasksRunning--;
+
+    // sadly, we need a lock around the free task list. Todo: eliminate.
+    ACQUIRE_LOCK(&sched_mutex);
+    task->next = task_free_list;
+    task_free_list = task;
+    RELEASE_LOCK(&sched_mutex);
+
+    IF_DEBUG(scheduler,sched_belch("task exiting"));
+}
+
+#ifdef THREADED_RTS
+#define TASK_ID(t) (t)->id
+#else
+#define TASK_ID(t) (t)
+#endif
+
+// Forcibly retire a Task without it running any exit path (used by
+// forkProcess(): the other tasks do not survive into the child).
+// Marks it stopped and returns it to the free list; idempotent for
+// already-stopped tasks.
+// Requires: sched_mutex (asserted below).
+void
+discardTask (Task *task)
+{
+    ASSERT_LOCK_HELD(&sched_mutex);
+    if (!task->stopped) {
+	IF_DEBUG(scheduler,sched_belch("discarding task %p", TASK_ID(task)));
+	task->cap = NULL;
+	task->tso = NULL;
+	task->stopped = rtsTrue;
+	tasksRunning--;
+	task->next = task_free_list;
+	task_free_list = task;
+    }
+}
+
+// Notify the task manager that the given Task (which must be the
+// current thread's Task) has stopped.  In the threaded RTS this also
+// finalises the Task's mutator time statistics: mutator time is the
+// CPU/elapsed time since the Task started, minus time spent in GC.
+void
+taskStop (Task *task)
+{
+#if defined(THREADED_RTS)
+    OSThreadId id;
+    Ticks currentElapsedTime, currentUserTime, elapsedGCTime;
+
+    id = osThreadId();
+    ASSERT(task->id == id);
+    ASSERT(myTask() == task);
+
+    currentUserTime = getThreadCPUTime();
+    currentElapsedTime = getProcessElapsedTime();
+
+    // XXX this is wrong; we want elapsed GC time since the
+    // Task started.
+    elapsedGCTime = stat_getElapsedGCTime();
+
+    task->mut_time =
+	currentUserTime - task->muttimestart - task->gc_time;
+    task->mut_etime =
+	currentElapsedTime - task->elapsedtimestart - elapsedGCTime;
+
+    // Clamp at zero: rounding / the XXX above can drive these negative.
+    if (task->mut_time < 0.0) { task->mut_time = 0.0; }
+    if (task->mut_etime < 0.0) { task->mut_etime = 0.0; }
+#endif
+
+    task->stopped = rtsTrue;
+    tasksRunning--;
+}
+
+// After fork(), the child process has none of the parent's OS
+// threads, so the task bookkeeping must be reset.  Currently only
+// a stub (see the #warning); just zeroes taskCount.
+void
+resetTaskManagerAfterFork (void)
+{
+#warning TODO!
+    taskCount = 0;
+}
+
+#if defined(THREADED_RTS)
+
+// Create a new worker Task and OS thread, and attach it to the given
+// Capability as its running_task.  The caller must hold cap->lock
+// (asserted below) and sched_mutex.
+void
+startWorkerTask (Capability *cap, 
+		 void OSThreadProcAttr (*taskStart)(Task *task))
+{
+  int r;
+  OSThreadId tid;
+  Task *task;
+
+  // Crude guard against runaway worker creation (see NOTE(review) in
+  // newTask() about how workerCount is maintained).
+  if (workerCount >= maxWorkers) {
+      barf("too many workers; runaway worker creation?");
+  }
+  workerCount++;
+
+  // A worker always gets a fresh Task structure.
+  task = newTask();
+
+  tasksRunning++;
+
+  // The lock here is to synchronise with taskStart(), to make sure
+  // that we have finished setting up the Task structure before the
+  // worker thread reads it.
+  ACQUIRE_LOCK(&task->lock);
+
+  task->cap = cap;
+
+  // Give the capability directly to the worker; we can't let anyone
+  // else get in, because the new worker Task has nowhere to go to
+  // sleep so that it could be woken up again.
+  ASSERT_LOCK_HELD(&cap->lock);
+  cap->running_task = task;
+
+  r = createOSThread(&tid, (OSThreadProc *)taskStart, task);
+  if (r != 0) {
+    // NOTE(review): message says "startTask" but the function is
+    // startWorkerTask -- consider updating the text.
+    barf("startTask: Can't create new task");
+  }
+
+  IF_DEBUG(scheduler,sched_belch("new worker task (taskCount: %d)", taskCount););
+
+  task->id = tid;
+
+  // ok, finished with the Task struct.
+  RELEASE_LOCK(&task->lock);
+}
+
+#endif /* THREADED_RTS */
+
+#ifdef DEBUG
+
+// Debug helper: a printable identifier for a Task -- the OS thread id
+// in the threaded RTS, otherwise the Task pointer itself.
+static void *taskId(Task *task)
+{
+#ifdef THREADED_RTS
+    return (void *)task->id;
+#else
+    return (void *)task;
+#endif
+}
+
+void printAllTasks(void);
+
+// Debug helper: dump every Task on all_tasks, showing whether it is
+// alive or stopped, which Capability it is on, and whether it is a
+// bound task (has a TSO) or a worker.
+void
+printAllTasks(void)
+{
+    Task *task;
+    for (task = all_tasks; task != NULL; task = task->all_link) {
+	debugBelch("task %p is %s, ", taskId(task), task->stopped ? "stopped" : "alive");
+	if (!task->stopped) {
+	    if (task->cap) {
+		debugBelch("on capability %d, ", task->cap->no);
+	    }
+	    if (task->tso) {
+		debugBelch("bound to thread %d", task->tso->id);
+	    } else {
+		debugBelch("worker");
+	    }
+	}
+	debugBelch("\n");
+    }
+}
+
+#endif
+
diff --git a/rts/Task.h b/rts/Task.h
new file mode 100644
index 0000000000..ca71d2809a
--- /dev/null
+++ b/rts/Task.h
@@ -0,0 +1,271 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2001-2005
+ *
+ * Tasks
+ *
+ * -------------------------------------------------------------------------*/
+
+#ifndef TASK_H
+#define TASK_H
+
+#include "GetTime.h"
+
+/*
+ Definition of a Task
+ --------------------
+
+ A task is an OSThread that runs Haskell code. Every OSThread
+ created by the RTS for the purposes of running Haskell code is a
+ Task, and OS threads that enter the Haskell RTS for the purposes of
+ making a call-in are also Tasks.
+
+ The relationship between the number of tasks and capabilities, and
+ the runtime build (-threaded, -smp etc.) is summarised by the
+ following table:
+
+ build Tasks Capabilities
+ ---------------------------------
+ normal 1 1
+ -threaded N N
+
+ The non-threaded build has a single Task and a single global
+ Capability.
+
+   The THREADED_RTS build allows multiple tasks and multiple Capabilities.
+ Multiple Tasks may all be running Haskell code simultaneously. A task
+ relinquishes its Capability when it is asked to evaluate an external
+ (C) call.
+
+ In general, there may be multiple Tasks for an OS thread. This
+ happens if one Task makes a foreign call from Haskell, and
+ subsequently calls back in to create a new bound thread.
+
+ A particular Task structure can belong to more than one OS thread
+ over its lifetime. This is to avoid creating an unbounded number
+ of Task structures. The stats just accumulate.
+
+ Ownership of Task
+ -----------------
+
+ The OS thread named in the Task structure has exclusive access to
+ the structure, as long as it is the running_task of its Capability.
+ That is, if (task->cap->running_task == task), then task->id owns
+ the Task. Otherwise the Task is owned by the owner of the parent
+ data structure on which it is sleeping; for example, if the task is
+ sleeping on spare_workers field of a Capability, then the owner of the
+ Capability has access to the Task.
+
+ When a task is migrated from sleeping on one Capability to another,
+ its task->cap field must be modified. When the task wakes up, it
+ will read the new value of task->cap to find out which Capability
+ it belongs to. Hence some synchronisation is required on
+ task->cap, and this is why we have task->lock.
+
+ If the Task is not currently owned by task->id, then the thread is
+ either
+
+ (a) waiting on the condition task->cond. The Task is either
+ (1) a bound Task, the TSO will be on a queue somewhere
+ (2) a worker task, on the spare_workers queue of task->cap.
+
+ (b) making a foreign call. The Task will be on the
+ suspended_ccalling_tasks list.
+
+ We re-establish ownership in each case by respectively
+
+ (a) the task is currently blocked in yieldCapability().
+ This call will return when we have ownership of the Task and
+ a Capability. The Capability we get might not be the same
+ as the one we had when we called yieldCapability().
+
+ (b) we must call resumeThread(task), which will safely establish
+ ownership of the Task and a Capability.
+*/
+
+// A Task: the RTS-side state for one OS thread executing Haskell
+// code (see the long design comment above for ownership rules).
+typedef struct Task_ {
+#if defined(THREADED_RTS)
+    OSThreadId id;		// The OS Thread ID of this task
+#endif
+
+    // This points to the Capability that the Task "belongs" to.  If
+    // the Task owns a Capability, then task->cap points to it.  If
+    // the task does not own a Capability, then either (a) if the task
+    // is a worker, then task->cap points to the Capability it belongs
+    // to, or (b) it is returning from a foreign call, then task->cap
+    // points to the Capability with the returning_worker queue that
+    // this Task is on.
+    //
+    // When a task goes to sleep, it may be migrated to a different
+    // Capability.  Hence, we always check task->cap on wakeup.  To
+    // synchronise between the migrater and the migratee, task->lock
+    // must be held when modifying task->cap.
+    struct Capability_ *cap;
+
+    rtsBool    stopped;         // this task has stopped or exited Haskell
+    StgTSO *   suspended_tso;   // the TSO is stashed here when we
+				// make a foreign call (NULL otherwise);
+
+    // The following 3 fields are used by bound threads:
+    StgTSO *   tso;             // the bound TSO (or NULL)
+    SchedulerStatus  stat;      // return status
+    StgClosure **    ret;       // return value
+
+#if defined(THREADED_RTS)
+    Condition cond;             // used for sleeping & waking up this task
+    Mutex lock;			// lock for the condition variable
+
+    // this flag tells the task whether it should wait on task->cond
+    // or just continue immediately.  It's a workaround for the fact
+    // that signalling a condition variable doesn't do anything if the
+    // thread is already running, but we want it to be sticky.
+    rtsBool wakeup;
+#endif
+
+    // Stats that we collect about this task
+    // ToDo: we probably want to put this in a separate TaskStats
+    // structure, so we can share it between multiple Tasks.  We don't
+    // really want separate stats for each call in a nested chain of
+    // foreign->haskell->foreign->haskell calls, but we'll get a
+    // separate Task for each of the haskell calls.
+    Ticks       elapsedtimestart; // process elapsed time at Task start
+    Ticks       muttimestart;     // thread CPU time at Task start
+    Ticks       mut_time;         // accumulated mutator CPU time
+    Ticks       mut_etime;        // accumulated mutator elapsed time
+    Ticks       gc_time;          // accumulated GC CPU time
+    Ticks       gc_etime;         // accumulated GC elapsed time
+
+    // Links tasks onto various lists. (ToDo: do we need double
+    // linking now?)
+    struct Task_ *prev;
+    struct Task_ *next;
+
+    // Links tasks on the returning_tasks queue of a Capability.
+    struct Task_ *return_link;
+
+    // Links tasks on the all_tasks list
+    struct Task_ *all_link;
+
+    // When a Haskell thread makes a foreign call that re-enters
+    // Haskell, we end up with another Task associated with the
+    // current thread.  We have to remember the whole stack of Tasks
+    // associated with the current thread so that we can correctly
+    // save & restore the thread-local current task pointer.
+    struct Task_ *prev_stack;
+} Task;
+
+// A Task is "bound" iff it has a bound TSO attached; otherwise it is
+// a worker.
+INLINE_HEADER rtsBool
+isBoundTask (Task *task) 
+{
+    return (task->tso != NULL);
+}
+
+
+// Linked list of all tasks.
+//
+extern Task *all_tasks;
+
+// Start and stop the task manager.
+// Requires: sched_mutex.
+//
+void initTaskManager (void);
+void stopTaskManager (void);
+
+// Create a new Task for a bound thread
+// Requires: sched_mutex.
+//
+Task *newBoundTask (void);
+
+// The current task is a bound task that is exiting.
+// Requires: sched_mutex.
+//
+void boundTaskExiting (Task *task);
+
+// This must be called when a new Task is associated with the current
+// thread. It sets up the thread-local current task pointer so that
+// myTask() can work.
+INLINE_HEADER void taskEnter (Task *task);
+
+// Notify the task manager that a task has stopped. This is used
+// mainly for stats-gathering purposes.
+// Requires: sched_mutex.
+//
+void taskStop (Task *task);
+
+// Put the task back on the free list, mark it stopped. Used by
+// forkProcess().
+//
+void discardTask (Task *task);
+
+// Get the Task associated with the current OS thread (or NULL if none).
+//
+INLINE_HEADER Task *myTask (void);
+
+// After a fork, the tasks are not carried into the child process, so
+// we must tell the task manager.
+// Requires: sched_mutex.
+//
+void resetTaskManagerAfterFork (void);
+
+#if defined(THREADED_RTS)
+
+// Workers are attached to the supplied Capability. This Capability
+// should not currently have a running_task, because the new task
+// will become the running_task for that Capability.
+// Requires: sched_mutex.
+//
+void startWorkerTask (struct Capability_ *cap,
+ void OSThreadProcAttr (*taskStart)(Task *task));
+
+#endif /* THREADED_RTS */
+
+// -----------------------------------------------------------------------------
+// INLINE functions... private from here on down:
+
+// A thread-local-storage key that we can use to get access to the
+// current thread's Task structure.
+#if defined(THREADED_RTS)
+extern ThreadLocalKey currentTaskKey;
+#else
+extern Task *my_task;
+#endif
+
+//
+// myTask() uses thread-local storage to find the Task associated with
+// the current OS thread. If the current OS thread has multiple
+// Tasks, because it has re-entered the RTS, then the task->prev_stack
+// field is used to store the previous Task.
+//
+// Return the current OS thread's Task: from thread-local storage in
+// the threaded RTS, or the single global my_task otherwise.  May be
+// NULL if no Task has been entered on this thread.
+INLINE_HEADER Task *
+myTask (void)
+{
+#if defined(THREADED_RTS)
+    return getThreadLocalVar(&currentTaskKey);
+#else
+    return my_task;
+#endif
+}
+
+// Set the current OS thread's Task (counterpart of myTask()).
+INLINE_HEADER void
+setMyTask (Task *task)
+{
+#if defined(THREADED_RTS)
+    setThreadLocalVar(&currentTaskKey,task);
+#else
+    my_task = task;
+#endif
+}
+
+// This must be called when a new Task is associated with the current
+// thread. It sets up the thread-local current task pointer so that
+// myTask() can work.
+// Associate a (new) Task with the current OS thread, pushing any
+// existing current Task onto the prev_stack chain so that
+// boundTaskExiting() can restore it on the way out.
+INLINE_HEADER void
+taskEnter (Task *task)
+{
+    // save the current value, just in case this Task has been created
+    // as a result of re-entering the RTS (defaults to NULL):
+    task->prev_stack = myTask();
+    setMyTask(task);
+}
+
+#endif /* TASK_H */
diff --git a/rts/ThreadLabels.c b/rts/ThreadLabels.c
new file mode 100644
index 0000000000..9b9f1723ff
--- /dev/null
+++ b/rts/ThreadLabels.c
@@ -0,0 +1,50 @@
+/* -----------------------------------------------------------------------------
+ * ThreadLabels.c
+ *
+ * (c) The GHC Team 2002-2003
+ *
+ * Table of thread labels.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "ThreadLabels.h"
+#include "RtsUtils.h"
+
+#include <stdlib.h>
+
+#if defined(DEBUG)
+/* to the end */
+static HashTable * threadLabels = NULL;
+
+// Lazily allocate the global thread-label hash table.  Idempotent:
+// a second call leaves the existing table in place.
+void
+initThreadLabelTable(void)
+{
+  if (threadLabels == NULL) {
+    threadLabels = allocHashTable();
+  }
+}
+
+// Set (or replace) the label for the given key; any existing label
+// is removed and freed first.
+void
+updateThreadLabel(StgWord key, void *data)
+{
+  removeThreadLabel(key);
+  insertHashTable(threadLabels,key,data);
+}
+
+// Look up the label for the given key; NULL if none.
+void *
+lookupThreadLabel(StgWord key)
+{
+  return lookupHashTable(threadLabels,key);
+}
+
+// Remove and free the label for the given key, if present.  The
+// table owns the label storage, hence the stgFree here.
+void
+removeThreadLabel(StgWord key)
+{
+  void * old = NULL;
+  if ((old = lookupHashTable(threadLabels,key))) {
+    removeHashTable(threadLabels,key,old);
+    stgFree(old);
+  }  
+}
+#endif /* DEBUG */
diff --git a/rts/ThreadLabels.h b/rts/ThreadLabels.h
new file mode 100644
index 0000000000..97d3d0d241
--- /dev/null
+++ b/rts/ThreadLabels.h
@@ -0,0 +1,27 @@
+/* -----------------------------------------------------------------------------
+ * ThreadLabels.h
+ *
+ * (c) The GHC Team 2002-2003
+ *
+ * Table of thread labels.
+ *
+ * ---------------------------------------------------------------------------*/
+#ifndef __THREADLABELS_H__
+#define __THREADLABELS_H__
+
+#include "Rts.h"
+#include "Hash.h"
+
+void
+initThreadLabelTable(void);
+
+void
+updateThreadLabel(StgWord key, void *data);
+
+void *
+lookupThreadLabel(StgWord key);
+
+void
+removeThreadLabel(StgWord key);
+
+#endif /* __THREADLABELS_H__ */
diff --git a/rts/Ticker.h b/rts/Ticker.h
new file mode 100644
index 0000000000..f9555768b5
--- /dev/null
+++ b/rts/Ticker.h
@@ -0,0 +1,15 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2005
+ *
+ * Ticker interface (implementation is OS-specific)
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef TICKER_H
+#define TICKER_H
+
+extern int startTicker( nat ms, TickProc handle_tick );
+extern int stopTicker ( void );
+
+#endif /* TICKER_H */
diff --git a/rts/Ticky.c b/rts/Ticky.c
new file mode 100644
index 0000000000..294e12bdda
--- /dev/null
+++ b/rts/Ticky.c
@@ -0,0 +1,628 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The AQUA project, Glasgow University, 1992-1997
+ * (c) The GHC Team, 1998-1999
+ *
+ * Ticky-ticky profiling
+ *-------------------------------------------------------------------------- */
+
+#if defined(TICKY_TICKY)
+
+#define TICKY_C /* define those variables */
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "Ticky.h"
+
+/* -----------------------------------------------------------------------------
+ Print out all the counters
+ -------------------------------------------------------------------------- */
+
+static void printRegisteredCounterInfo (FILE *); /* fwd decl */
+
+#define INTAVG(a,b) ((b == 0) ? 0.0 : ((double) (a) / (double) (b)))
+#define PC(a) (100.0 * a)
+
+#define AVG(thing) \
+ StgDouble avg##thing = INTAVG(tot##thing,ctr##thing)
+
+void
+PrintTickyInfo(void)
+{
+ unsigned long i;
+ unsigned long tot_allocs = /* total number of things allocated */
+ ALLOC_FUN_ctr + ALLOC_SE_THK_ctr + ALLOC_UP_THK_ctr + ALLOC_CON_ctr + ALLOC_TUP_ctr +
+ + ALLOC_TSO_ctr + ALLOC_BH_ctr + ALLOC_PAP_ctr + ALLOC_PRIM_ctr
+#ifdef PAR
+ + ALLOC_FMBQ_ctr + ALLOC_FME_ctr + ALLOC_BF_ctr
+#endif
+ ;
+
+ unsigned long tot_adm_wds = /* total number of admin words allocated */
+ ALLOC_FUN_adm + ALLOC_THK_adm + ALLOC_CON_adm + ALLOC_TUP_adm
+ + ALLOC_TSO_adm + ALLOC_BH_adm + ALLOC_PAP_adm + ALLOC_PRIM_adm
+#ifdef PAR
+ + ALLOC_FMBQ_adm + ALLOC_FME_adm + ALLOC_BF_adm
+#endif
+ ;
+
+ unsigned long tot_gds_wds = /* total number of words of ``good stuff'' allocated */
+ ALLOC_FUN_gds + ALLOC_THK_gds + ALLOC_CON_gds + ALLOC_TUP_gds
+ + ALLOC_TSO_gds + ALLOC_BH_gds + ALLOC_PAP_gds + ALLOC_PRIM_gds
+#ifdef PAR
+ + ALLOC_FMBQ_gds + ALLOC_FME_gds + ALLOC_BF_gds
+#endif
+ ;
+
+ unsigned long tot_slp_wds = /* total number of ``slop'' words allocated */
+ ALLOC_FUN_slp + ALLOC_THK_slp + ALLOC_CON_slp + ALLOC_TUP_slp
+ + ALLOC_TSO_slp + ALLOC_BH_slp + ALLOC_PAP_slp + ALLOC_PRIM_slp
+#ifdef PAR
+ + ALLOC_FMBQ_slp + ALLOC_FME_slp + ALLOC_BF_slp
+#endif
+ ;
+
+ unsigned long tot_wds = /* total words */
+ tot_adm_wds + tot_gds_wds + tot_slp_wds;
+
+ unsigned long tot_thk_enters = ENT_STATIC_THK_ctr + ENT_DYN_THK_ctr;
+ unsigned long tot_con_enters = ENT_STATIC_CON_ctr + ENT_DYN_CON_ctr;
+ unsigned long tot_fun_direct_enters = ENT_STATIC_FUN_DIRECT_ctr + ENT_DYN_FUN_DIRECT_ctr;
+ unsigned long tot_ind_enters = ENT_STATIC_IND_ctr + ENT_DYN_IND_ctr;
+
+ // This is the number of times we entered a function via some kind
+ // of slow call. It amounts to all the slow applications, not
+ // counting those that were to too few arguments.
+ unsigned long tot_fun_slow_enters =
+ SLOW_CALL_ctr -
+ SLOW_CALL_FUN_TOO_FEW_ctr -
+ SLOW_CALL_PAP_TOO_FEW_ctr;
+
+ unsigned long tot_known_calls =
+ KNOWN_CALL_ctr + KNOWN_CALL_TOO_FEW_ARGS_ctr +
+ + KNOWN_CALL_EXTRA_ARGS_ctr;
+ unsigned long tot_tail_calls =
+ UNKNOWN_CALL_ctr + tot_known_calls;
+
+ unsigned long tot_enters =
+ tot_con_enters + tot_fun_direct_enters +
+ tot_ind_enters + ENT_PERM_IND_ctr + ENT_PAP_ctr + tot_thk_enters;
+ unsigned long jump_direct_enters =
+ tot_enters - ENT_VIA_NODE_ctr;
+
+ unsigned long tot_returns =
+ RET_NEW_ctr + RET_OLD_ctr + RET_UNBOXED_TUP_ctr;
+
+ unsigned long tot_returns_of_new = RET_NEW_ctr;
+
+ unsigned long con_updates = UPD_CON_IN_NEW_ctr + UPD_CON_IN_PLACE_ctr;
+ unsigned long pap_updates = UPD_PAP_IN_NEW_ctr + UPD_PAP_IN_PLACE_ctr;
+
+ unsigned long tot_updates = UPD_SQUEEZED_ctr + pap_updates + con_updates;
+
+ unsigned long tot_new_updates = UPD_NEW_IND_ctr + UPD_NEW_PERM_IND_ctr;
+ unsigned long tot_old_updates = UPD_OLD_IND_ctr + UPD_OLD_PERM_IND_ctr;
+ unsigned long tot_gengc_updates = tot_new_updates + tot_old_updates;
+
+ FILE *tf = RtsFlags.TickyFlags.tickyFile;
+
+ fprintf(tf,"\n\nALLOCATIONS: %ld (%ld words total: %ld admin, %ld goods, %ld slop)\n",
+ tot_allocs, tot_wds, tot_adm_wds, tot_gds_wds, tot_slp_wds);
+ fprintf(tf,"\t\t\t\ttotal words:\t 2 3 4 5 6+\n");
+
+#define ALLOC_HISTO_MAGIC(categ) \
+ (PC(INTAVG(ALLOC_##categ##_hst[0], ALLOC_##categ##_ctr))), \
+ (PC(INTAVG(ALLOC_##categ##_hst[1], ALLOC_##categ##_ctr))), \
+ (PC(INTAVG(ALLOC_##categ##_hst[2], ALLOC_##categ##_ctr))), \
+ (PC(INTAVG(ALLOC_##categ##_hst[3], ALLOC_##categ##_ctr))), \
+ (PC(INTAVG(ALLOC_##categ##_hst[4], ALLOC_##categ##_ctr)))
+
+ fprintf(tf,"%7ld (%5.1f%%) function values",
+ ALLOC_FUN_ctr,
+ PC(INTAVG(ALLOC_FUN_ctr, tot_allocs)));
+ if (ALLOC_FUN_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(FUN));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) thunks",
+ ALLOC_SE_THK_ctr + ALLOC_UP_THK_ctr,
+ PC(INTAVG(ALLOC_SE_THK_ctr + ALLOC_UP_THK_ctr, tot_allocs)));
+
+#define ALLOC_THK_ctr (ALLOC_UP_THK_ctr + ALLOC_SE_THK_ctr)
+ /* hack to make ALLOC_HISTO_MAGIC still work for THK */
+ if ((ALLOC_SE_THK_ctr + ALLOC_UP_THK_ctr) != 0)
+ fprintf(tf,"\t\t\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(THK));
+#undef ALLOC_THK_ctr
+
+ fprintf(tf,"\n%7ld (%5.1f%%) data values",
+ ALLOC_CON_ctr,
+ PC(INTAVG(ALLOC_CON_ctr, tot_allocs)));
+ if (ALLOC_CON_ctr != 0)
+ fprintf(tf,"\t\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(CON));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) big tuples",
+ ALLOC_TUP_ctr,
+ PC(INTAVG(ALLOC_TUP_ctr, tot_allocs)));
+ if (ALLOC_TUP_ctr != 0)
+ fprintf(tf,"\t\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(TUP));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) black holes",
+ ALLOC_BH_ctr,
+ PC(INTAVG(ALLOC_BH_ctr, tot_allocs)));
+ if (ALLOC_BH_ctr != 0)
+ fprintf(tf,"\t\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(BH));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) prim things",
+ ALLOC_PRIM_ctr,
+ PC(INTAVG(ALLOC_PRIM_ctr, tot_allocs)));
+ if (ALLOC_PRIM_ctr != 0)
+ fprintf(tf,"\t\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(PRIM));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) partial applications",
+ ALLOC_PAP_ctr,
+ PC(INTAVG(ALLOC_PAP_ctr, tot_allocs)));
+ if (ALLOC_PAP_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(PAP));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) thread state objects",
+ ALLOC_TSO_ctr,
+ PC(INTAVG(ALLOC_TSO_ctr, tot_allocs)));
+ if (ALLOC_TSO_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(TSO));
+#ifdef PAR
+ fprintf(tf,"\n%7ld (%5.1f%%) thread state objects",
+ ALLOC_FMBQ_ctr,
+ PC(INTAVG(ALLOC_FMBQ_ctr, tot_allocs)));
+ if (ALLOC_FMBQ_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(FMBQ));
+ fprintf(tf,"\n%7ld (%5.1f%%) thread state objects",
+ ALLOC_FME_ctr,
+ PC(INTAVG(ALLOC_FME_ctr, tot_allocs)));
+ if (ALLOC_FME_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(FME));
+ fprintf(tf,"\n%7ld (%5.1f%%) thread state objects",
+ ALLOC_BF_ctr,
+ PC(INTAVG(ALLOC_BF_ctr, tot_allocs)));
+ if (ALLOC_BF_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(BF));
+#endif
+ fprintf(tf,"\n");
+
+ fprintf(tf,"\nTotal storage-manager allocations: %ld (%ld words)\n\t[%ld words lost to speculative heap-checks]\n", ALLOC_HEAP_ctr, ALLOC_HEAP_tot, ALLOC_HEAP_tot - tot_wds);
+
+ fprintf(tf,"\nSTACK USAGE:\n"); /* NB: some bits are direction sensitive */
+
+ fprintf(tf,"\nENTERS: %ld of which %ld (%.1f%%) direct to the entry code\n\t\t [the rest indirected via Node's info ptr]\n",
+ tot_enters,
+ jump_direct_enters,
+ PC(INTAVG(jump_direct_enters,tot_enters)));
+ fprintf(tf,"%7ld (%5.1f%%) thunks\n",
+ tot_thk_enters,
+ PC(INTAVG(tot_thk_enters,tot_enters)));
+ fprintf(tf,"%7ld (%5.1f%%) data values\n",
+ tot_con_enters,
+ PC(INTAVG(tot_con_enters,tot_enters)));
+ fprintf(tf,"%7ld (%5.1f%%) normal indirections\n",
+ tot_ind_enters,
+ PC(INTAVG(tot_ind_enters,tot_enters)));
+ fprintf(tf,"%7ld (%5.1f%%) permanent indirections\n",
+ ENT_PERM_IND_ctr,
+ PC(INTAVG(ENT_PERM_IND_ctr,tot_enters)));
+
+ fprintf(tf,"\nFUNCTION ENTRIES: %ld\n", tot_fun_direct_enters);
+
+ fprintf(tf, "\nTAIL CALLS: %ld, of which %ld (%.lf%%) were to known functions\n",
+ tot_tail_calls, tot_known_calls,
+ PC(INTAVG(tot_known_calls,tot_tail_calls)));
+
+ fprintf(tf, "\nSLOW APPLICATIONS: %ld evaluated, %ld unevaluated\n",
+ SLOW_CALL_ctr, SLOW_CALL_UNEVALD_ctr);
+ fprintf(tf, "\n");
+ fprintf(tf, " Too few args Correct args Too many args\n");
+ fprintf(tf, " FUN %8ld %8ld %8ld\n",
+ SLOW_CALL_FUN_TOO_FEW_ctr, SLOW_CALL_FUN_CORRECT_ctr, SLOW_CALL_FUN_TOO_MANY_ctr);
+ fprintf(tf, " PAP %8ld %8ld %8ld\n",
+ SLOW_CALL_PAP_TOO_FEW_ctr, SLOW_CALL_PAP_CORRECT_ctr, SLOW_CALL_PAP_TOO_MANY_ctr);
+ fprintf(tf, "\n");
+
+ fprintf(tf,"\nRETURNS: %ld\n", tot_returns);
+ fprintf(tf,"%7ld (%5.1f%%) from entering a new constructor\n\t\t [the rest from entering an existing constructor]\n",
+ tot_returns_of_new,
+ PC(INTAVG(tot_returns_of_new,tot_returns)));
+ fprintf(tf,"%7ld (%5.1f%%) vectored [the rest unvectored]\n",
+ VEC_RETURN_ctr,
+ PC(INTAVG(VEC_RETURN_ctr,tot_returns)));
+
+ fprintf(tf, "\nRET_NEW: %7ld: ", RET_NEW_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%5.1f%%",
+ PC(INTAVG(RET_NEW_hst[i],RET_NEW_ctr))); }
+ fprintf(tf, "\n");
+ fprintf(tf, "RET_OLD: %7ld: ", RET_OLD_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%5.1f%%",
+ PC(INTAVG(RET_OLD_hst[i],RET_OLD_ctr))); }
+ fprintf(tf, "\n");
+ fprintf(tf, "RET_UNBOXED_TUP: %7ld: ", RET_UNBOXED_TUP_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%5.1f%%",
+ PC(INTAVG(RET_UNBOXED_TUP_hst[i],
+ RET_UNBOXED_TUP_ctr))); }
+ fprintf(tf, "\n");
+ fprintf(tf, "\nRET_VEC_RETURN : %7ld: ", VEC_RETURN_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%5.1f%%",
+ PC(INTAVG(RET_VEC_RETURN_hst[i],VEC_RETURN_ctr))); }
+ fprintf(tf, "\n");
+
+ fprintf(tf,"\nUPDATE FRAMES: %ld (%ld omitted from thunks)",
+ UPDF_PUSHED_ctr,
+ UPDF_OMITTED_ctr);
+
+ fprintf(tf,"\nCATCH FRAMES: %ld", CATCHF_PUSHED_ctr);
+
+ if (UPDF_RCC_PUSHED_ctr != 0)
+ fprintf(tf,"%7ld restore cost centre frames (%ld omitted)\n",
+ UPDF_RCC_PUSHED_ctr,
+ UPDF_RCC_OMITTED_ctr);
+
+ fprintf(tf,"\nUPDATES: %ld\n", tot_updates);
+ fprintf(tf,"%7ld (%5.1f%%) data values\n\t\t [%ld in place, %ld allocated new space]\n",
+ con_updates,
+ PC(INTAVG(con_updates,tot_updates)),
+ UPD_CON_IN_PLACE_ctr, UPD_CON_IN_NEW_ctr);
+ fprintf(tf,"%7ld (%5.1f%%) partial applications\n\t\t [%ld in place, %ld allocated new space]\n",
+ pap_updates,
+ PC(INTAVG(pap_updates,tot_updates)),
+ UPD_PAP_IN_PLACE_ctr, UPD_PAP_IN_NEW_ctr);
+ fprintf(tf,"%7ld (%5.1f%%) updates by squeezing\n",
+ UPD_SQUEEZED_ctr,
+ PC(INTAVG(UPD_SQUEEZED_ctr, tot_updates)));
+
+ fprintf(tf, "\nUPD_CON_IN_NEW: %7ld: ", UPD_CON_IN_NEW_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%7ld", UPD_CON_IN_NEW_hst[i]); }
+ fprintf(tf, "\n");
+ fprintf(tf, "UPD_CON_IN_PLACE: %7ld: ", UPD_CON_IN_PLACE_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%7ld", UPD_CON_IN_PLACE_hst[i]); }
+ fprintf(tf, "\n");
+ fprintf(tf, "UPD_PAP_IN_NEW: %7ld: ", UPD_PAP_IN_NEW_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%7ld", UPD_PAP_IN_NEW_hst[i]); }
+ fprintf(tf, "\n");
+
+ if (tot_gengc_updates != 0) {
+ fprintf(tf,"\nNEW GEN UPDATES: %9ld (%5.1f%%)\n",
+ tot_new_updates,
+ PC(INTAVG(tot_new_updates,tot_gengc_updates)));
+ fprintf(tf,"OLD GEN UPDATES: %9ld (%5.1f%%)\n",
+ tot_old_updates,
+ PC(INTAVG(tot_old_updates,tot_gengc_updates)));
+ }
+
+ fprintf(tf,"\nTotal bytes copied during GC: %ld\n",
+ GC_WORDS_COPIED_ctr * sizeof(W_));
+
+ printRegisteredCounterInfo(tf);
+
+ fprintf(tf,"\n**************************************************\n");
+
+ /* here, we print out all the raw numbers; these are really
+ more useful when we want to snag them for subsequent
+ rdb-etc processing. WDP 95/11
+ */
+
+#define PR_CTR(ctr) \
+ do { fprintf(tf,"%7ld " #ctr "\n", ctr); } while(0)
+/* COND_PR_CTR takes a boolean; if false then msg is the printname rather than ctr */
+#define COND_PR_CTR(ctr,b,msg) \
+ if (b) { fprintf(tf,"%7ld " #ctr "\n", ctr); } else { fprintf(tf,"%7ld " msg "\n", ctr); }
+#define PR_HST(hst,i) \
+ do { fprintf(tf,"%7ld " #hst "_" #i "\n", hst[i]); } while(0)
+
+ PR_CTR(ALLOC_HEAP_ctr);
+ PR_CTR(ALLOC_HEAP_tot);
+
+ PR_CTR(ALLOC_FUN_ctr);
+ PR_CTR(ALLOC_FUN_adm);
+ PR_CTR(ALLOC_FUN_gds);
+ PR_CTR(ALLOC_FUN_slp);
+ PR_HST(ALLOC_FUN_hst,0);
+ PR_HST(ALLOC_FUN_hst,1);
+ PR_HST(ALLOC_FUN_hst,2);
+ PR_HST(ALLOC_FUN_hst,3);
+ PR_HST(ALLOC_FUN_hst,4);
+ PR_CTR(ALLOC_UP_THK_ctr);
+ PR_CTR(ALLOC_SE_THK_ctr);
+ PR_CTR(ALLOC_THK_adm);
+ PR_CTR(ALLOC_THK_gds);
+ PR_CTR(ALLOC_THK_slp);
+ PR_HST(ALLOC_THK_hst,0);
+ PR_HST(ALLOC_THK_hst,1);
+ PR_HST(ALLOC_THK_hst,2);
+ PR_HST(ALLOC_THK_hst,3);
+ PR_HST(ALLOC_THK_hst,4);
+ PR_CTR(ALLOC_CON_ctr);
+ PR_CTR(ALLOC_CON_adm);
+ PR_CTR(ALLOC_CON_gds);
+ PR_CTR(ALLOC_CON_slp);
+ PR_HST(ALLOC_CON_hst,0);
+ PR_HST(ALLOC_CON_hst,1);
+ PR_HST(ALLOC_CON_hst,2);
+ PR_HST(ALLOC_CON_hst,3);
+ PR_HST(ALLOC_CON_hst,4);
+ PR_CTR(ALLOC_TUP_ctr);
+ PR_CTR(ALLOC_TUP_adm);
+ PR_CTR(ALLOC_TUP_gds);
+ PR_CTR(ALLOC_TUP_slp);
+ PR_HST(ALLOC_TUP_hst,0);
+ PR_HST(ALLOC_TUP_hst,1);
+ PR_HST(ALLOC_TUP_hst,2);
+ PR_HST(ALLOC_TUP_hst,3);
+ PR_HST(ALLOC_TUP_hst,4);
+ PR_CTR(ALLOC_BH_ctr);
+ PR_CTR(ALLOC_BH_adm);
+ PR_CTR(ALLOC_BH_gds);
+ PR_CTR(ALLOC_BH_slp);
+ PR_HST(ALLOC_BH_hst,0);
+ PR_HST(ALLOC_BH_hst,1);
+ PR_HST(ALLOC_BH_hst,2);
+ PR_HST(ALLOC_BH_hst,3);
+ PR_HST(ALLOC_BH_hst,4);
+ PR_CTR(ALLOC_PRIM_ctr);
+ PR_CTR(ALLOC_PRIM_adm);
+ PR_CTR(ALLOC_PRIM_gds);
+ PR_CTR(ALLOC_PRIM_slp);
+ PR_HST(ALLOC_PRIM_hst,0);
+ PR_HST(ALLOC_PRIM_hst,1);
+ PR_HST(ALLOC_PRIM_hst,2);
+ PR_HST(ALLOC_PRIM_hst,3);
+ PR_HST(ALLOC_PRIM_hst,4);
+ PR_CTR(ALLOC_PAP_ctr);
+ PR_CTR(ALLOC_PAP_adm);
+ PR_CTR(ALLOC_PAP_gds);
+ PR_CTR(ALLOC_PAP_slp);
+ PR_HST(ALLOC_PAP_hst,0);
+ PR_HST(ALLOC_PAP_hst,1);
+ PR_HST(ALLOC_PAP_hst,2);
+ PR_HST(ALLOC_PAP_hst,3);
+ PR_HST(ALLOC_PAP_hst,4);
+
+ PR_CTR(ALLOC_TSO_ctr);
+ PR_CTR(ALLOC_TSO_adm);
+ PR_CTR(ALLOC_TSO_gds);
+ PR_CTR(ALLOC_TSO_slp);
+ PR_HST(ALLOC_TSO_hst,0);
+ PR_HST(ALLOC_TSO_hst,1);
+ PR_HST(ALLOC_TSO_hst,2);
+ PR_HST(ALLOC_TSO_hst,3);
+ PR_HST(ALLOC_TSO_hst,4);
+
+#ifdef PAR
+ PR_CTR(ALLOC_FMBQ_ctr);
+ PR_CTR(ALLOC_FMBQ_adm);
+ PR_CTR(ALLOC_FMBQ_gds);
+ PR_CTR(ALLOC_FMBQ_slp);
+ PR_HST(ALLOC_FMBQ_hst,0);
+ PR_HST(ALLOC_FMBQ_hst,1);
+ PR_HST(ALLOC_FMBQ_hst,2);
+ PR_HST(ALLOC_FMBQ_hst,3);
+ PR_HST(ALLOC_FMBQ_hst,4);
+ PR_CTR(ALLOC_FME_ctr);
+ PR_CTR(ALLOC_FME_adm);
+ PR_CTR(ALLOC_FME_gds);
+ PR_CTR(ALLOC_FME_slp);
+ PR_HST(ALLOC_FME_hst,0);
+ PR_HST(ALLOC_FME_hst,1);
+ PR_HST(ALLOC_FME_hst,2);
+ PR_HST(ALLOC_FME_hst,3);
+ PR_HST(ALLOC_FME_hst,4);
+ PR_CTR(ALLOC_BF_ctr);
+ PR_CTR(ALLOC_BF_adm);
+ PR_CTR(ALLOC_BF_gds);
+ PR_CTR(ALLOC_BF_slp);
+ PR_HST(ALLOC_BF_hst,0);
+ PR_HST(ALLOC_BF_hst,1);
+ PR_HST(ALLOC_BF_hst,2);
+ PR_HST(ALLOC_BF_hst,3);
+ PR_HST(ALLOC_BF_hst,4);
+#endif
+
+ PR_CTR(ENT_VIA_NODE_ctr);
+ PR_CTR(ENT_STATIC_CON_ctr);
+ PR_CTR(ENT_DYN_CON_ctr);
+ PR_CTR(ENT_STATIC_FUN_DIRECT_ctr);
+ PR_CTR(ENT_DYN_FUN_DIRECT_ctr);
+ PR_CTR(ENT_STATIC_IND_ctr);
+ PR_CTR(ENT_DYN_IND_ctr);
+
+/* The counters ENT_PERM_IND and UPD_{NEW,OLD}_PERM_IND are not dumped
+ * at the end of execution unless update squeezing is turned off (+RTS
+ * -Z =RtsFlags.GcFlags.squeezeUpdFrames), as they will be wrong
+ * otherwise. Why? Because for each update frame squeezed out, we
+ * count an UPD_NEW_PERM_IND *at GC time* (i.e., too early). And
+ * further, when we enter the closure that has been updated, we count
+ * the ENT_PERM_IND, but we then enter the PERM_IND that was built for
+ * the next update frame below, and so on down the chain until we
+ * finally reach the value. Thus we count many new ENT_PERM_INDs too
+ * early.
+ *
+ * This of course refers to the -ticky version that uses PERM_INDs to
+ * determine the number of closures entered 0/1/>1. KSW 1999-04. */
+ COND_PR_CTR(ENT_PERM_IND_ctr,RtsFlags.GcFlags.squeezeUpdFrames == rtsFalse,"E!NT_PERM_IND_ctr requires +RTS -Z");
+
+ PR_CTR(ENT_AP_ctr);
+ PR_CTR(ENT_PAP_ctr);
+ PR_CTR(ENT_AP_STACK_ctr);
+ PR_CTR(ENT_BH_ctr);
+ PR_CTR(ENT_STATIC_THK_ctr);
+ PR_CTR(ENT_DYN_THK_ctr);
+
+ PR_CTR(SLOW_CALL_v_ctr);
+ PR_CTR(SLOW_CALL_f_ctr);
+ PR_CTR(SLOW_CALL_d_ctr);
+ PR_CTR(SLOW_CALL_l_ctr);
+ PR_CTR(SLOW_CALL_n_ctr);
+ PR_CTR(SLOW_CALL_p_ctr);
+ PR_CTR(SLOW_CALL_pv_ctr);
+ PR_CTR(SLOW_CALL_pp_ctr);
+ PR_CTR(SLOW_CALL_ppv_ctr);
+ PR_CTR(SLOW_CALL_ppp_ctr);
+ PR_CTR(SLOW_CALL_pppv_ctr);
+ PR_CTR(SLOW_CALL_pppp_ctr);
+ PR_CTR(SLOW_CALL_ppppp_ctr);
+ PR_CTR(SLOW_CALL_pppppp_ctr);
+ PR_CTR(SLOW_CALL_OTHER_ctr);
+
+ PR_CTR(UNKNOWN_CALL_ctr);
+ PR_CTR(KNOWN_CALL_ctr);
+ PR_CTR(KNOWN_CALL_TOO_FEW_ARGS_ctr);
+ PR_CTR(KNOWN_CALL_EXTRA_ARGS_ctr);
+ PR_CTR(MULTI_CHUNK_SLOW_CALL_ctr);
+ PR_CTR(MULTI_CHUNK_SLOW_CALL_CHUNKS_ctr);
+ PR_CTR(SLOW_CALL_ctr);
+ PR_CTR(SLOW_CALL_FUN_TOO_FEW_ctr);
+ PR_CTR(SLOW_CALL_FUN_CORRECT_ctr);
+ PR_CTR(SLOW_CALL_FUN_TOO_MANY_ctr);
+ PR_CTR(SLOW_CALL_PAP_TOO_FEW_ctr);
+ PR_CTR(SLOW_CALL_PAP_CORRECT_ctr);
+ PR_CTR(SLOW_CALL_PAP_TOO_MANY_ctr);
+ PR_CTR(SLOW_CALL_UNEVALD_ctr);
+ PR_HST(SLOW_CALL_hst,0);
+ PR_HST(SLOW_CALL_hst,1);
+ PR_HST(SLOW_CALL_hst,2);
+ PR_HST(SLOW_CALL_hst,3);
+ PR_HST(SLOW_CALL_hst,4);
+ PR_HST(SLOW_CALL_hst,5);
+ PR_HST(SLOW_CALL_hst,6);
+ PR_HST(SLOW_CALL_hst,7);
+
+ PR_CTR(RET_NEW_ctr);
+ PR_CTR(RET_OLD_ctr);
+ PR_CTR(RET_UNBOXED_TUP_ctr);
+ PR_CTR(VEC_RETURN_ctr);
+
+ PR_HST(RET_NEW_hst,0);
+ PR_HST(RET_NEW_hst,1);
+ PR_HST(RET_NEW_hst,2);
+ PR_HST(RET_NEW_hst,3);
+ PR_HST(RET_NEW_hst,4);
+ PR_HST(RET_NEW_hst,5);
+ PR_HST(RET_NEW_hst,6);
+ PR_HST(RET_NEW_hst,7);
+ PR_HST(RET_NEW_hst,8);
+ PR_HST(RET_OLD_hst,0);
+ PR_HST(RET_OLD_hst,1);
+ PR_HST(RET_OLD_hst,2);
+ PR_HST(RET_OLD_hst,3);
+ PR_HST(RET_OLD_hst,4);
+ PR_HST(RET_OLD_hst,5);
+ PR_HST(RET_OLD_hst,6);
+ PR_HST(RET_OLD_hst,7);
+ PR_HST(RET_OLD_hst,8);
+ PR_HST(RET_UNBOXED_TUP_hst,0);
+ PR_HST(RET_UNBOXED_TUP_hst,1);
+ PR_HST(RET_UNBOXED_TUP_hst,2);
+ PR_HST(RET_UNBOXED_TUP_hst,3);
+ PR_HST(RET_UNBOXED_TUP_hst,4);
+ PR_HST(RET_UNBOXED_TUP_hst,5);
+ PR_HST(RET_UNBOXED_TUP_hst,6);
+ PR_HST(RET_UNBOXED_TUP_hst,7);
+ PR_HST(RET_UNBOXED_TUP_hst,8);
+ PR_HST(RET_VEC_RETURN_hst,0);
+ PR_HST(RET_VEC_RETURN_hst,1);
+ PR_HST(RET_VEC_RETURN_hst,2);
+ PR_HST(RET_VEC_RETURN_hst,3);
+ PR_HST(RET_VEC_RETURN_hst,4);
+ PR_HST(RET_VEC_RETURN_hst,5);
+ PR_HST(RET_VEC_RETURN_hst,6);
+ PR_HST(RET_VEC_RETURN_hst,7);
+ PR_HST(RET_VEC_RETURN_hst,8);
+
+ PR_CTR(UPDF_OMITTED_ctr);
+ PR_CTR(UPDF_PUSHED_ctr);
+ PR_CTR(CATCHF_PUSHED_ctr);
+
+ PR_CTR(UPDF_RCC_PUSHED_ctr);
+ PR_CTR(UPDF_RCC_OMITTED_ctr);
+
+ PR_CTR(UPD_SQUEEZED_ctr);
+ PR_CTR(UPD_CON_IN_NEW_ctr);
+ PR_CTR(UPD_CON_IN_PLACE_ctr);
+ PR_CTR(UPD_PAP_IN_NEW_ctr);
+ PR_CTR(UPD_PAP_IN_PLACE_ctr);
+
+ PR_CTR(UPD_BH_UPDATABLE_ctr);
+ PR_CTR(UPD_BH_SINGLE_ENTRY_ctr);
+ PR_CTR(UPD_CAF_BH_UPDATABLE_ctr);
+ PR_CTR(UPD_CAF_BH_SINGLE_ENTRY_ctr);
+
+ PR_HST(UPD_CON_IN_NEW_hst,0);
+ PR_HST(UPD_CON_IN_NEW_hst,1);
+ PR_HST(UPD_CON_IN_NEW_hst,2);
+ PR_HST(UPD_CON_IN_NEW_hst,3);
+ PR_HST(UPD_CON_IN_NEW_hst,4);
+ PR_HST(UPD_CON_IN_NEW_hst,5);
+ PR_HST(UPD_CON_IN_NEW_hst,6);
+ PR_HST(UPD_CON_IN_NEW_hst,7);
+ PR_HST(UPD_CON_IN_NEW_hst,8);
+ PR_HST(UPD_PAP_IN_NEW_hst,0);
+ PR_HST(UPD_PAP_IN_NEW_hst,1);
+ PR_HST(UPD_PAP_IN_NEW_hst,2);
+ PR_HST(UPD_PAP_IN_NEW_hst,3);
+ PR_HST(UPD_PAP_IN_NEW_hst,4);
+ PR_HST(UPD_PAP_IN_NEW_hst,5);
+ PR_HST(UPD_PAP_IN_NEW_hst,6);
+ PR_HST(UPD_PAP_IN_NEW_hst,7);
+ PR_HST(UPD_PAP_IN_NEW_hst,8);
+
+ PR_CTR(UPD_NEW_IND_ctr);
+ /* see comment on ENT_PERM_IND_ctr */
+ COND_PR_CTR(UPD_NEW_PERM_IND_ctr,RtsFlags.GcFlags.squeezeUpdFrames == rtsFalse,"U!PD_NEW_PERM_IND_ctr requires +RTS -Z");
+ PR_CTR(UPD_OLD_IND_ctr);
+ /* see comment on ENT_PERM_IND_ctr */
+ COND_PR_CTR(UPD_OLD_PERM_IND_ctr,RtsFlags.GcFlags.squeezeUpdFrames == rtsFalse,"U!PD_OLD_PERM_IND_ctr requires +RTS -Z");
+
+ PR_CTR(GC_SEL_ABANDONED_ctr);
+ PR_CTR(GC_SEL_MINOR_ctr);
+ PR_CTR(GC_SEL_MAJOR_ctr);
+ PR_CTR(GC_FAILED_PROMOTION_ctr);
+ PR_CTR(GC_WORDS_COPIED_ctr);
+}
+
+/* Data structure used in ``registering'' one of these counters. */
+
+StgEntCounter *ticky_entry_ctrs = NULL; /* root of list of them */
+
+/* To print out all the registered-counter info: */
+
+static void
+printRegisteredCounterInfo (FILE *tf)
+{
+ StgEntCounter *p;
+
+ if ( ticky_entry_ctrs != NULL ) {
+ fprintf(tf,"\n**************************************************\n\n");
+ }
+ fprintf(tf, "%11s%11s %6s%6s %-11s%-30s\n",
+ "Entries", "Allocs", "Arity", "Stack", "Kinds", "Function");
+ fprintf(tf, "--------------------------------------------------------------------------------\n");
+ /* Function name at the end so it doesn't mess up the tabulation */
+
+ for (p = ticky_entry_ctrs; p != NULL; p = p->link) {
+ fprintf(tf, "%11ld%11ld %6u%6u %-11s%-30s",
+ p->entry_count,
+ p->allocs,
+ p->arity,
+ p->stk_args,
+ p->arg_kinds,
+ p->str);
+
+ fprintf(tf, "\n");
+
+ }
+}
+
+/* Catch-all top-level counter struct. Allocations from CAFs will go
+ * here.
+ */
+StgEntCounter top_ct
+ = { 0, 0, 0,
+ "TOP", "",
+ 0, 0, NULL };
+
+#endif /* TICKY_TICKY */
+
diff --git a/rts/Ticky.h b/rts/Ticky.h
new file mode 100644
index 0000000000..21765e4bbb
--- /dev/null
+++ b/rts/Ticky.h
@@ -0,0 +1,9 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1999
+ *
+ * Header for Ticky.c
+ *
+ * ---------------------------------------------------------------------------*/
+
+extern void PrintTickyInfo(void);
diff --git a/rts/Timer.c b/rts/Timer.c
new file mode 100644
index 0000000000..0bfea2d6fd
--- /dev/null
+++ b/rts/Timer.c
@@ -0,0 +1,102 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1995-2005
+ *
+ * Interval timer service for profiling and pre-emptive scheduling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/*
+ * The interval timer is used for profiling and for context switching in the
+ * threaded build.
+ *
+ * This file defines the platform-independent view of interval timing, relying
+ * on platform-specific services to install and run the timers.
+ *
+ */
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "Proftimer.h"
+#include "Schedule.h"
+#include "Timer.h"
+#include "Ticker.h"
+#include "Capability.h"
+
+/* ticks left before next pre-emptive context switch */
+static int ticks_to_ctxt_switch = 0;
+
+#if defined(THREADED_RTS)
+/* idle ticks left before we perform a GC */
+static int ticks_to_gc = 0;
+#endif
+
+/*
+ * Function: handle_tick()
+ *
+ * At each occurrence of a tick, the OS timer will invoke
+ * handle_tick().
+ */
+static
+void
+handle_tick(int unused STG_UNUSED)
+{
+#ifdef PROFILING
+ handleProfTick();
+#endif
+ if (RtsFlags.ConcFlags.ctxtSwitchTicks > 0) {
+ ticks_to_ctxt_switch--;
+ if (ticks_to_ctxt_switch <= 0) {
+ ticks_to_ctxt_switch = RtsFlags.ConcFlags.ctxtSwitchTicks;
+ context_switch = 1; /* schedule a context switch */
+ }
+ }
+
+#if defined(THREADED_RTS)
+ /*
+ * If we've been inactive for idleGCDelayTicks (set by +RTS
+ * -I), tell the scheduler to wake up and do a GC, to check
+ * for threads that are deadlocked.
+ */
+ switch (recent_activity) {
+ case ACTIVITY_YES:
+ recent_activity = ACTIVITY_MAYBE_NO;
+ ticks_to_gc = RtsFlags.GcFlags.idleGCDelayTicks;
+ break;
+ case ACTIVITY_MAYBE_NO:
+ if (ticks_to_gc == 0) break; /* 0 ==> no idle GC */
+ ticks_to_gc--;
+ if (ticks_to_gc == 0) {
+ ticks_to_gc = RtsFlags.GcFlags.idleGCDelayTicks;
+ recent_activity = ACTIVITY_INACTIVE;
+ blackholes_need_checking = rtsTrue;
+ /* hack: re-use the blackholes_need_checking flag */
+
+ /* ToDo: this doesn't work. Can't invoke
+ * pthread_cond_signal from a signal handler.
+ * Furthermore, we can't prod a capability that we
+ * might be holding. What can we do?
+ */
+ prodOneCapability();
+ }
+ break;
+ default:
+ break;
+ }
+#endif
+}
+
+int
+startTimer(nat ms)
+{
+#ifdef PROFILING
+ initProfTimer();
+#endif
+
+ return startTicker(ms, handle_tick);
+}
+
+int
+stopTimer()
+{
+ return stopTicker();
+}
diff --git a/rts/Timer.h b/rts/Timer.h
new file mode 100644
index 0000000000..ae26653462
--- /dev/null
+++ b/rts/Timer.h
@@ -0,0 +1,24 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1995-2005
+ *
+ * Interval timer service for profiling and pre-emptive scheduling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef TIMER_H
+#define TIMER_H
+
+# define TICK_MILLISECS (1000/TICK_FREQUENCY) /* ms per tick */
+
+/* Context switch timing constants. Context switches happen after a
+ * whole number of ticks, the default being every tick.
+ */
+#define CS_MIN_MILLISECS TICK_MILLISECS /* milliseconds per slice */
+
+typedef void (*TickProc)(int);
+
+extern int startTimer(nat ms);
+extern int stopTimer(void);
+
+#endif /* TIMER_H */
diff --git a/rts/Updates.cmm b/rts/Updates.cmm
new file mode 100644
index 0000000000..1d2fc5fe0f
--- /dev/null
+++ b/rts/Updates.cmm
@@ -0,0 +1,153 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Code to perform updates.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+#include "Updates.h"
+#include "StgLdvProf.h"
+
+/*
+ The update frame return address must be *polymorphic*, that means
+ we have to cope with both vectored and non-vectored returns. This
+ is done by putting the return vector right before the info table, and
+ having a standard direct return address after the info table (pointed
+ to by the return address itself, as usual).
+
+ Each entry in the vector table points to a specialised entry code fragment
+ that knows how to return after doing the update. It would be possible to
+ use a single generic piece of code that simply entered the return value
+ to return, but it's quicker this way. The direct return code of course
+ just does another direct return when it's finished.
+*/
+
+/* on entry to the update code
+ (1) R1 points to the closure being returned
+ (2) Sp points to the update frame
+*/
+
+/* The update fragment has been tuned so as to generate good
+ code with gcc, which accounts for some of the strangeness in the
+ way it is written.
+
+ In particular, the JMP_(ret) bit is passed down and pinned on the
+ end of each branch (there end up being two major branches in the
+ code), since we don't mind duplicating this jump.
+*/
+
+#define UPD_FRAME_ENTRY_TEMPLATE(label,ind_info,ret) \
+ label \
+ { \
+ W_ updatee; \
+ \
+ updatee = StgUpdateFrame_updatee(Sp); \
+ \
+ /* remove the update frame from the stack */ \
+ Sp = Sp + SIZEOF_StgUpdateFrame; \
+ \
+ /* ToDo: it might be a PAP, so we should check... */ \
+ TICK_UPD_CON_IN_NEW(sizeW_fromITBL(%GET_STD_INFO(updatee))); \
+ \
+ UPD_SPEC_IND(updatee, ind_info, R1, jump (ret)); \
+ }
+
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_0_ret,stg_IND_0_info,%RET_VEC(Sp(0),0))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_1_ret,stg_IND_1_info,%RET_VEC(Sp(0),1))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_2_ret,stg_IND_2_info,%RET_VEC(Sp(0),2))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_3_ret,stg_IND_3_info,%RET_VEC(Sp(0),3))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_4_ret,stg_IND_4_info,%RET_VEC(Sp(0),4))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_5_ret,stg_IND_5_info,%RET_VEC(Sp(0),5))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_6_ret,stg_IND_6_info,%RET_VEC(Sp(0),6))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_7_ret,stg_IND_7_info,%RET_VEC(Sp(0),7))
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_upd_frame too.
+#endif
+
+/*
+ Make sure this table is big enough to handle the maximum vectored
+ return size!
+ */
+
+#if defined(PROFILING)
+#define UPD_FRAME_BITMAP 3
+#define UPD_FRAME_WORDS 3
+#else
+#define UPD_FRAME_BITMAP 0
+#define UPD_FRAME_WORDS 1
+#endif
+
+/* this bitmap indicates that the first word of an update frame is a
+ * non-pointer - this is the update frame link. (for profiling,
+ * there's a cost-centre-stack in there too).
+ */
+
+INFO_TABLE_RET( stg_upd_frame,
+ UPD_FRAME_WORDS, UPD_FRAME_BITMAP, UPDATE_FRAME,
+ stg_upd_frame_0_ret,
+ stg_upd_frame_1_ret,
+ stg_upd_frame_2_ret,
+ stg_upd_frame_3_ret,
+ stg_upd_frame_4_ret,
+ stg_upd_frame_5_ret,
+ stg_upd_frame_6_ret,
+ stg_upd_frame_7_ret
+ )
+UPD_FRAME_ENTRY_TEMPLATE(,stg_IND_direct_info,%ENTRY_CODE(Sp(0)))
+
+
+INFO_TABLE_RET( stg_marked_upd_frame,
+ UPD_FRAME_WORDS, UPD_FRAME_BITMAP, UPDATE_FRAME,
+ stg_upd_frame_0_ret,
+ stg_upd_frame_1_ret,
+ stg_upd_frame_2_ret,
+ stg_upd_frame_3_ret,
+ stg_upd_frame_4_ret,
+ stg_upd_frame_5_ret,
+ stg_upd_frame_6_ret,
+ stg_upd_frame_7_ret
+ )
+UPD_FRAME_ENTRY_TEMPLATE(,stg_IND_direct_info,%ENTRY_CODE(Sp(0)))
+
+/*-----------------------------------------------------------------------------
+ Seq frames
+
+ We don't have a primitive seq# operator: it is just a 'case'
+ expression whose scrutinee has either a polymorphic or function type
+ (constructor types can be handled by normal 'case' expressions).
+
+ To handle a polymorphic/function typed seq, we push a SEQ frame on
+ the stack. This is a polymorphic activation record that just pops
+ itself and returns (in a non-vectored way) when entered. The
+ purpose of the SEQ frame is to avoid having to make a polymorphic return
+ point for each polymorphic case expression.
+
+ Another way of looking at it: the SEQ frame turns a vectored return
+ into a direct one.
+ -------------------------------------------------------------------------- */
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_seq_frame too.
+#endif
+
+INFO_TABLE_RET( stg_seq_frame, 0/* words */, 0/* bitmap */, RET_SMALL,
+ RET_LBL(stg_seq_frame), /* 0 */
+ RET_LBL(stg_seq_frame), /* 1 */
+ RET_LBL(stg_seq_frame), /* 2 */
+ RET_LBL(stg_seq_frame), /* 3 */
+ RET_LBL(stg_seq_frame), /* 4 */
+ RET_LBL(stg_seq_frame), /* 5 */
+ RET_LBL(stg_seq_frame), /* 6 */
+ RET_LBL(stg_seq_frame) /* 7 */
+ )
+{
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(0));
+}
diff --git a/rts/Updates.h b/rts/Updates.h
new file mode 100644
index 0000000000..5872157c81
--- /dev/null
+++ b/rts/Updates.h
@@ -0,0 +1,361 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Performing updates.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef UPDATES_H
+#define UPDATES_H
+
+/* -----------------------------------------------------------------------------
+ Updates
+
+ We have two layers of update macros. The top layer, UPD_IND() and
+ friends perform all the work of an update. In detail:
+
+ - if the closure being updated is a blocking queue, then all the
+ threads waiting on the blocking queue are updated.
+
+ - then the lower level updateWithIndirection() macro is invoked
+ to actually replace the closure with an indirection (see below).
+
+ -------------------------------------------------------------------------- */
+
+#ifdef TICKY_TICKY
+# define UPD_IND(updclosure, heapptr) \
+ UPD_PERM_IND(updclosure,heapptr)
+# define UPD_SPEC_IND(updclosure, ind_info, heapptr, and_then) \
+ UPD_PERM_IND(updclosure,heapptr); and_then
+#else
+# define SEMI ;
+# define UPD_IND(updclosure, heapptr) \
+ UPD_REAL_IND(updclosure,INFO_PTR(stg_IND_info),heapptr,SEMI)
+# define UPD_SPEC_IND(updclosure, ind_info, heapptr, and_then) \
+ UPD_REAL_IND(updclosure,ind_info,heapptr,and_then)
+#endif
+
+/* These macros have to work in both C and C--, so here's the
+ * impedance matching:
+ */
+#ifdef CMINUSMINUS
+#define BLOCK_BEGIN
+#define BLOCK_END
+#define DECLARE_IPTR(info) W_ info
+#define FCALL foreign "C"
+#define INFO_PTR(info) info
+#define ARG_PTR "ptr"
+#else
+#define BLOCK_BEGIN {
+#define BLOCK_END }
+#define DECLARE_IPTR(info) const StgInfoTable *(info)
+#define FCALL /* nothing */
+#define INFO_PTR(info) &info
+#define StgBlockingQueue_blocking_queue(closure) \
+ (((StgBlockingQueue *)closure)->blocking_queue)
+#define ARG_PTR /* nothing */
+#endif
+
+/* UPD_IND actually does a PERM_IND if TICKY_TICKY is on;
+ if you *really* need an IND use UPD_REAL_IND
+ */
+#define UPD_REAL_IND(updclosure, ind_info, heapptr, and_then) \
+ BLOCK_BEGIN \
+ DECLARE_IPTR(info); \
+ info = GET_INFO(updclosure); \
+ updateWithIndirection(ind_info, \
+ updclosure, \
+ heapptr, \
+ and_then); \
+ BLOCK_END
+
+#if defined(PROFILING) || defined(TICKY_TICKY)
+#define UPD_PERM_IND(updclosure, heapptr) \
+ BLOCK_BEGIN \
+ updateWithPermIndirection(updclosure, \
+ heapptr); \
+ BLOCK_END
+#endif
+
+#if defined(RTS_SUPPORTS_THREADS)
+
+# ifdef TICKY_TICKY
+# define UPD_IND_NOLOCK(updclosure, heapptr) \
+ BLOCK_BEGIN \
+ updateWithPermIndirection(updclosure, \
+ heapptr); \
+ BLOCK_END
+# else
+# define UPD_IND_NOLOCK(updclosure, heapptr) \
+ BLOCK_BEGIN \
+ updateWithIndirection(INFO_PTR(stg_IND_info), \
+ updclosure, \
+ heapptr,); \
+ BLOCK_END
+# endif
+
+#else
+#define UPD_IND_NOLOCK(updclosure,heapptr) UPD_IND(updclosure,heapptr)
+#endif
+
+/* -----------------------------------------------------------------------------
+ Awaken any threads waiting on a blocking queue (BLACKHOLE_BQ).
+ -------------------------------------------------------------------------- */
+
+#if defined(PAR)
+
+/*
+ In a parallel setup several types of closures might have a blocking queue:
+ BLACKHOLE_BQ ... same as in the default concurrent setup; it will be
+ reawakened via calling UPD_IND on that closure after
+ having finished the computation of the graph
+ FETCH_ME_BQ ... a global indirection (FETCH_ME) may be entered by a
+ local TSO, turning it into a FETCH_ME_BQ; it will be
+ reawakened via calling processResume
+ RBH ... a revertible black hole may be entered by another
+ local TSO, putting it onto its blocking queue; since
+ RBHs only exist while the corresponding closure is in
+ transit, they will be reawakened via calling
+ convertToFetchMe (upon processing an ACK message)
+
+ In a parallel setup a blocking queue may contain 3 types of closures:
+ TSO ... as in the default concurrent setup
+ BLOCKED_FETCH ... indicating that a TSO on another PE is waiting for
+ the result of the current computation
+ CONSTR ... an RBHSave closure (which contains data ripped out of
+ the closure to make room for a blocking queue; since
+                    it only contains data we use the existing type of
+ a CONSTR closure); this closure is the end of a
+ blocking queue for an RBH closure; it only exists in
+ this kind of blocking queue and must be at the end
+ of the queue
+*/
+extern void awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node);
+#define DO_AWAKEN_BQ(bqe, node) STGCALL2(awakenBlockedQueue, bqe, node);
+
+#define AWAKEN_BQ(info,closure) \
+ if (info == &stg_BLACKHOLE_BQ_info || \
+ info == &stg_FETCH_ME_BQ_info || \
+ get_itbl(closure)->type == RBH) { \
+ DO_AWAKEN_BQ(((StgBlockingQueue *)closure)->blocking_queue, closure); \
+ }
+
+#elif defined(GRAN)
+
+extern void awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node);
+#define DO_AWAKEN_BQ(bq, node) STGCALL2(awakenBlockedQueue, bq, node);
+
+/* In GranSim we don't have FETCH_ME or FETCH_ME_BQ closures, so they are
+ not checked. The rest of the code is the same as for GUM.
+*/
+#define AWAKEN_BQ(info,closure) \
+ if (info == &stg_BLACKHOLE_BQ_info || \
+ get_itbl(closure)->type == RBH) { \
+ DO_AWAKEN_BQ(((StgBlockingQueue *)closure)->blocking_queue, closure); \
+ }
+
+#endif /* GRAN || PAR */
+
+
+/* -----------------------------------------------------------------------------
+ Updates: lower-level macros which update a closure with an
+ indirection to another closure.
+
+ There are several variants of this code.
+
+ PROFILING:
+ -------------------------------------------------------------------------- */
+
+/* LDV profiling:
+ * We call LDV_recordDead_FILL_SLOP_DYNAMIC(p1) regardless of the generation in
+ * which p1 resides.
+ *
+ * Note:
+ * After all, we do *NOT* need to call LDV_RECORD_CREATE() for both IND and
+ * IND_OLDGEN closures because they are inherently used. But, it corrupts
+ * the invariants that every closure keeps its creation time in the profiling
+ * field. So, we call LDV_RECORD_CREATE().
+ */
+
+/* In the DEBUG case, we also zero out the slop of the old closure,
+ * so that the sanity checker can tell where the next closure is.
+ *
+ * Two important invariants: we should never try to update a closure
+ * to point to itself, and the closure being updated should not
+ * already have been updated (the mutable list will get messed up
+ * otherwise).
+ *
+ * NB. We do *not* do this in THREADED_RTS mode, because when we have the
+ * possibility of multiple threads entering the same closure, zeroing
+ * the slop in one of the threads would have a disastrous effect on
+ * the other (seen in the wild!).
+ */
+#ifdef CMINUSMINUS
+
+#define FILL_SLOP(p) \
+ W_ inf; \
+ W_ sz; \
+ W_ i; \
+ inf = %GET_STD_INFO(p); \
+ if (%INFO_TYPE(inf) != HALF_W_(THUNK_SELECTOR) \
+ && %INFO_TYPE(inf) != HALF_W_(BLACKHOLE) \
+ && %INFO_TYPE(inf) != HALF_W_(CAF_BLACKHOLE)) { \
+ if (%INFO_TYPE(inf) == HALF_W_(AP_STACK)) { \
+ sz = StgAP_STACK_size(p) + BYTES_TO_WDS(SIZEOF_StgAP_STACK_NoThunkHdr); \
+ } else { \
+ if (%INFO_TYPE(inf) == HALF_W_(AP)) { \
+ sz = TO_W_(StgAP_n_args(p)) + BYTES_TO_WDS(SIZEOF_StgAP_NoThunkHdr); \
+ } else { \
+ sz = TO_W_(%INFO_PTRS(inf)) + TO_W_(%INFO_NPTRS(inf)); \
+ } \
+ } \
+ i = 0; \
+ for: \
+ if (i < sz) { \
+ StgThunk_payload(p,i) = 0; \
+ i = i + 1; \
+ goto for; \
+ } \
+ }
+
+#else /* !CMINUSMINUS */
+
+INLINE_HEADER void
+FILL_SLOP(StgClosure *p)
+{
+ StgInfoTable *inf = get_itbl(p);
+ nat i, sz;
+
+ switch (inf->type) {
+ case BLACKHOLE:
+ case CAF_BLACKHOLE:
+ case THUNK_SELECTOR:
+ return;
+ case AP:
+ sz = ((StgAP *)p)->n_args + sizeofW(StgAP) - sizeofW(StgThunkHeader);
+ break;
+ case AP_STACK:
+ sz = ((StgAP_STACK *)p)->size + sizeofW(StgAP_STACK) - sizeofW(StgThunkHeader);
+ break;
+ default:
+ sz = inf->layout.payload.ptrs + inf->layout.payload.nptrs;
+ break;
+ }
+ for (i = 0; i < sz; i++) {
+ ((StgThunk *)p)->payload[i] = 0;
+ }
+}
+
+#endif /* CMINUSMINUS */
+
+#if !defined(DEBUG) || defined(THREADED_RTS)
+#define DEBUG_FILL_SLOP(p) /* do nothing */
+#else
+#define DEBUG_FILL_SLOP(p) FILL_SLOP(p)
+#endif
+
+/* We have two versions of this macro (sadly), one for use in C-- code,
+ * and the other for C.
+ *
+ * The and_then argument is a performance hack so that we can paste in
+ * the continuation code directly. It helps shave a couple of
+ * instructions off the common case in the update code, which is
+ * worthwhile (the update code is often part of the inner loop).
+ * (except that gcc now appears to common up this code again and
+ * invert the optimisation. Grrrr --SDM).
+ */
+#ifdef CMINUSMINUS
+#define generation(n) (W_[generations] + n*SIZEOF_generation)
+#define updateWithIndirection(ind_info, p1, p2, and_then) \
+ W_ bd; \
+ \
+ DEBUG_FILL_SLOP(p1); \
+ LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(p1); \
+ StgInd_indirectee(p1) = p2; \
+ foreign "C" wb() []; \
+ bd = Bdescr(p1); \
+ if (bdescr_gen_no(bd) != 0 :: CInt) { \
+ foreign "C" recordMutableCap(p1 "ptr", \
+ MyCapability() "ptr", \
+ bdescr_gen_no(bd)) [R1]; \
+ SET_INFO(p1, stg_IND_OLDGEN_info); \
+ LDV_RECORD_CREATE(p1); \
+ TICK_UPD_OLD_IND(); \
+ and_then; \
+ } else { \
+ SET_INFO(p1, ind_info); \
+ LDV_RECORD_CREATE(p1); \
+ TICK_UPD_NEW_IND(); \
+ and_then; \
+ }
+#else
+#define updateWithIndirection(ind_info, p1, p2, and_then) \
+ { \
+ bdescr *bd; \
+ \
+ /* cas(p1, 0, &stg_WHITEHOLE_info); */ \
+ ASSERT( (P_)p1 != (P_)p2 && !closure_IND(p1) ); \
+ DEBUG_FILL_SLOP(p1); \
+ LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(p1); \
+ ((StgInd *)p1)->indirectee = p2; \
+ wb(); \
+ bd = Bdescr((P_)p1); \
+ if (bd->gen_no != 0) { \
+ recordMutableGenLock(p1, &generations[bd->gen_no]); \
+ SET_INFO(p1, &stg_IND_OLDGEN_info); \
+ TICK_UPD_OLD_IND(); \
+ and_then; \
+ } else { \
+ SET_INFO(p1, ind_info); \
+ LDV_RECORD_CREATE(p1); \
+ TICK_UPD_NEW_IND(); \
+ and_then; \
+ } \
+ }
+#endif
+
+/* The permanent indirection version isn't performance critical. We
+ * therefore use an inline C function instead of the C-- macro.
+ */
+#ifndef CMINUSMINUS
+INLINE_HEADER void
+updateWithPermIndirection(StgClosure *p1,
+ StgClosure *p2)
+{
+ bdescr *bd;
+
+ ASSERT( p1 != p2 && !closure_IND(p1) );
+
+ /*
+ * @LDV profiling
+ * Destroy the old closure.
+ * Nb: LDV_* stuff cannot mix with ticky-ticky
+ */
+ LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(p1);
+
+ bd = Bdescr((P_)p1);
+ if (bd->gen_no != 0) {
+ recordMutableGenLock(p1, &generations[bd->gen_no]);
+ ((StgInd *)p1)->indirectee = p2;
+ SET_INFO(p1, &stg_IND_OLDGEN_PERM_info);
+ /*
+ * @LDV profiling
+ * We have just created a new closure.
+ */
+ LDV_RECORD_CREATE(p1);
+ TICK_UPD_OLD_PERM_IND();
+ } else {
+ ((StgInd *)p1)->indirectee = p2;
+ SET_INFO(p1, &stg_IND_PERM_info);
+ /*
+ * @LDV profiling
+ * We have just created a new closure.
+ */
+ LDV_RECORD_CREATE(p1);
+ TICK_UPD_NEW_PERM_IND(p1);
+ }
+}
+#endif
+
+#endif /* UPDATES_H */
diff --git a/rts/VisCallbacks.c b/rts/VisCallbacks.c
new file mode 100644
index 0000000000..8e3c6ceb6c
--- /dev/null
+++ b/rts/VisCallbacks.c
@@ -0,0 +1,75 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2000
+ *
+ * RTS GTK Front Panel (callbacks)
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef RTS_GTK_FRONTPANEL
+
+#include "Rts.h"
+
+#include <gtk/gtk.h>
+
+#include "VisCallbacks.h"
+#include "VisWindow.h"
+#include "VisSupport.h"
+#include "FrontPanel.h"
+
+void
+on_cont_radio_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ update_mode = Continuous;
+}
+
+
+void
+on_stop_before_radio_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ update_mode = BeforeGC;
+}
+
+
+void
+on_stop_after_radio_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ update_mode = AfterGC;
+}
+
+
+void
+on_stop_both_radio_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ update_mode = BeforeAfterGC;
+}
+
+
+void
+on_stop_but_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ stop_now = TRUE;
+}
+
+
+void
+on_continue_but_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ continue_now = TRUE;
+}
+
+
+void
+on_quit_but_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ quit = TRUE;
+}
+
+#endif /* RTS_GTK_FRONTPANEL */
diff --git a/rts/VisCallbacks.h b/rts/VisCallbacks.h
new file mode 100644
index 0000000000..d242010fad
--- /dev/null
+++ b/rts/VisCallbacks.h
@@ -0,0 +1,30 @@
+#include <gtk/gtk.h>
+
+
+void
+on_cont_radio_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_stop_before_radio_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_stop_after_radio_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_stop_both_radio_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_stop_but_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_continue_but_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_quit_but_clicked (GtkButton *button,
+ gpointer user_data);
diff --git a/rts/VisSupport.c b/rts/VisSupport.c
new file mode 100644
index 0000000000..a85c5f43a4
--- /dev/null
+++ b/rts/VisSupport.c
@@ -0,0 +1,144 @@
+/*
+ * DO NOT EDIT THIS FILE - it is generated by Glade.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+
+#include <gtk/gtk.h>
+
+#include "VisSupport.h"
+
+GtkWidget*
+lookup_widget (GtkWidget *widget,
+ const gchar *widget_name)
+{
+ GtkWidget *parent, *found_widget;
+
+ for (;;)
+ {
+ if (GTK_IS_MENU (widget))
+ parent = gtk_menu_get_attach_widget (GTK_MENU (widget));
+ else
+ parent = widget->parent;
+ if (!parent)
+ parent = (GtkWidget*) g_object_get_data (G_OBJECT (widget), "GladeParentKey");
+ if (parent == NULL)
+ break;
+ widget = parent;
+ }
+
+ found_widget = (GtkWidget*) g_object_get_data (G_OBJECT (widget),
+ widget_name);
+ if (!found_widget)
+ g_warning ("Widget not found: %s", widget_name);
+ return found_widget;
+}
+
+static GList *pixmaps_directories = NULL;
+
+/* Use this function to set the directory containing installed pixmaps. */
+void
+add_pixmap_directory (const gchar *directory)
+{
+ pixmaps_directories = g_list_prepend (pixmaps_directories,
+ g_strdup (directory));
+}
+
+/* This is an internally used function to find pixmap files. */
+static gchar*
+find_pixmap_file (const gchar *filename)
+{
+ GList *elem;
+
+ /* We step through each of the pixmaps directory to find it. */
+ elem = pixmaps_directories;
+ while (elem)
+ {
+ gchar *pathname = g_strdup_printf ("%s%s%s", (gchar*)elem->data,
+ G_DIR_SEPARATOR_S, filename);
+ if (g_file_test (pathname, G_FILE_TEST_EXISTS))
+ return pathname;
+ g_free (pathname);
+ elem = elem->next;
+ }
+ return NULL;
+}
+
+/* This is an internally used function to create pixmaps. */
+GtkWidget*
+create_pixmap (GtkWidget *widget,
+ const gchar *filename)
+{
+ gchar *pathname = NULL;
+ GtkWidget *pixmap;
+
+ if (!filename || !filename[0])
+ return gtk_image_new ();
+
+ pathname = find_pixmap_file (filename);
+
+ if (!pathname)
+ {
+ g_warning ("Couldn't find pixmap file: %s", filename);
+ return gtk_image_new ();
+ }
+
+ pixmap = gtk_image_new_from_file (pathname);
+ g_free (pathname);
+ return pixmap;
+}
+
+/* This is an internally used function to create pixmaps. */
+GdkPixbuf*
+create_pixbuf (const gchar *filename)
+{
+ gchar *pathname = NULL;
+ GdkPixbuf *pixbuf;
+ GError *error = NULL;
+
+ if (!filename || !filename[0])
+ return NULL;
+
+ pathname = find_pixmap_file (filename);
+
+ if (!pathname)
+ {
+ g_warning ("Couldn't find pixmap file: %s", filename);
+ return NULL;
+ }
+
+ pixbuf = gdk_pixbuf_new_from_file (pathname, &error);
+ if (!pixbuf)
+ {
+ fprintf (stderr, "Failed to load pixbuf file: %s: %s\n",
+ pathname, error->message);
+ g_error_free (error);
+ }
+ g_free (pathname);
+ return pixbuf;
+}
+
+/* This is used to set ATK action descriptions. */
+void
+glade_set_atk_action_description (AtkAction *action,
+ const gchar *action_name,
+ const gchar *description)
+{
+ gint n_actions, i;
+
+ n_actions = atk_action_get_n_actions (action);
+ for (i = 0; i < n_actions; i++)
+ {
+ if (!strcmp (atk_action_get_name (action, i), action_name))
+ atk_action_set_description (action, i, description);
+ }
+}
+
diff --git a/rts/VisSupport.h b/rts/VisSupport.h
new file mode 100644
index 0000000000..2dea079c2a
--- /dev/null
+++ b/rts/VisSupport.h
@@ -0,0 +1,44 @@
+/*
+ * DO NOT EDIT THIS FILE - it is generated by Glade.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <gtk/gtk.h>
+
+/*
+ * Public Functions.
+ */
+
+/*
+ * This function returns a widget in a component created by Glade.
+ * Call it with the toplevel widget in the component (i.e. a window/dialog),
+ * or alternatively any widget in the component, and the name of the widget
+ * you want returned.
+ */
+GtkWidget* lookup_widget (GtkWidget *widget,
+ const gchar *widget_name);
+
+
+/* Use this function to set the directory containing installed pixmaps. */
+void add_pixmap_directory (const gchar *directory);
+
+
+/*
+ * Private Functions.
+ */
+
+/* This is used to create the pixmaps used in the interface. */
+GtkWidget* create_pixmap (GtkWidget *widget,
+ const gchar *filename);
+
+/* This is used to create the pixbufs used in the interface. */
+GdkPixbuf* create_pixbuf (const gchar *filename);
+
+/* This is used to set ATK action descriptions. */
+void glade_set_atk_action_description (AtkAction *action,
+ const gchar *action_name,
+ const gchar *description);
+
diff --git a/rts/VisWindow.c b/rts/VisWindow.c
new file mode 100644
index 0000000000..188b88976e
--- /dev/null
+++ b/rts/VisWindow.c
@@ -0,0 +1,747 @@
+/*
+ * DO NOT EDIT THIS FILE - it is generated by Glade.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+
+#include <gdk/gdkkeysyms.h>
+#include <gtk/gtk.h>
+
+#include "VisCallbacks.h"
+#include "VisWindow.h"
+#include "VisSupport.h"
+
+#define GLADE_HOOKUP_OBJECT(component,widget,name) \
+ g_object_set_data_full (G_OBJECT (component), name, \
+ gtk_widget_ref (widget), (GDestroyNotify) gtk_widget_unref)
+
+#define GLADE_HOOKUP_OBJECT_NO_REF(component,widget,name) \
+ g_object_set_data (G_OBJECT (component), name, widget)
+
+/*
+ * Glade-generated constructor for the GHC front-panel top-level window.
+ *
+ * Builds the complete widget tree: a "Memory Map" frame, a "Generations"
+ * frame, a "Residency" graph frame, "Stats" and "Threads" tables, the
+ * "Updates" frame with GC-stop radio buttons and Stop/Continue buttons,
+ * a Quit button, and a status bar. It then connects the "clicked" signals
+ * to the handlers declared in VisCallbacks.h and registers every widget
+ * on the window via the GLADE_HOOKUP_OBJECT macros so lookup_widget()
+ * can retrieve them by name. Returns the (not yet shown) GtkWindow.
+ */
+GtkWidget*
+create_GHC_Front_Panel (void)
+{
+ GtkWidget *GHC_Front_Panel;
+ GtkWidget *vbox1;
+ GtkWidget *hbox1;
+ GtkWidget *vbox4;
+ GtkWidget *frame3;
+ GtkWidget *hbox3;
+ GtkWidget *label40;
+ GtkWidget *map_ruler;
+ GtkWidget *memmap;
+ GtkWidget *label1;
+ GtkWidget *frame8;
+ GtkWidget *vbox14;
+ GtkWidget *table4;
+ GtkWidget *gen_ruler;
+ GtkWidget *gen_hbox;
+ GtkWidget *generations;
+ GtkWidget *label39;
+ GtkWidget *label41;
+ GtkWidget *frame7;
+ GtkWidget *table3;
+ GtkWidget *res_hruler;
+ GtkWidget *res_vruler;
+ GtkWidget *res_drawingarea;
+ GtkWidget *label37;
+ GtkWidget *label38;
+ GtkWidget *label42;
+ GtkWidget *vbox5;
+ GtkWidget *frame5;
+ GtkWidget *vbox6;
+ GtkWidget *table1;
+ GtkWidget *label12;
+ GtkWidget *label13;
+ GtkWidget *label14;
+ GtkWidget *label15;
+ GtkWidget *label16;
+ GtkWidget *label17;
+ GtkWidget *label18;
+ GtkWidget *label19;
+ GtkWidget *live_label;
+ GtkWidget *allocated_label;
+ GtkWidget *footprint_label;
+ GtkWidget *alloc_rate_label;
+ GtkWidget *label43;
+ GtkWidget *frame9;
+ GtkWidget *table5;
+ GtkWidget *label20;
+ GtkWidget *label21;
+ GtkWidget *label22;
+ GtkWidget *label24;
+ GtkWidget *label26;
+ GtkWidget *label25;
+ GtkWidget *label27;
+ GtkWidget *running_label;
+ GtkWidget *blockread_label;
+ GtkWidget *blockwrite_label;
+ GtkWidget *blockmvar_label;
+ GtkWidget *blockthrowto_label;
+ GtkWidget *blockbh_label;
+ GtkWidget *sleeping_label;
+ GtkWidget *hseparator1;
+ GtkWidget *hseparator2;
+ GtkWidget *label35;
+ GtkWidget *total_label;
+ GtkWidget *label44;
+ GtkWidget *frame6;
+ GtkWidget *vbox7;
+ GtkWidget *vbox9;
+ GtkWidget *cont_radio;
+ GSList *cont_radio_group = NULL;
+ GtkWidget *stop_before_radio;
+ GtkWidget *stop_after_radio;
+ GtkWidget *stop_both_radio;
+ GtkWidget *vbox8;
+ GtkWidget *stop_but;
+ GtkWidget *continue_but;
+ GtkWidget *label45;
+ GtkWidget *quit_but;
+ GtkWidget *statusbar;
+
+ GHC_Front_Panel = gtk_window_new (GTK_WINDOW_TOPLEVEL);
+ gtk_widget_set_name (GHC_Front_Panel, "GHC_Front_Panel");
+ gtk_window_set_title (GTK_WINDOW (GHC_Front_Panel), "GHC Front Panel");
+ gtk_window_set_default_size (GTK_WINDOW (GHC_Front_Panel), 450, 600);
+
+ vbox1 = gtk_vbox_new (FALSE, 0);
+ gtk_widget_set_name (vbox1, "vbox1");
+ gtk_widget_show (vbox1);
+ gtk_container_add (GTK_CONTAINER (GHC_Front_Panel), vbox1);
+
+ hbox1 = gtk_hbox_new (FALSE, 10);
+ gtk_widget_set_name (hbox1, "hbox1");
+ gtk_widget_show (hbox1);
+ gtk_box_pack_start (GTK_BOX (vbox1), hbox1, TRUE, TRUE, 0);
+ gtk_container_set_border_width (GTK_CONTAINER (hbox1), 10);
+
+ vbox4 = gtk_vbox_new (FALSE, 10);
+ gtk_widget_set_name (vbox4, "vbox4");
+ gtk_widget_show (vbox4);
+ gtk_box_pack_start (GTK_BOX (hbox1), vbox4, TRUE, TRUE, 0);
+
+ frame3 = gtk_frame_new (NULL);
+ gtk_widget_set_name (frame3, "frame3");
+ gtk_widget_show (frame3);
+ gtk_box_pack_start (GTK_BOX (vbox4), frame3, TRUE, TRUE, 0);
+
+ hbox3 = gtk_hbox_new (FALSE, 0);
+ gtk_widget_set_name (hbox3, "hbox3");
+ gtk_widget_show (hbox3);
+ gtk_container_add (GTK_CONTAINER (frame3), hbox3);
+
+ label40 = gtk_label_new ("Mb");
+ gtk_widget_set_name (label40, "label40");
+ gtk_widget_show (label40);
+ gtk_box_pack_start (GTK_BOX (hbox3), label40, FALSE, FALSE, 0);
+ gtk_label_set_justify (GTK_LABEL (label40), GTK_JUSTIFY_CENTER);
+
+ map_ruler = gtk_vruler_new ();
+ gtk_widget_set_name (map_ruler, "map_ruler");
+ gtk_widget_show (map_ruler);
+ gtk_box_pack_start (GTK_BOX (hbox3), map_ruler, FALSE, FALSE, 0);
+ gtk_ruler_set_range (GTK_RULER (map_ruler), 0, 10, 1.40845, 10);
+
+ memmap = gtk_drawing_area_new ();
+ gtk_widget_set_name (memmap, "memmap");
+ gtk_widget_show (memmap);
+ gtk_box_pack_start (GTK_BOX (hbox3), memmap, TRUE, TRUE, 0);
+
+ label1 = gtk_label_new ("Memory Map");
+ gtk_widget_set_name (label1, "label1");
+ gtk_widget_show (label1);
+ gtk_frame_set_label_widget (GTK_FRAME (frame3), label1);
+
+ frame8 = gtk_frame_new (NULL);
+ gtk_widget_set_name (frame8, "frame8");
+ gtk_widget_show (frame8);
+ gtk_box_pack_start (GTK_BOX (vbox4), frame8, TRUE, TRUE, 0);
+
+ vbox14 = gtk_vbox_new (FALSE, 0);
+ gtk_widget_set_name (vbox14, "vbox14");
+ gtk_widget_show (vbox14);
+ gtk_container_add (GTK_CONTAINER (frame8), vbox14);
+
+ table4 = gtk_table_new (2, 3, FALSE);
+ gtk_widget_set_name (table4, "table4");
+ gtk_widget_show (table4);
+ gtk_box_pack_start (GTK_BOX (vbox14), table4, TRUE, TRUE, 0);
+
+ gen_ruler = gtk_vruler_new ();
+ gtk_widget_set_name (gen_ruler, "gen_ruler");
+ gtk_widget_show (gen_ruler);
+ gtk_table_attach (GTK_TABLE (table4), gen_ruler, 1, 2, 0, 1,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (GTK_EXPAND | GTK_FILL), 0, 0);
+ gtk_ruler_set_range (GTK_RULER (gen_ruler), 0, 10, 1.69935, 10);
+
+ gen_hbox = gtk_hbox_new (FALSE, 0);
+ gtk_widget_set_name (gen_hbox, "gen_hbox");
+ gtk_widget_show (gen_hbox);
+ gtk_table_attach (GTK_TABLE (table4), gen_hbox, 2, 3, 1, 2,
+ (GtkAttachOptions) (GTK_EXPAND | GTK_FILL),
+ (GtkAttachOptions) (GTK_FILL), 0, 0);
+
+ generations = gtk_drawing_area_new ();
+ gtk_widget_set_name (generations, "generations");
+ gtk_widget_show (generations);
+ gtk_table_attach (GTK_TABLE (table4), generations, 2, 3, 0, 1,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (GTK_FILL), 0, 0);
+
+ label39 = gtk_label_new ("Mb");
+ gtk_widget_set_name (label39, "label39");
+ gtk_widget_show (label39);
+ gtk_table_attach (GTK_TABLE (table4), label39, 0, 1, 0, 1,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label39), GTK_JUSTIFY_CENTER);
+
+ label41 = gtk_label_new ("Generations");
+ gtk_widget_set_name (label41, "label41");
+ gtk_widget_show (label41);
+ gtk_frame_set_label_widget (GTK_FRAME (frame8), label41);
+
+ frame7 = gtk_frame_new (NULL);
+ gtk_widget_set_name (frame7, "frame7");
+ gtk_widget_show (frame7);
+ gtk_box_pack_start (GTK_BOX (vbox4), frame7, TRUE, TRUE, 0);
+
+ table3 = gtk_table_new (3, 3, FALSE);
+ gtk_widget_set_name (table3, "table3");
+ gtk_widget_show (table3);
+ gtk_container_add (GTK_CONTAINER (frame7), table3);
+ gtk_container_set_border_width (GTK_CONTAINER (table3), 2);
+
+ res_hruler = gtk_hruler_new ();
+ gtk_widget_set_name (res_hruler, "res_hruler");
+ gtk_widget_show (res_hruler);
+ gtk_table_attach (GTK_TABLE (table3), res_hruler, 2, 3, 1, 2,
+ (GtkAttachOptions) (GTK_EXPAND | GTK_FILL),
+ (GtkAttachOptions) (GTK_FILL), 0, 0);
+ gtk_ruler_set_range (GTK_RULER (res_hruler), 0, 10, 8.35443, 10);
+
+ res_vruler = gtk_vruler_new ();
+ gtk_widget_set_name (res_vruler, "res_vruler");
+ gtk_widget_show (res_vruler);
+ gtk_table_attach (GTK_TABLE (table3), res_vruler, 1, 2, 2, 3,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (GTK_EXPAND | GTK_FILL), 0, 0);
+ gtk_ruler_set_range (GTK_RULER (res_vruler), 0, 10, 9.69925, 10);
+
+ res_drawingarea = gtk_drawing_area_new ();
+ gtk_widget_set_name (res_drawingarea, "res_drawingarea");
+ gtk_widget_show (res_drawingarea);
+ gtk_table_attach (GTK_TABLE (table3), res_drawingarea, 2, 3, 2, 3,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (GTK_FILL), 0, 0);
+
+ label37 = gtk_label_new ("Secs");
+ gtk_widget_set_name (label37, "label37");
+ gtk_widget_show (label37);
+ gtk_table_attach (GTK_TABLE (table3), label37, 2, 3, 0, 1,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label37), GTK_JUSTIFY_CENTER);
+
+ label38 = gtk_label_new ("Mb");
+ gtk_widget_set_name (label38, "label38");
+ gtk_widget_show (label38);
+ gtk_table_attach (GTK_TABLE (table3), label38, 0, 1, 2, 3,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label38), GTK_JUSTIFY_CENTER);
+
+ label42 = gtk_label_new ("Residency");
+ gtk_widget_set_name (label42, "label42");
+ gtk_widget_show (label42);
+ gtk_frame_set_label_widget (GTK_FRAME (frame7), label42);
+
+ vbox5 = gtk_vbox_new (FALSE, 10);
+ gtk_widget_set_name (vbox5, "vbox5");
+ gtk_widget_show (vbox5);
+ gtk_box_pack_end (GTK_BOX (hbox1), vbox5, FALSE, FALSE, 0);
+
+ frame5 = gtk_frame_new (NULL);
+ gtk_widget_set_name (frame5, "frame5");
+ gtk_widget_show (frame5);
+ gtk_box_pack_start (GTK_BOX (vbox5), frame5, FALSE, TRUE, 0);
+
+ vbox6 = gtk_vbox_new (FALSE, 0);
+ gtk_widget_set_name (vbox6, "vbox6");
+ gtk_widget_show (vbox6);
+ gtk_container_add (GTK_CONTAINER (frame5), vbox6);
+ gtk_container_set_border_width (GTK_CONTAINER (vbox6), 5);
+
+ table1 = gtk_table_new (4, 3, FALSE);
+ gtk_widget_set_name (table1, "table1");
+ gtk_widget_show (table1);
+ gtk_box_pack_start (GTK_BOX (vbox6), table1, TRUE, TRUE, 0);
+ gtk_table_set_col_spacings (GTK_TABLE (table1), 7);
+
+ label12 = gtk_label_new ("Allocated");
+ gtk_widget_set_name (label12, "label12");
+ gtk_widget_show (label12);
+ gtk_table_attach (GTK_TABLE (table1), label12, 0, 1, 1, 2,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label12), GTK_JUSTIFY_RIGHT);
+ gtk_misc_set_alignment (GTK_MISC (label12), 1, 0.5);
+
+ label13 = gtk_label_new ("Live");
+ gtk_widget_set_name (label13, "label13");
+ gtk_widget_show (label13);
+ gtk_table_attach (GTK_TABLE (table1), label13, 0, 1, 0, 1,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label13), GTK_JUSTIFY_RIGHT);
+ gtk_misc_set_alignment (GTK_MISC (label13), 1, 0.5);
+
+ label14 = gtk_label_new ("Allocation Rate");
+ gtk_widget_set_name (label14, "label14");
+ gtk_widget_show (label14);
+ gtk_table_attach (GTK_TABLE (table1), label14, 0, 1, 3, 4,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label14), GTK_JUSTIFY_RIGHT);
+ gtk_misc_set_alignment (GTK_MISC (label14), 1, 0.5);
+
+ label15 = gtk_label_new ("\t\tFootprint");
+ gtk_widget_set_name (label15, "label15");
+ gtk_widget_show (label15);
+ gtk_table_attach (GTK_TABLE (table1), label15, 0, 1, 2, 3,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label15), GTK_JUSTIFY_RIGHT);
+ gtk_misc_set_alignment (GTK_MISC (label15), 1, 0.5);
+
+ label16 = gtk_label_new ("M/sec");
+ gtk_widget_set_name (label16, "label16");
+ gtk_widget_show (label16);
+ gtk_table_attach (GTK_TABLE (table1), label16, 2, 3, 3, 4,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label16), GTK_JUSTIFY_CENTER);
+
+ label17 = gtk_label_new ("M");
+ gtk_widget_set_name (label17, "label17");
+ gtk_widget_show (label17);
+ gtk_table_attach (GTK_TABLE (table1), label17, 2, 3, 2, 3,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_misc_set_alignment (GTK_MISC (label17), 7.45058e-09, 0.5);
+
+ label18 = gtk_label_new ("M");
+ gtk_widget_set_name (label18, "label18");
+ gtk_widget_show (label18);
+ gtk_table_attach (GTK_TABLE (table1), label18, 2, 3, 1, 2,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label18), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (label18), 7.45058e-09, 0.5);
+
+ label19 = gtk_label_new ("M");
+ gtk_widget_set_name (label19, "label19");
+ gtk_widget_show (label19);
+ gtk_table_attach (GTK_TABLE (table1), label19, 2, 3, 0, 1,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label19), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (label19), 7.45058e-09, 0.5);
+
+ live_label = gtk_label_new ("");
+ gtk_widget_set_name (live_label, "live_label");
+ gtk_widget_show (live_label);
+ gtk_table_attach (GTK_TABLE (table1), live_label, 1, 2, 0, 1,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (live_label), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (live_label), 1, 0.5);
+
+ allocated_label = gtk_label_new ("");
+ gtk_widget_set_name (allocated_label, "allocated_label");
+ gtk_widget_show (allocated_label);
+ gtk_table_attach (GTK_TABLE (table1), allocated_label, 1, 2, 1, 2,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (allocated_label), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (allocated_label), 1, 0.5);
+
+ footprint_label = gtk_label_new ("");
+ gtk_widget_set_name (footprint_label, "footprint_label");
+ gtk_widget_show (footprint_label);
+ gtk_table_attach (GTK_TABLE (table1), footprint_label, 1, 2, 2, 3,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (footprint_label), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (footprint_label), 1, 0.5);
+
+ alloc_rate_label = gtk_label_new ("");
+ gtk_widget_set_name (alloc_rate_label, "alloc_rate_label");
+ gtk_widget_show (alloc_rate_label);
+ gtk_table_attach (GTK_TABLE (table1), alloc_rate_label, 1, 2, 3, 4,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (alloc_rate_label), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (alloc_rate_label), 1, 0.5);
+
+ label43 = gtk_label_new ("Stats");
+ gtk_widget_set_name (label43, "label43");
+ gtk_widget_show (label43);
+ gtk_frame_set_label_widget (GTK_FRAME (frame5), label43);
+
+ frame9 = gtk_frame_new (NULL);
+ gtk_widget_set_name (frame9, "frame9");
+ gtk_widget_show (frame9);
+ gtk_box_pack_start (GTK_BOX (vbox5), frame9, FALSE, TRUE, 0);
+
+ table5 = gtk_table_new (9, 2, FALSE);
+ gtk_widget_set_name (table5, "table5");
+ gtk_widget_show (table5);
+ gtk_container_add (GTK_CONTAINER (frame9), table5);
+ gtk_container_set_border_width (GTK_CONTAINER (table5), 6);
+ gtk_table_set_col_spacings (GTK_TABLE (table5), 10);
+
+ label20 = gtk_label_new ("Running");
+ gtk_widget_set_name (label20, "label20");
+ gtk_widget_show (label20);
+ gtk_table_attach (GTK_TABLE (table5), label20, 0, 1, 0, 1,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label20), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (label20), 1, 0.5);
+
+ label21 = gtk_label_new ("Blocked on I/O (Read)");
+ gtk_widget_set_name (label21, "label21");
+ gtk_widget_show (label21);
+ gtk_table_attach (GTK_TABLE (table5), label21, 0, 1, 1, 2,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label21), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (label21), 1, 0.5);
+
+ label22 = gtk_label_new ("Blocked on MVar");
+ gtk_widget_set_name (label22, "label22");
+ gtk_widget_show (label22);
+ gtk_table_attach (GTK_TABLE (table5), label22, 0, 1, 3, 4,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label22), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (label22), 1, 0.5);
+
+ label24 = gtk_label_new ("Blocked on throwTo");
+ gtk_widget_set_name (label24, "label24");
+ gtk_widget_show (label24);
+ gtk_table_attach (GTK_TABLE (table5), label24, 0, 1, 4, 5,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label24), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (label24), 1, 0.5);
+
+ label26 = gtk_label_new ("Blocked on Black Hole");
+ gtk_widget_set_name (label26, "label26");
+ gtk_widget_show (label26);
+ gtk_table_attach (GTK_TABLE (table5), label26, 0, 1, 5, 6,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label26), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (label26), 1, 0.5);
+
+ label25 = gtk_label_new ("Sleeping");
+ gtk_widget_set_name (label25, "label25");
+ gtk_widget_show (label25);
+ gtk_table_attach (GTK_TABLE (table5), label25, 0, 1, 6, 7,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label25), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (label25), 1, 0.5);
+
+ label27 = gtk_label_new ("Blocked on I/O (Write)");
+ gtk_widget_set_name (label27, "label27");
+ gtk_widget_show (label27);
+ gtk_table_attach (GTK_TABLE (table5), label27, 0, 1, 2, 3,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label27), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (label27), 1, 0.5);
+
+ running_label = gtk_label_new ("label28");
+ gtk_widget_set_name (running_label, "running_label");
+ gtk_widget_show (running_label);
+ gtk_table_attach (GTK_TABLE (table5), running_label, 1, 2, 0, 1,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (running_label), GTK_JUSTIFY_CENTER);
+
+ blockread_label = gtk_label_new ("label29");
+ gtk_widget_set_name (blockread_label, "blockread_label");
+ gtk_widget_show (blockread_label);
+ gtk_table_attach (GTK_TABLE (table5), blockread_label, 1, 2, 1, 2,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (blockread_label), GTK_JUSTIFY_CENTER);
+
+ blockwrite_label = gtk_label_new ("label30");
+ gtk_widget_set_name (blockwrite_label, "blockwrite_label");
+ gtk_widget_show (blockwrite_label);
+ gtk_table_attach (GTK_TABLE (table5), blockwrite_label, 1, 2, 2, 3,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (blockwrite_label), GTK_JUSTIFY_CENTER);
+
+ blockmvar_label = gtk_label_new ("label31");
+ gtk_widget_set_name (blockmvar_label, "blockmvar_label");
+ gtk_widget_show (blockmvar_label);
+ gtk_table_attach (GTK_TABLE (table5), blockmvar_label, 1, 2, 3, 4,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (blockmvar_label), GTK_JUSTIFY_CENTER);
+
+ blockthrowto_label = gtk_label_new ("label32");
+ gtk_widget_set_name (blockthrowto_label, "blockthrowto_label");
+ gtk_widget_show (blockthrowto_label);
+ gtk_table_attach (GTK_TABLE (table5), blockthrowto_label, 1, 2, 4, 5,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (blockthrowto_label), GTK_JUSTIFY_CENTER);
+
+ blockbh_label = gtk_label_new ("label33");
+ gtk_widget_set_name (blockbh_label, "blockbh_label");
+ gtk_widget_show (blockbh_label);
+ gtk_table_attach (GTK_TABLE (table5), blockbh_label, 1, 2, 5, 6,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (blockbh_label), GTK_JUSTIFY_CENTER);
+
+ sleeping_label = gtk_label_new ("label34");
+ gtk_widget_set_name (sleeping_label, "sleeping_label");
+ gtk_widget_show (sleeping_label);
+ gtk_table_attach (GTK_TABLE (table5), sleeping_label, 1, 2, 6, 7,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (sleeping_label), GTK_JUSTIFY_CENTER);
+
+ hseparator1 = gtk_hseparator_new ();
+ gtk_widget_set_name (hseparator1, "hseparator1");
+ gtk_widget_show (hseparator1);
+ gtk_table_attach (GTK_TABLE (table5), hseparator1, 0, 1, 7, 8,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (GTK_EXPAND | GTK_FILL), 0, 0);
+
+ hseparator2 = gtk_hseparator_new ();
+ gtk_widget_set_name (hseparator2, "hseparator2");
+ gtk_widget_show (hseparator2);
+ gtk_table_attach (GTK_TABLE (table5), hseparator2, 1, 2, 7, 8,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (GTK_FILL), 0, 0);
+
+ label35 = gtk_label_new ("Total");
+ gtk_widget_set_name (label35, "label35");
+ gtk_widget_show (label35);
+ gtk_table_attach (GTK_TABLE (table5), label35, 0, 1, 8, 9,
+ (GtkAttachOptions) (GTK_FILL),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (label35), GTK_JUSTIFY_CENTER);
+ gtk_misc_set_alignment (GTK_MISC (label35), 1, 0.5);
+
+ total_label = gtk_label_new ("label36");
+ gtk_widget_set_name (total_label, "total_label");
+ gtk_widget_show (total_label);
+ gtk_table_attach (GTK_TABLE (table5), total_label, 1, 2, 8, 9,
+ (GtkAttachOptions) (0),
+ (GtkAttachOptions) (0), 0, 0);
+ gtk_label_set_justify (GTK_LABEL (total_label), GTK_JUSTIFY_CENTER);
+
+ label44 = gtk_label_new ("Threads");
+ gtk_widget_set_name (label44, "label44");
+ gtk_widget_show (label44);
+ gtk_frame_set_label_widget (GTK_FRAME (frame9), label44);
+
+ frame6 = gtk_frame_new (NULL);
+ gtk_widget_set_name (frame6, "frame6");
+ gtk_widget_show (frame6);
+ gtk_box_pack_start (GTK_BOX (vbox5), frame6, FALSE, FALSE, 0);
+
+ vbox7 = gtk_vbox_new (FALSE, 10);
+ gtk_widget_set_name (vbox7, "vbox7");
+ gtk_widget_show (vbox7);
+ gtk_container_add (GTK_CONTAINER (frame6), vbox7);
+ gtk_container_set_border_width (GTK_CONTAINER (vbox7), 5);
+
+ vbox9 = gtk_vbox_new (FALSE, 0);
+ gtk_widget_set_name (vbox9, "vbox9");
+ gtk_widget_show (vbox9);
+ gtk_box_pack_start (GTK_BOX (vbox7), vbox9, TRUE, TRUE, 0);
+
+ /* The four GC-stop modes form a single radio group; "Continuous" is the
+ default active choice. */
+ cont_radio = gtk_radio_button_new_with_mnemonic (NULL, "Continuous");
+ gtk_widget_set_name (cont_radio, "cont_radio");
+ gtk_widget_show (cont_radio);
+ gtk_box_pack_start (GTK_BOX (vbox9), cont_radio, FALSE, FALSE, 0);
+ gtk_radio_button_set_group (GTK_RADIO_BUTTON (cont_radio), cont_radio_group);
+ cont_radio_group = gtk_radio_button_get_group (GTK_RADIO_BUTTON (cont_radio));
+ gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (cont_radio), TRUE);
+
+ stop_before_radio = gtk_radio_button_new_with_mnemonic (NULL, "Stop before GC");
+ gtk_widget_set_name (stop_before_radio, "stop_before_radio");
+ gtk_widget_show (stop_before_radio);
+ gtk_box_pack_start (GTK_BOX (vbox9), stop_before_radio, FALSE, FALSE, 0);
+ gtk_radio_button_set_group (GTK_RADIO_BUTTON (stop_before_radio), cont_radio_group);
+ cont_radio_group = gtk_radio_button_get_group (GTK_RADIO_BUTTON (stop_before_radio));
+
+ stop_after_radio = gtk_radio_button_new_with_mnemonic (NULL, "Stop after GC");
+ gtk_widget_set_name (stop_after_radio, "stop_after_radio");
+ gtk_widget_show (stop_after_radio);
+ gtk_box_pack_start (GTK_BOX (vbox9), stop_after_radio, FALSE, FALSE, 0);
+ gtk_radio_button_set_group (GTK_RADIO_BUTTON (stop_after_radio), cont_radio_group);
+ cont_radio_group = gtk_radio_button_get_group (GTK_RADIO_BUTTON (stop_after_radio));
+
+ stop_both_radio = gtk_radio_button_new_with_mnemonic (NULL, "Stop before & after GC");
+ gtk_widget_set_name (stop_both_radio, "stop_both_radio");
+ gtk_widget_show (stop_both_radio);
+ gtk_box_pack_start (GTK_BOX (vbox9), stop_both_radio, FALSE, FALSE, 0);
+ gtk_radio_button_set_group (GTK_RADIO_BUTTON (stop_both_radio), cont_radio_group);
+ cont_radio_group = gtk_radio_button_get_group (GTK_RADIO_BUTTON (stop_both_radio));
+
+ vbox8 = gtk_vbox_new (FALSE, 0);
+ gtk_widget_set_name (vbox8, "vbox8");
+ gtk_widget_show (vbox8);
+ gtk_box_pack_start (GTK_BOX (vbox7), vbox8, FALSE, FALSE, 0);
+
+ stop_but = gtk_button_new_with_mnemonic ("Stop");
+ gtk_widget_set_name (stop_but, "stop_but");
+ gtk_widget_show (stop_but);
+ gtk_box_pack_start (GTK_BOX (vbox8), stop_but, FALSE, FALSE, 0);
+
+ continue_but = gtk_button_new_with_mnemonic ("Continue");
+ gtk_widget_set_name (continue_but, "continue_but");
+ gtk_widget_show (continue_but);
+ gtk_box_pack_start (GTK_BOX (vbox8), continue_but, FALSE, FALSE, 0);
+
+ label45 = gtk_label_new ("Updates");
+ gtk_widget_set_name (label45, "label45");
+ gtk_widget_show (label45);
+ gtk_frame_set_label_widget (GTK_FRAME (frame6), label45);
+
+ quit_but = gtk_button_new_with_mnemonic ("Quit");
+ gtk_widget_set_name (quit_but, "quit_but");
+ gtk_widget_show (quit_but);
+ gtk_box_pack_end (GTK_BOX (vbox5), quit_but, FALSE, FALSE, 0);
+
+ statusbar = gtk_statusbar_new ();
+ gtk_widget_set_name (statusbar, "statusbar");
+ gtk_widget_show (statusbar);
+ gtk_box_pack_start (GTK_BOX (vbox1), statusbar, FALSE, FALSE, 0);
+
+ /* Wire the buttons and radio buttons to the handlers declared in
+ VisCallbacks.h. */
+ g_signal_connect ((gpointer) cont_radio, "clicked",
+ G_CALLBACK (on_cont_radio_clicked),
+ NULL);
+ g_signal_connect ((gpointer) stop_before_radio, "clicked",
+ G_CALLBACK (on_stop_before_radio_clicked),
+ NULL);
+ g_signal_connect ((gpointer) stop_after_radio, "clicked",
+ G_CALLBACK (on_stop_after_radio_clicked),
+ NULL);
+ g_signal_connect ((gpointer) stop_both_radio, "clicked",
+ G_CALLBACK (on_stop_both_radio_clicked),
+ NULL);
+ g_signal_connect ((gpointer) stop_but, "clicked",
+ G_CALLBACK (on_stop_but_clicked),
+ NULL);
+ g_signal_connect ((gpointer) continue_but, "clicked",
+ G_CALLBACK (on_continue_but_clicked),
+ NULL);
+ g_signal_connect ((gpointer) quit_but, "clicked",
+ G_CALLBACK (on_quit_but_clicked),
+ NULL);
+
+ /* Store pointers to all widgets, for use by lookup_widget(). */
+ GLADE_HOOKUP_OBJECT_NO_REF (GHC_Front_Panel, GHC_Front_Panel, "GHC_Front_Panel");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox1, "vbox1");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, hbox1, "hbox1");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox4, "vbox4");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame3, "frame3");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, hbox3, "hbox3");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label40, "label40");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, map_ruler, "map_ruler");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, memmap, "memmap");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label1, "label1");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame8, "frame8");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox14, "vbox14");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, table4, "table4");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, gen_ruler, "gen_ruler");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, gen_hbox, "gen_hbox");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, generations, "generations");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label39, "label39");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label41, "label41");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame7, "frame7");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, table3, "table3");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, res_hruler, "res_hruler");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, res_vruler, "res_vruler");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, res_drawingarea, "res_drawingarea");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label37, "label37");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label38, "label38");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label42, "label42");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox5, "vbox5");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame5, "frame5");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox6, "vbox6");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, table1, "table1");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label12, "label12");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label13, "label13");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label14, "label14");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label15, "label15");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label16, "label16");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label17, "label17");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label18, "label18");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label19, "label19");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, live_label, "live_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, allocated_label, "allocated_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, footprint_label, "footprint_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, alloc_rate_label, "alloc_rate_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label43, "label43");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame9, "frame9");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, table5, "table5");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label20, "label20");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label21, "label21");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label22, "label22");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label24, "label24");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label26, "label26");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label25, "label25");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label27, "label27");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, running_label, "running_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, blockread_label, "blockread_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, blockwrite_label, "blockwrite_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, blockmvar_label, "blockmvar_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, blockthrowto_label, "blockthrowto_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, blockbh_label, "blockbh_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, sleeping_label, "sleeping_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, hseparator1, "hseparator1");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, hseparator2, "hseparator2");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label35, "label35");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, total_label, "total_label");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label44, "label44");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame6, "frame6");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox7, "vbox7");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox9, "vbox9");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, cont_radio, "cont_radio");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, stop_before_radio, "stop_before_radio");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, stop_after_radio, "stop_after_radio");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, stop_both_radio, "stop_both_radio");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox8, "vbox8");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, stop_but, "stop_but");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, continue_but, "continue_but");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label45, "label45");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, quit_but, "quit_but");
+ GLADE_HOOKUP_OBJECT (GHC_Front_Panel, statusbar, "statusbar");
+
+ return GHC_Front_Panel;
+}
+
diff --git a/rts/VisWindow.h b/rts/VisWindow.h
new file mode 100644
index 0000000000..c646c40c02
--- /dev/null
+++ b/rts/VisWindow.h
@@ -0,0 +1,5 @@
+/*
+ * DO NOT EDIT THIS FILE - it is generated by Glade.
+ */
+
+GtkWidget* create_GHC_Front_Panel (void);
diff --git a/rts/Weak.c b/rts/Weak.c
new file mode 100644
index 0000000000..f010395221
--- /dev/null
+++ b/rts/Weak.c
@@ -0,0 +1,97 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-1999
+ *
+ * Weak pointers / finalizers
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#define COMPILING_RTS_MAIN
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "SchedAPI.h"
+#include "RtsFlags.h"
+#include "Weak.h"
+#include "Storage.h"
+#include "Schedule.h"
+#include "Prelude.h"
+#include "RtsAPI.h"
+
+StgWeak *weak_ptr_list;
+
+/*
+ * scheduleFinalizers() is called on the list of weak pointers found
+ * to be dead after a garbage collection. It overwrites each object
+ * with DEAD_WEAK, and creates a new thread to run the pending finalizers.
+ *
+ * This function is called just after GC. The weak pointers on the
+ * argument list are those whose keys were found to be not reachable,
+ * however the value and finalizer fields have by now been marked live.
+ * The weak pointer object itself may not be alive - i.e. we may be
+ * looking at either an object in from-space or one in to-space. It
+ * doesn't really matter either way.
+ *
+ * Pre-condition: sched_mutex _not_ held.
+ */
+
+void
+scheduleFinalizers(Capability *cap, StgWeak *list)
+{
+ StgWeak *w;
+ StgTSO *t;
+ StgMutArrPtrs *arr;
+ nat n;
+
+ // count number of finalizers, and kill all the weak pointers first...
+ n = 0;
+ for (w = list; w; w = w->link) {
+
+ // Better not be a DEAD_WEAK at this stage; the garbage
+ // collector removes DEAD_WEAKs from the weak pointer list.
+ ASSERT(w->header.info != &stg_DEAD_WEAK_info);
+
+ if (w->finalizer != &stg_NO_FINALIZER_closure) {
+ n++;
+ }
+
+#ifdef PROFILING
+ // A weak pointer is inherently used, so we do not need to call
+ // LDV_recordDead().
+ //
+ // Furthermore, when PROFILING is turned on, dead weak
+ // pointers are exactly as large as weak pointers, so there is
+ // no need to fill the slop, either. See stg_DEAD_WEAK_info
+ // in StgMiscClosures.hc.
+#endif
+ SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs);
+ }
+
+ // No finalizers to run?
+ if (n == 0) return;
+
+ IF_DEBUG(weak,debugBelch("weak: batching %d finalizers\n", n));
+
+ arr = (StgMutArrPtrs *)allocateLocal(cap, sizeofW(StgMutArrPtrs) + n);
+ TICK_ALLOC_PRIM(sizeofW(StgMutArrPtrs), n, 0);
+ SET_HDR(arr, &stg_MUT_ARR_PTRS_FROZEN_info, CCS_SYSTEM);
+ arr->ptrs = n;
+
+ n = 0;
+ for (w = list; w; w = w->link) {
+ if (w->finalizer != &stg_NO_FINALIZER_closure) {
+ arr->payload[n] = w->finalizer;
+ n++;
+ }
+ }
+
+ t = createIOThread(cap,
+ RtsFlags.GcFlags.initialStkSize,
+ rts_apply(cap,
+ rts_apply(cap,
+ (StgClosure *)runFinalizerBatch_closure,
+ rts_mkInt(cap,n)),
+ (StgClosure *)arr)
+ );
+ scheduleThread(cap,t);
+}
diff --git a/rts/Weak.h b/rts/Weak.h
new file mode 100644
index 0000000000..ba8c1ca913
--- /dev/null
+++ b/rts/Weak.h
@@ -0,0 +1,17 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Weak pointers / finalizers
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef WEAK_H
+#define WEAK_H
+
+#include "Capability.h"
+
+void scheduleFinalizers(Capability *cap, StgWeak *w);
+void markWeakList(void);
+
+#endif
diff --git a/rts/dotnet/Invoke.c b/rts/dotnet/Invoke.c
new file mode 100644
index 0000000000..585dcacaad
--- /dev/null
+++ b/rts/dotnet/Invoke.c
@@ -0,0 +1,1081 @@
+/*
+ * C callable bridge to the .NET object model
+ *
+ * Managed C++ is used to access the .NET object model via
+ * System.Reflection. Here we provide C callable functions
+ * to that functionality, which we then export via a COM
+ * component.
+ *
+ * Note: the _only_ reason why we're going via COM and not simply
+ * exposing the required via some DLL entry points, is that COM
+ * gives us location independence (i.e., the RTS doesn't need
+ * be told where this interop layer resides in order to hoik
+ * it in, the CLSID suffices (provided the component has been
+ * registered, of course.)) It is a bit tiresome to have to play
+ * by the .NET COM Interop's rules as regards argument arrays,
+ * so we may want to revisit this issue at some point.
+ *
+ * [ But why not simply use MC++ and provide C-callable entry
+ * points to the relevant functionality, and avoid COM interop
+ * altogether? Because we have to be able to (statically)
+ * link with gcc-compiled code, and linking MC++ and gcc-compiled
+ * object files doesn't work.]
+ *
+ * Note: you need something newer than gcc-2.95 to compile this
+ * code (I'm using gcc-3.2, which comes with mingw-2).
+ */
+#define _WIN32_DCOM
+#define COBJMACROS
+#include <stdio.h>
+#include <stdlib.h>
+#include <wtypes.h>
+#ifndef _MSC_VER
+#include <oaidl.h>
+#include <objbase.h>
+#include <oleauto.h>
+# if defined(COBJMACROS) && !defined(_MSC_VER)
+#define IErrorInfo_QueryInterface(T,r,O) (T)->lpVtbl->QueryInterface(T,r,O)
+#define IErrorInfo_AddRef(T) (T)->lpVtbl->AddRef(T)
+#define IErrorInfo_Release(T) (T)->lpVtbl->Release(T)
+#define IErrorInfo_GetSource(T,pbstr) (T)->lpVtbl->GetSource(T,pbstr)
+#define IErrorInfo_GetDescription(T,pbstr) (T)->lpVtbl->GetDescription(T,pbstr)
+
+#define ISupportErrorInfo_QueryInterface(T,r,O) (T)->lpVtbl->QueryInterface(T,r,O)
+#define ISupportErrorInfo_AddRef(T) (T)->lpVtbl->AddRef(T)
+#define ISupportErrorInfo_Release(T) (T)->lpVtbl->Release(T)
+#define ISupportErrorInfo_InterfaceSupportsErrorInfo(T,iid) (T)->lpVtbl->InterfaceSupportsErrorInfo(T,iid)
+# endif
+#endif
+#include "DNInvoke.h"
+#define WANT_UUID_DECLS
+#include "InvokerClient.h"
+#include "Dotnet.h"
+
+/* Local prototypes */
+static void genError( IUnknown* pUnk,
+ HRESULT hr,
+ char* loc,
+ char** pErrMsg);
+static int startBridge(char**);
+static int fromVariant
+ ( DotnetType resTy,
+ VARIANT* pVar,
+ void* res,
+ char** pErrMsg);
+static VARIANT* toVariant ( DotnetArg* p );
+
+/* Pointer to .NET COM component instance; instantiated on demand. */
+static InvokeBridge* pBridge = NULL;
+
+/* convert a char* to a BSTR, copied from the HDirect comlib/ sources */
+static
+HRESULT
+stringToBSTR( /*[in,ptr]*/const char* pstrz
+ , /*[out]*/ BSTR* pbstr
+ )
+{
+ int i;
+
+ if (!pbstr) {
+ return E_FAIL;
+ } else {
+ *pbstr = NULL;
+ }
+ if (!pstrz) {
+ return S_OK;
+ }
+
+ i = MultiByteToWideChar(CP_ACP, 0, pstrz, -1, NULL, 0);
+ if ( i < 0 ) {
+ return E_FAIL;
+ }
+ *pbstr = SysAllocStringLen(NULL,i-1);
+ if (*pbstr != NULL) {
+ MultiByteToWideChar(CP_ACP, 0, pstrz, -1, *pbstr, i-1);
+ // (*pbstr)[i]=0;
+ return S_OK;
+ } else {
+ return E_FAIL;
+ }
+}
+
+static
+char*
+bstrToString( BSTR bstr )
+{
+ int i,len;
+ char *res;
+ int blen;
+
+ if (!bstr) {
+ return NULL;
+ }
+
+ blen = SysStringLen(bstr);
+
+ /* pass in NULL for the multi-byte arg in order to compute length first */
+ len = WideCharToMultiByte(CP_ACP, 0, bstr, blen,
+ NULL, 0, NULL, NULL);
+ if (len == 0) return NULL;
+
+ /* Allocate string of required length. */
+ res = (char*)malloc(sizeof(char) * (len + 1));
+ if (!res) return NULL;
+
+ i = WideCharToMultiByte(CP_ACP, 0, bstr, blen,
+ res, (len+1), NULL, NULL);
+
+ /* Poor error handling to map this to NULL. */
+ if ( i == 0 ) return NULL;
+
+ /* Terminate and return */
+ res[i] = '\0';
+ return res;
+}
+
+static
+void
+freeArgs ( SAFEARRAY* psa )
+{
+ /* The argument SAFEARRAYs contain dynamically allocated
+ * VARIANTs. Release the VARIANT contents and its memory here.
+ */
+ long lb,ub;
+ int i;
+ HRESULT hr;
+ VARIANT *pv = NULL;
+
+ hr = SafeArrayGetLBound(psa, 1, &lb);
+ if (FAILED(hr)) {
+ fprintf(stderr, "freeArgs: failed fetching lower bound\n");
+ SafeArrayDestroy(psa);
+ return;
+ }
+ hr = SafeArrayGetUBound(psa, 1, &ub);
+ if (FAILED(hr)) {
+ fprintf(stderr, "freeArgs: failed fetching upper bound\n");
+ SafeArrayDestroy(psa);
+ return;
+ }
+ for ( i = 0; i < (ub - lb); i++ ) {
+ hr = SafeArrayGetElement(psa,(long*)&i,(void*)pv);
+ if (FAILED(hr)) {
+ fprintf(stderr, "freeArgs: unable to fetch element %d\n", i);
+ SafeArrayDestroy(psa);
+ return;
+ }
+ VariantClear(pv);
+ free(pv);
+ }
+ SafeArrayDestroy(psa);
+}
+
+static
+SAFEARRAY*
+marshalArgs ( DotnetArg* args,
+ unsigned int n_args )
+{
+ SAFEARRAY *psa;
+ SAFEARRAYBOUND rgsabound[1];
+ int i;
+ long idxArr[1];
+ HRESULT hr;
+ VARIANT* var;
+
+ rgsabound[0].lLbound = 0;
+ rgsabound[0].cElements = n_args;
+ psa = SafeArrayCreate(VT_VARIANT, 1, rgsabound);
+
+ for(i=0;i < n_args; i++) {
+ idxArr[0] = i;
+ var = toVariant(&args[i]);
+ hr = SafeArrayPutElement(psa, idxArr, (void*)var);
+ }
+ return psa;
+}
+
+/*
+ * ***** Accessing the .NET object model *****
+ *
+ * General remarks:
+ *
+ * - the functions report error conditions via their return value; a char*.
+ * If NULL, the call was successful. If not, the returned string
+ * contains the (dynamically allocated) error message.
+ *
+ * This unorthodox calling convention is used to simplify the task
+ * of interfacing to these funs from GHC-generated code.
+ */
+
+/*
+ * Function: DN_invokeStatic()
+ *
+ * Given assembly and fully-qualified name of a static .NET method,
+ * invoke it using the supplied arguments.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_invokeStatic ( char *assemName,
+ char *methName,
+ DotnetArg *args,
+ int n_args,
+ DotnetType resultTy,
+ void *res)
+{
+ SAFEARRAY* psa;
+ VARIANT result;
+ HRESULT hr;
+ BSTR b_assemName;
+ BSTR b_methName;
+ char* errMsg = NULL;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ /* Package up arguments */
+ psa = marshalArgs(args, n_args);
+ VariantInit(&result);
+
+ hr = stringToBSTR(assemName, &b_assemName);
+ hr = stringToBSTR(methName, &b_methName);
+
+ hr = InvokeBridge_InvokeStaticMethod(pBridge,
+ b_assemName,
+ b_methName,
+ psa,
+ &result);
+ SysFreeString(b_assemName);
+ SysFreeString(b_methName);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "DInvoke.invokeStatic", &errMsg);
+ return errMsg;
+ }
+
+ fromVariant(resultTy, &result, res, &errMsg);
+ freeArgs(psa);
+
+ return errMsg;
+}
+
+/*
+ * Function: DN_invokeMethod()
+ *
+ * Given method name and arguments, invoke .NET method on an object.
+ * The object ref / this-pointer is passed in as the last argument.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_invokeMethod ( char *clsAndMethName,
+ DotnetArg *args,
+ int n_args,
+ DotnetType resultTy,
+ void *res)
+{
+ SAFEARRAY* psa;
+ VARIANT result;
+ HRESULT hr;
+ char* methName;
+ BSTR b_methName;
+ char* errMsg = NULL;
+ VARIANT *thisPtr;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ if (n_args <= 0) {
+ genError(NULL, 0x0, "Invoke.invokeMethod - missing this pointer", &errMsg);
+ return errMsg;
+ }
+
+ /* The this-pointer is last */
+ thisPtr = toVariant(&args[n_args-1]);
+
+ /* Package up arguments */
+ psa = marshalArgs(args, n_args-1);
+ VariantInit(&result);
+
+ /* If the user has qualified method with class, ignore the class bit. */
+ if ( (methName = strrchr(clsAndMethName, '.')) == NULL) {
+ methName = clsAndMethName;
+ } else {
+ /* Skip past '.' */
+ methName++;
+ }
+
+ hr = stringToBSTR(methName, &b_methName);
+ hr = InvokeBridge_InvokeMethod(pBridge,
+ *thisPtr,
+ b_methName,
+ psa,
+ &result);
+ SysFreeString(b_methName);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "Invoke.invokeMethod", &errMsg);
+ return errMsg;
+ }
+
+ fromVariant(resultTy, &result, res, &errMsg);
+ freeArgs(psa);
+
+ return errMsg;
+}
+
+/*
+ * Function: DN_getField()
+ *
+ * Given a field name and an object pointer, read a field value.
+ * The object ref / this-pointer is passed in as the last argument.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_getField ( char *clsAndMethName,
+ DotnetArg *args,
+ int n_args,
+ DotnetType resultTy,
+ void *res)
+{
+ VARIANT result;
+ HRESULT hr;
+ char* methName;
+ BSTR b_methName;
+ char* errMsg = NULL;
+ VARIANT *thisPtr;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ if (n_args <= 0) {
+ genError(NULL, 0x0, "Invoke.getField - missing this pointer", &errMsg);
+ return errMsg;
+ }
+
+ /* The this-pointer is last */
+ thisPtr = toVariant(&args[n_args-1]);
+ VariantInit(&result);
+
+ /* If the user has qualified method with class, ignore the class bit. */
+ if ( (methName = strrchr(clsAndMethName, '.')) == NULL) {
+ methName = clsAndMethName;
+ } else {
+ /* Skip past '.' */
+ methName++;
+ }
+
+ hr = stringToBSTR(methName, &b_methName);
+ hr = InvokeBridge_GetField(pBridge,
+ *thisPtr,
+ b_methName,
+ &result);
+ SysFreeString(b_methName);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "Invoke.getField", &errMsg);
+ return errMsg;
+ }
+
+ fromVariant(resultTy, &result, res, &errMsg);
+ return errMsg;
+}
+
+/*
+ * Function: DN_setField()
+ *
+ * Given field name, a value and an object reference, set the field value of
+ * an object.
+ * The object ref / this-pointer is passed in as the last argument.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_setField ( char *clsAndMethName,
+ DotnetArg *args,
+ int n_args,
+ /* next two args are ignored */
+ DotnetType resultTy,
+ void *res)
+{
+ HRESULT hr;
+ char* methName;
+ BSTR b_methName;
+ char* errMsg = NULL;
+ VARIANT *thisPtr;
+ VARIANT *pVal;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ if (n_args != 2) {
+ genError(NULL, 0x0, "Invoke.setField - missing this pointer", &errMsg);
+ return errMsg;
+ }
+
+ /* The this-pointer is last */
+ thisPtr = toVariant(&args[1]);
+
+ /* Package up arguments */
+ pVal = toVariant(&args[0]);
+
+ /* If the user has qualified method with class, ignore the class bit. */
+ if ( (methName = strrchr(clsAndMethName, '.')) == NULL) {
+ methName = clsAndMethName;
+ } else {
+ /* Skip past '.' */
+ methName++;
+ }
+
+ hr = stringToBSTR(methName, &b_methName);
+ hr = InvokeBridge_SetField(pBridge,
+ *thisPtr,
+ b_methName,
+ *pVal);
+ SysFreeString(b_methName);
+ VariantClear(pVal);
+ free(pVal);
+ free(thisPtr);
+
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "Invoke.setField", &errMsg);
+ return errMsg;
+ }
+ return errMsg;
+}
+
+
+/*
+ * Function: DN_createObject()
+ *
+ * Given assembly and fully-qualified name of a type,
+ * invoke its (possibly parameterised) constructor.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_createObject ( char *assemName,
+ char *methName,
+ DotnetArg *args,
+ int n_args,
+ DotnetType resultTy,
+ void *res)
+{
+ SAFEARRAY* psa;
+ VARIANT result;
+ HRESULT hr;
+ BSTR b_assemName;
+ BSTR b_methName;
+ char* errMsg = NULL;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ /* Package up arguments */
+ psa = marshalArgs(args, n_args);
+ VariantInit(&result);
+
+ hr = stringToBSTR(assemName, &b_assemName);
+ hr = stringToBSTR(methName, &b_methName);
+
+ hr = InvokeBridge_CreateObject(pBridge,
+ b_assemName,
+ b_methName,
+ psa,
+ &result);
+ SysFreeString(b_assemName);
+ SysFreeString(b_methName);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "DN_createObject", &errMsg);
+ return errMsg;
+ }
+
+ fromVariant(resultTy, &result, res, &errMsg);
+ freeArgs(psa);
+
+ return errMsg;
+}
+
+/*
+ * Function: DN_getStatic()
+ *
+ * Given assembly and fully-qualified field name, fetch value of static
+ * field.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_getStatic ( char *assemName,
+ char *fieldClsName,
+ /* the next two args are ignored */
+ DotnetArg *args,
+ int n_args,
+ DotnetType resultTy,
+ void *res)
+{
+ VARIANT result;
+ HRESULT hr;
+ BSTR b_assemName;
+ BSTR b_clsName;
+ BSTR b_fieldName;
+ char* errMsg = NULL;
+ char* fieldName;
+ char* clsName = fieldName;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ fieldName = (char*)malloc(sizeof(char) * (strlen(fieldClsName) + 1));
+ strcpy(fieldName, fieldClsName);
+ clsName = fieldName;
+
+ if (( fieldName = strrchr(fieldName, '.')) == NULL ) {
+ genError((IUnknown*)pBridge, 0x0, "Invoke.getStatic - malformed field spec", &errMsg);
+ return errMsg;
+ }
+ *fieldName = '\0';
+ fieldName++;
+
+ VariantInit(&result);
+
+ hr = stringToBSTR(assemName, &b_assemName);
+ hr = stringToBSTR(fieldName, &b_fieldName);
+ hr = stringToBSTR(clsName, &b_clsName);
+ /* ToDo: honour assembly spec */
+ hr = InvokeBridge_GetStaticField(pBridge,
+ b_clsName,
+ b_fieldName,
+ &result);
+ SysFreeString(b_assemName);
+ SysFreeString(b_clsName);
+ SysFreeString(b_fieldName);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "Invoke.getStatic", &errMsg);
+ return errMsg;
+ }
+ fromVariant(resultTy, &result, res, &errMsg);
+
+ return errMsg;
+}
+
+/*
+ * Function: DN_setStatic()
+ *
+ * Given assembly and fully-qualified field name, set value of static
+ * field.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_setStatic ( char *assemName,
+ char *fieldClsName,
+ DotnetArg *args,
+ int n_args,
+ /* the next two args are ignored */
+ DotnetType resultTy,
+ void *res)
+{
+ VARIANT result;
+ VARIANT *pVal;
+ HRESULT hr;
+ BSTR b_assemName;
+ BSTR b_clsName;
+ BSTR b_fieldName;
+ char* errMsg = NULL;
+ char* fieldName;
+ char* clsName = fieldName;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ fieldName = (char*)malloc(sizeof(char) * (strlen(fieldClsName) + 1));
+ strcpy(fieldName, fieldClsName);
+ clsName = fieldName;
+
+ if (( fieldName = strrchr(fieldName, '.')) == NULL ) {
+ genError((IUnknown*)pBridge, 0x0, "Invoke.setStatic - malformed field spec", &errMsg);
+ return errMsg;
+ }
+ *fieldName = '\0';
+ fieldName++;
+
+ pVal = toVariant(&args[0]);
+ VariantInit(&result);
+
+ hr = stringToBSTR(assemName, &b_assemName);
+ hr = stringToBSTR(fieldName, &b_fieldName);
+ hr = stringToBSTR(clsName, &b_clsName);
+ /* ToDo: honour assembly spec */
+ hr = InvokeBridge_SetStaticField(pBridge,
+ b_clsName,
+ b_fieldName,
+ *pVal);
+ SysFreeString(b_assemName);
+ SysFreeString(b_clsName);
+ SysFreeString(b_fieldName);
+ VariantClear(pVal);
+ free(pVal);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "Invoke.setStatic", &errMsg);
+ return errMsg;
+ }
+ fromVariant(resultTy, &result, res, &errMsg);
+
+ return errMsg;
+}
+
+
+
+
+/*
+ * Function: startBridge(pErrMsg)
+ *
+ * Instantiates an InvokeBridge component, which is then
+ * used to interact with the .NET world.
+ *
+ * If the component isn't available locally, zero is returned.
+ * Otherwise, 1.
+ */
+static
+int
+startBridge(char** pErrMsg)
+{
+ HRESULT hr;
+ IUnknown *pUnk;
+
+ hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.createBridge.CoInitializeEx", pErrMsg);
+ return FALSE;
+ }
+
+ hr = CoCreateInstance( &CLSID_InvokeBridge,
+ NULL,
+ CLSCTX_INPROC_SERVER,
+ &IID_IUnknown,
+ (void**)&pUnk);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.createBridge.CoCreateInstance", pErrMsg);
+ return 0;
+ }
+
+ hr = IUnknown_QueryInterface(pUnk, &IID_InvokeBridge, (void**)&pBridge);
+ IUnknown_Release(pUnk);
+ if (FAILED(hr)) {
+ genError(pUnk, hr, "DInvoke.createBridge.QueryInterface.InvokeBridge", pErrMsg);
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Function: stopBridge()
+ *
+ * Releases the InvokeBridge object and closes the COM library.
+ *
+ */
+void
+stopDotnetBridge()
+{
+ if (pBridge) {
+ InvokeBridge_Release(pBridge);
+ pBridge = NULL;
+ CoUninitialize();
+ }
+ /* Match up the call to CoInitializeEx() in startBridge(). */
+}
+
+/*
+ * Function: genError()
+ *
+ * Construct a string describing an error condition given
+ * an HRESULT and a location.
+ *
+ * If an interface pointer is passed in via the first arg,
+ * attempts are made to get at richer error information through
+ * the IErrorInfo interface. (Note: we don't currently look for
+ * the _Exception interface for even more detailed info.)
+ *
+ */
+#define LOCATION_HDR "Location: "
+#define HRESULT_HDR "HRESULT: "
+#define SOURCE_HDR "Source: "
+#define DESCR_HDR "Description: "
+#define NEWLINE_EXTRA 3
+
+static
+void
+genError(IUnknown* pUnk,
+ HRESULT err,
+ char* loc,
+ char** pErrMsg)
+{
+ HRESULT hr;
+ HRESULT invoke_hr = err;
+ char* invoke_src = NULL;
+ char* invoke_descr = NULL;
+ char* buf;
+ int bufLen;
+
+ /* If an interface pointer has been supplied, look for
+ * IErrorInfo in order to get more detailed information
+ * on the failure.
+ *
+ * The CLR's .NET COM Interop implementation does provide
+ * IErrorInfo, so we're not really clutching at straws here..
+ *
+ * Note: CLR also reflects .NET exceptions via the _Exception*
+ * interface..
+ *
+ */
+ if (pUnk) {
+ ISupportErrorInfo *pSupp;
+ IErrorInfo *pErrInfo;
+ BSTR src = NULL;
+ BSTR descr = NULL;
+
+ hr = IUnknown_QueryInterface(pUnk,
+ &IID_ISupportErrorInfo,
+ (void**)&pSupp);
+ if ( SUCCEEDED(hr) ) {
+ hr = ISupportErrorInfo_InterfaceSupportsErrorInfo(pSupp,
+ &IID_InvokeBridge);
+ if ( SUCCEEDED(hr) ) {
+ hr = GetErrorInfo(0,&pErrInfo);
+ if ( SUCCEEDED(hr) ) {
+ IErrorInfo_GetSource(pErrInfo,&src);
+ IErrorInfo_GetDescription(pErrInfo,&descr);
+ invoke_src = bstrToString(src);
+ invoke_descr = bstrToString(descr);
+
+ IErrorInfo_Release(pErrInfo);
+ if (src) { SysFreeString(src); src = NULL; }
+ if (descr) { SysFreeString(descr); descr = NULL; }
+ }
+ ISupportErrorInfo_Release(pSupp);
+ }
+ }
+ }
+ /* Putting it all together.. */
+ bufLen = sizeof(LOCATION_HDR) + strlen(loc) + NEWLINE_EXTRA +
+ sizeof(HRESULT_HDR) + 16 + NEWLINE_EXTRA +
+ sizeof(SOURCE_HDR) + (invoke_src ? strlen(invoke_src) : 16) + NEWLINE_EXTRA +
+ sizeof(DESCR_HDR) + (invoke_descr ? strlen(invoke_descr) : 16) + NEWLINE_EXTRA;
+ buf = (char*) malloc(sizeof(char) * (bufLen + 1));
+ if (!buf) {
+ fprintf(stderr, "Unable to allocate %d for error message", (bufLen + 1));
+ *pErrMsg = NULL;
+ return;
+ }
+
+ _snprintf(buf, bufLen, "%s%s\n%s0x%08x\n%s%s\n%s%s",
+ LOCATION_HDR, loc,
+ HRESULT_HDR, invoke_hr,
+ SOURCE_HDR, invoke_src,
+ DESCR_HDR, invoke_descr);
+
+ /* Done with these chaps */
+ if (invoke_src) free(invoke_src);
+ if (invoke_descr) free(invoke_descr);
+
+ if (pErrMsg) *pErrMsg = buf;
+ fprintf(stderr, "**InvokeBridge Error:\n%s", buf); fflush(stderr);
+}
+
+/* Converting to/from VARIANTs */
+
+/*
+ * Function: fromVariant()
+ *
+ * Unmarshal the contents of a VARIANT, converting its embedded value
+ * into the desired DotnetType (if possible.)
+ *
+ * Returns 1 if successful, 0 otherwise. If the conversion fails,
+ * *pErrMsg holds the error message string.
+ */
+static
+int
+fromVariant (DotnetType resTy,
+ VARIANT* pVar,
+ void* res,
+ char** pErrMsg)
+{
+ VARIANT vNew;
+ HRESULT hr;
+
+ VariantInit(&vNew);
+ switch(resTy) {
+ case Dotnet_Byte:
+ case Dotnet_Char:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UI1);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UI1}", pErrMsg);
+ return FALSE;
+ }
+ *((unsigned char*)res) = vNew.bVal;
+ return 1;
+ case Dotnet_Boolean:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_BOOL);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_BOOL}", pErrMsg);
+ return 0;
+ }
+ *((unsigned char*)res) = vNew.bVal;
+ return 1;
+ case Dotnet_Int:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_INT);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_INT}", pErrMsg);
+ return 0;
+ }
+ *((int*)res) = vNew.intVal;
+ return 1;
+ case Dotnet_Int8:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_I1);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_I1}", pErrMsg);
+ return 0;
+ }
+ *((signed char*)res) = vNew.bVal;
+ return 1;
+ case Dotnet_Int16:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_I2);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_I2}", pErrMsg);
+ return 0;
+ }
+ *((signed short*)res) = vNew.iVal;
+ return 1;
+ case Dotnet_Int32:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_I4);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_I4}", pErrMsg);
+ return 0;
+ }
+ *((signed int*)res) = vNew.lVal;
+ return 1;
+ case Dotnet_Int64:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_I8);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_I8}", pErrMsg);
+ return 0;
+ }
+#ifdef _MSC_VER
+ *((__int64*)res) = vNew.llVal;
+#else
+ *((long long*)res) = vNew.lVal;
+#endif
+ return 1;
+ case Dotnet_Float:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_R4);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_R4}", pErrMsg);
+ return 0;
+ }
+ *((float*)res) = vNew.fltVal;
+ return 1;
+ case Dotnet_Double:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_R8);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_R4}", pErrMsg);
+ return 0;
+ }
+ *((double*)res) = vNew.dblVal;
+ return 1;
+ case Dotnet_Word8:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UI1);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UI1}", pErrMsg);
+ return 0;
+ }
+ *((unsigned char*)res) = vNew.bVal;
+ return 1;
+ case Dotnet_Word16:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UI2);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UI2}", pErrMsg);
+ return 0;
+ }
+ *((unsigned short*)res) = vNew.uiVal;
+ return 1;
+ case Dotnet_Word32:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UI4);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UI4}", pErrMsg);
+ return 0;
+ }
+ *((unsigned int*)res) = vNew.ulVal;
+ return 1;
+ case Dotnet_Word64:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UI8);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UI8}", pErrMsg);
+ return 0;
+ }
+#ifdef _MSC_VER
+ *((unsigned __int64*)res) = vNew.ullVal;
+#else
+ *((unsigned long long*)res) = vNew.lVal;
+#endif
+ return 1;
+ case Dotnet_Ptr:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_BYREF);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_BYREF}", pErrMsg);
+ return 0;
+ }
+ *((void**)res) = vNew.byref;
+ return 1;
+ case Dotnet_Unit:
+ return 0;
+ case Dotnet_Object:
+ if ( pVar->vt == VT_BSTR ) {
+ /* Special handling for strings. If the user has asked for
+ * the string in object form, give him/her that.
+ */
+ VARIANT res;
+
+ VariantInit(&res);
+ hr = InvokeBridge_NewString(pBridge,
+ pVar->bstrVal,
+ &res);
+ if (SUCCEEDED(hr)) {
+ pVar = &res;
+ }
+ }
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UNKNOWN);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UNKNOWN}", pErrMsg);
+ return 0;
+ }
+ *((IUnknown**)res) = vNew.punkVal;
+ return 1;
+ case Dotnet_String:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_BSTR);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_BSTR}", pErrMsg);
+ return 0;
+ }
+ /* Storage is allocated by malloc(), caller is resp for freeing. */
+ *((char**)res) = bstrToString(vNew.bstrVal);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Function: toVariant()
+ *
+ * Convert a DotnetArg into a VARIANT. The VARIANT
+ * is dynamically allocated.
+ *
+ * The result is the pointer to the filled-in VARIANT structure;
+ * NULL if allocation failed.
+ *
+ */
+static
+VARIANT*
+toVariant ( DotnetArg* p )
+{
+ VARIANT* v = (VARIANT*)malloc(sizeof(VARIANT));
+ if (!v) return NULL;
+ VariantInit(v);
+ switch (p->arg_type) {
+ case Dotnet_Byte:
+ v->vt = VT_UI1;
+ v->bVal = p->arg.arg_byte;
+ break;
+ case Dotnet_Char:
+ v->vt = VT_UI1;
+ v->bVal = p->arg.arg_char;
+ break;
+ case Dotnet_Boolean:
+ v->vt = VT_BOOL;
+ v->boolVal = p->arg.arg_bool;
+ break;
+ case Dotnet_Int:
+ v->vt = VT_INT;
+ v->intVal = p->arg.arg_int;
+ break;
+ case Dotnet_Int8:
+ v->vt = VT_I1;
+ v->bVal = p->arg.arg_int8;
+ break;
+ case Dotnet_Int16:
+ v->vt = VT_I2;
+ v->iVal = p->arg.arg_int16;
+ break;
+ case Dotnet_Int32:
+ v->vt = VT_I4;
+ v->lVal = p->arg.arg_int32;
+ break;
+ case Dotnet_Int64:
+ v->vt = VT_I8;
+#ifdef _MSC_VER
+ v->llVal = p->arg.arg_int64;
+#else
+ (long long*)(v->lVal) = p->arg.arg_int64;
+#endif
+ break;
+ case Dotnet_Float:
+ v->vt = VT_R4;
+ v->fltVal = p->arg.arg_float;
+ break;
+ case Dotnet_Double:
+ v->vt = VT_R8;
+ v->dblVal = p->arg.arg_double;
+ break;
+ case Dotnet_Word8:
+ v->vt = VT_UI1;
+ v->bVal = p->arg.arg_word8;
+ break;
+ case Dotnet_Word16:
+ v->vt = VT_UI2;
+ v->uiVal = p->arg.arg_word16;
+ break;
+ case Dotnet_Word32:
+ v->vt = VT_UI4;
+ v->ulVal = p->arg.arg_word32;
+ break;
+ case Dotnet_Word64:
+ v->vt = VT_UI8;
+#ifdef _MSC_VER
+ v->ullVal = p->arg.arg_word64;
+#else
+ (unsigned long long*)(v->lVal) = p->arg.arg_word64;
+#endif
+ break;
+ case Dotnet_Ptr:
+ v->vt = VT_BYREF;
+ v->byref = p->arg.arg_ptr;
+ break;
+ case Dotnet_Unit:
+ v->vt = VT_EMPTY;
+ break;
+ case Dotnet_Object:
+ v->vt = VT_UNKNOWN;
+ v->punkVal = (IUnknown*)p->arg.arg_obj;
+ break;
+ case Dotnet_String: {
+ BSTR b;
+ HRESULT hr;
+ v->vt = VT_BSTR;
+ hr = stringToBSTR((const char*)p->arg.arg_str,&b);
+ v->bstrVal = b;
+ break; }
+ }
+ return v;
+}
diff --git a/rts/dotnet/Invoker.cpp b/rts/dotnet/Invoker.cpp
new file mode 100644
index 0000000000..d8ad87212d
--- /dev/null
+++ b/rts/dotnet/Invoker.cpp
@@ -0,0 +1,338 @@
+//
+// (c) 2002-2003, sof.
+//
+// Dynamic invocation helper classes. The details of how
+// to access the .NET object model via the Reflection API
+// is taken care of by Invoker.{h,cpp}
+//
+#include "Invoker.h"
+
+namespace DynInvoke {
+
+// ParseType: split a spec of the form "[assembly]Name.Space.Class"
+// into a TypeName { m_assembly, m_class }; any of " ,()" terminates
+// the class-name portion. m_length tracks how many characters of the
+// input were consumed.
+//
+// NOTE(review): when no delimiter is found (endPos == -1) m_length is
+// still incremented by endPos-curPos, i.e. by a negative amount --
+// looks suspicious; confirm against callers before relying on Length().
+static TypeName* ParseType(String* str) {
+ int curPos = 0;
+ int endPos;
+
+ // Console::WriteLine("x{0}y", str);
+ TypeName* typeName = new TypeName();
+
+ // Optional leading "[assembly]" prefix.
+ if ( str->get_Chars(0) == '[' ) {
+ endPos = str->IndexOf(']');
+ curPos = endPos + 1;
+ typeName->m_assembly = str->Substring(1,endPos-1);
+ typeName->m_length = endPos+1;
+ }
+ String* delimStr = " ,()";
+ Char delims __gc [] = delimStr->ToCharArray();
+
+ endPos = str->IndexOfAny(delims,curPos);
+ // Console::WriteLine("{0} {1} x{2}x", __box(endPos), __box(curPos), str);
+ if ( endPos == -1 ) {
+ // No delimiter: the rest of the string is the class name.
+ typeName->m_class = str->Substring(curPos);
+ } else {
+ typeName->m_class = str->Substring(curPos,endPos-curPos);
+ }
+
+ // typeName->m_class = str->Substring(curPos,endPos-curPos);
+ typeName->m_length += endPos-curPos;
+
+ return typeName;
+}
+
+// Method: GetType(String* typeName);
+//
+// Purpose: Assembly-savvy version of Type::GetType()
+//
+// First tries the plain name; then retries with each assembly
+// recorded in m_assemblies appended ("Name,Assembly"). Returns 0
+// if the type cannot be resolved anywhere.
+//
+Type* InvokeBridge::GetType(String* typeName) {
+
+ try {
+ Type* t = Type::GetType(typeName);
+ if (t) return t;
+ } catch (Exception*) {
+ ;
+ }
+
+ // Fall back to qualifying the name with each known assembly.
+ for (int i=0;i < InvokeBridge::m_assemblies->Count; i++) {
+ try {
+ String* stuff = String::Format("{0},{1}",typeName,InvokeBridge::m_assemblies->get_Item(i)->ToString());
+ // Console::WriteLine(stuff);
+ Type* t = Type::GetType(stuff);
+ if (t) {
+ return t;
+ }
+ } catch (Exception*) {
+ continue;
+ }
+ }
+ return 0;
+}
+
+//
+// Method: CreateInstance(TypeName* typeName, Object* args[])
+//
+// Purpose: Assembly-savvy invocation of Activator::CreateInstance.
+// Returns the new instance, or 0 if the type could not be resolved
+// or construction failed.
+//
+Object* InvokeBridge::CreateInstance(TypeName* typeName,
+                                     Object* args[]) {
+
+  Type* t = InvokeBridge::GetType(typeName->toStdString());
+
+  // Fallback 1: load the assembly named in the spec directly.
+  if (!t) {
+    try {
+      Assembly* localA = Assembly::LoadFrom(typeName->m_assembly);
+      t = localA->GetType(typeName->m_class);
+    } catch (Exception*) {
+      ;
+    }
+  }
+
+  // Fallback 2: try the partially-named HugsAssembly.
+  if (!t) {
+    try {
+      Assembly* localA = Assembly::LoadWithPartialName("HugsAssembly");
+      t = localA->GetType(typeName->m_class);
+    } catch (Exception*) {
+      ;
+    }
+  }
+
+  if (t) {
+    try {
+      Object* o = Activator::CreateInstance(t,(Object* [])args);
+      return o;
+    } catch (Exception* e) {
+      Console::WriteLine("Failure: {0}", e);
+      return 0;
+    }
+  }
+  // BUG FIX: the original fell off the end of this non-void function
+  // when the type could not be resolved at all (undefined behaviour).
+  return 0;
+}
+
+//
+// Method: CreateObject(String* objSpec, Object* args[])
+//
+// Purpose: Given a fully qualified name of a class/type, try
+// to create an instance of it.
+//
+// An explicit assemName, when non-empty, overrides any assembly
+// embedded in objSpec. Exceptions from CreateInstance are logged
+// and re-thrown; a null result is logged and returned as-is.
+//
+Object* InvokeBridge::CreateObject(String* assemName,
+ String* objSpec,
+ Object* args[]) {
+
+ Object* instance = 0;
+
+ // Unravel the name of the class/type.
+ TypeName* typeName = ParseType(objSpec);
+
+ if (assemName != 0 && assemName->Length > 0) {
+ typeName->m_assembly = assemName;
+ }
+
+ // Try creating the instance..
+ try {
+ instance = InvokeBridge::CreateInstance(typeName,(Object* [])args);
+ } catch (Exception* e) {
+ Console::WriteLine("Unable to create instance \"{0}\" {1}", objSpec, e);
+ throw(e);
+ }
+ if (!instance) {
+ Console::WriteLine("Unable to create instance \"{0}\"", objSpec);
+ }
+ return instance;
+}
+
+//
+// Method: InvokeMethod
+//
+// Purpose: Given a pointer to an already created object, look up
+// one of its method. If found, invoke the method passing it
+// 'args' as arguments.
+//
+// Resolution is delegated to Type::InvokeMember with both public
+// and non-public, instance and static bindings.
+//
+Object*
+InvokeBridge::InvokeMethod(Object* obj,
+ String* methName,
+ Object* args[]) {
+ // Get the methods from the type
+ // NOTE(review): 'methods' is only used as an existence check and
+ // 'mInfo' is never used; InvokeMember does the real lookup below.
+ MethodInfo* methods __gc[] = obj->GetType()->GetMethods();
+ MethodInfo* mInfo;
+
+ if (!methods) {
+ Console::WriteLine("InvokeMethod: No matching types found");
+ return 0;
+ }
+
+ System::Reflection::BindingFlags flgs
+ = (System::Reflection::BindingFlags) // why do I need to cast?
+ (System::Reflection::BindingFlags::Public |
+ System::Reflection::BindingFlags::NonPublic |
+ System::Reflection::BindingFlags::Instance |
+ System::Reflection::BindingFlags::Static |
+ System::Reflection::BindingFlags::InvokeMethod);
+
+ /* Caller is assumed to catch any exceptions raised. */
+ return obj->GetType()->InvokeMember(methName,
+ flgs,
+ 0,
+ obj,
+ (Object __gc* [])args);
+}
+
+//
+// Method: InvokeStaticMethod
+//
+// Purpose: Invoke a static method, given the fully qualified name
+// of the method (and its arguments). If found, invoke the
+// method passing it 'args' as arguments.
+//
+// typeAndMethName is split at the last '.' into class and method
+// parts; the class is resolved via GetType() with an assembly-load
+// fallback. Returns 0 if the type cannot be found.
+//
+Object* InvokeBridge::InvokeStaticMethod(String* assemName,
+ String* typeAndMethName,
+ Object* args[]) {
+
+ // Get the methods from the type
+ // NOTE(review): 'methods' is assigned but never read afterwards and
+ // 'mInfo' is unused; InvokeMember performs the actual lookup.
+ MethodInfo* methods __gc[];
+ MethodInfo* mInfo;
+
+ int lastDot = typeAndMethName->LastIndexOf('.');
+ String* className = typeAndMethName->Substring(0,lastDot);
+ String* methName = typeAndMethName->Substring(lastDot+1);
+
+ // Unravel the name of the class/type.
+ TypeName* typeName = ParseType(className);
+ Type* t;
+
+ if (assemName != 0 && assemName->Length > 0) {
+ typeName->m_assembly = assemName;
+ }
+
+ try {
+ t = InvokeBridge::GetType(typeName->toStdString());
+
+ if (!t) {
+ // Fallback: load the assembly named in the spec directly.
+ try {
+ Assembly* localA = Assembly::LoadFrom(typeName->m_assembly);
+ t = localA->GetType(typeName->m_class);
+ // Console::WriteLine("InvokeStaticMethod: Type {0} found", t);
+ } catch (Exception* e) {
+ ;
+ }
+ }
+
+ if (t) {
+ methods = t->GetMethods();
+ } else {
+ Console::WriteLine("InvokeStaticMethod: Type {0} not found", className);
+ return 0;
+ }
+ } catch (Exception *e) {
+ Console::WriteLine("InvokeStaticMethod: Type {0} not found", className);
+ throw(e);
+ }
+
+ System::Reflection::BindingFlags flgs
+ = (System::Reflection::BindingFlags) // why do I need to cast?
+ (System::Reflection::BindingFlags::DeclaredOnly |
+ System::Reflection::BindingFlags::Public |
+ System::Reflection::BindingFlags::NonPublic |
+ System::Reflection::BindingFlags::Static |
+ System::Reflection::BindingFlags::InvokeMethod);
+
+ return t->InvokeMember(methName,
+ flgs,
+ 0,
+ 0,
+ (Object __gc* [])args);
+}
+
+//
+// Method: GetField
+//
+// Purpose: Fetch the (boxed) value of named field of a given object.
+//
+// NOTE(review): GetField() returns 0 for an unknown field name, in
+// which case fInfo->GetValue() raises a NullReferenceException --
+// confirm callers expect that.
+//
+Object* InvokeBridge::GetField(Object* obj, System::String* fieldName) {
+
+ FieldInfo* fInfo = obj->GetType()->GetField(fieldName);
+ return fInfo->GetValue(obj);
+}
+
+//
+// Method: GetStaticField
+//
+// Purpose: Fetch the (boxed) value of named static field.
+//
+// Searches public and non-public static fields, flattening the
+// class hierarchy. NOTE(review): no null-check on ty or fInfo.
+//
+Object* InvokeBridge::GetStaticField(System::String* clsName,
+ System::String* fieldName) {
+
+ Type* ty = InvokeBridge::GetType(clsName);
+ System::Reflection::BindingFlags static_field_flgs
+ = (System::Reflection::BindingFlags)
+ (System::Reflection::BindingFlags::Public |
+ System::Reflection::BindingFlags::NonPublic |
+ System::Reflection::BindingFlags::FlattenHierarchy |
+ System::Reflection::BindingFlags::Static);
+
+ FieldInfo* fInfo = ty->GetField(fieldName, static_field_flgs);
+ return fInfo->GetValue(0); // according to doc, ok to pass any val here.
+}
+
+//
+// Method: SetField
+//
+// Purpose: Replace the (boxed) value of named field of a given object.
+//
+// NOTE(review): no null-check on fInfo -- unknown field names raise
+// a NullReferenceException here.
+//
+void InvokeBridge::SetField(Object* obj, System::String* fieldName, Object* val) {
+
+ FieldInfo* fInfo = obj->GetType()->GetField(fieldName);
+ fInfo->SetValue(obj,val);
+ return;
+}
+
+//
+// Method: SetStaticField
+//
+// Purpose: Replace the (boxed) value of named static field.
+//
+// Mirrors GetStaticField's binding flags; NOTE(review): no
+// null-check on ty or fInfo.
+//
+void InvokeBridge::SetStaticField(System::String* clsName,
+ System::String* fieldName,
+ Object* val) {
+
+ Type* ty = InvokeBridge::GetType(clsName);
+ System::Reflection::BindingFlags static_field_flgs
+ = (System::Reflection::BindingFlags)
+ (System::Reflection::BindingFlags::Public |
+ System::Reflection::BindingFlags::NonPublic |
+ System::Reflection::BindingFlags::FlattenHierarchy |
+ System::Reflection::BindingFlags::Static);
+
+ FieldInfo* fInfo = ty->GetField(fieldName,static_field_flgs);
+ fInfo->SetValue(0,val);
+ return;
+}
+
+// NewString: copy a System.String and hand it back upcast to Object,
+// so COM interop marshals it as an object rather than a BSTR.
+Object* InvokeBridge::NewString(System::String* s)
+{
+ System::String* c = System::String::Copy(s);
+ return dynamic_cast<Object*>(c);
+}
+
+// NewArgArray: allocate an Object[] of length sz for holding (boxed)
+// constructor/method arguments.
+Array* InvokeBridge::NewArgArray(int sz)
+{
+ return Array::CreateInstance(__typeof(Object), sz);
+}
+
+// SetArg: store val at position idx of an argument vector
+// created by NewArgArray. No bounds checking beyond the CLR's own.
+void InvokeBridge::SetArg(Object* arr[], Object* val, int idx)
+{
+ arr->SetValue(val,idx);
+}
+
+// GetArg: read back position idx of an argument vector.
+Object* InvokeBridge::GetArg(Object* arr[], int idx)
+{
+ return arr->GetValue(idx);
+}
+
+} /* namespace */
diff --git a/rts/dotnet/Invoker.h b/rts/dotnet/Invoker.h
new file mode 100644
index 0000000000..d649a4c716
--- /dev/null
+++ b/rts/dotnet/Invoker.h
@@ -0,0 +1,197 @@
+//
+// (c) 2003, sof.
+//
+// Dynamic invocation helper classes. The details of how
+// to access the .NET object model via the Reflection API
+// is taken care of by Invoker.{h,cpp}
+//
+#pragma once
+#using <mscorlib.dll>
+
+using namespace System;
+using namespace System::Reflection;
+using namespace System::Text;
+using namespace System::Runtime::InteropServices;
+
+[assembly:AssemblyKeyFileAttribute(S"invoker.snk")];
+
+namespace DynInvoke {
+
+//
+// Class: TypeName
+//
+// Purpose: pairing up an assembly name and the type/class name.
+//
+// Filled in by ParseType(); toStdString() reconstitutes the
+// "Class,Assembly" form that Type::GetType() understands.
+//
+[ComVisible(false)]
+public __gc class TypeName {
+
+public:
+ System::String* m_assembly; // assembly name ("" if unqualified)
+ System::String* m_class; // fully qualified class/type name
+ int m_length; // chars of the input spec consumed by the parse
+
+ TypeName() {
+ m_assembly = String::Empty;
+ m_class = String::Empty;
+ m_length = 0;
+ }
+
+ // Debug aid: print "[assembly]Class" to the console.
+ void Print() {
+ if (m_assembly && m_assembly != String::Empty ) {
+ Console::Write("[");
+ Console::Write(m_assembly);
+ Console::Write("]");
+ }
+ Console::WriteLine(m_class);
+ }
+
+ int Length() { return m_length; }
+
+ // Render as "Class" or "Class,Assembly" for Type::GetType().
+ System::String* toStdString() {
+ System::String* res = new System::String(m_class->ToCharArray());
+
+ if (m_assembly && m_assembly != String::Empty ){
+ res = String::Concat(res, S",");
+ res = String::Concat(res, m_assembly);
+ }
+ return res;
+ }
+};
+
+//
+// Class: InvokeBridge
+//
+// Purpose: Collection of (static) methods for dynamically creating
+// objects and accessing methods/fields on them.
+//
+// Exposed to COM (AutoDual + fixed GUID) so unmanaged code -- see
+// InvokerClient.h -- can drive .NET reflection through it.
+//
+[ClassInterface(ClassInterfaceType::AutoDual),
+GuidAttribute("39D497D9-60E0-3525-B7F2-7BC096D3A2A3"),
+ComVisible(true)
+]
+public __gc class InvokeBridge {
+public:
+ // Pre-populate m_assemblies with the full names of every DLL that
+ // sits next to mscorlib, so GetType() can later qualify bare type
+ // names with candidate assemblies.
+ InvokeBridge() {
+ Assembly* corAss = Assembly::Load("mscorlib.dll");
+ System::String* dir = System::IO::Path::GetDirectoryName(corAss->Location);
+
+ m_assemblies = new System::Collections::ArrayList();
+
+ System::String* fs[] = System::IO::Directory::GetFiles(dir, "*.dll");
+ for (int i=0;i < fs->Length; i++) {
+ try {
+ Assembly* tAss = Assembly::LoadFrom(fs[i]);
+ m_assemblies->Add(tAss->FullName);
+ } catch (Exception* e) {
+ continue;
+ }
+ }
+ }
+
+ //
+ // Method: CreateObject(String* assemName, String* objSpec, Object* args[])
+ //
+ // Purpose: Given a fully qualified name of a class/type, try
+ // to create an instance of it.
+ //
+ Object* CreateObject(System::String* assemName,
+ System::String* objSpec,
+ Object* args[]);
+
+ //
+ // Method: InvokeMethod
+ //
+ // Purpose: Given a pointer to an already created object, look up
+ // one of its method. If found, invoke the method passing it
+ // 'args' as arguments.
+ //
+ // Comments: the format of the method-spec is "methodName(type1,..,typeN)" [N>=0]
+ //
+ Object* InvokeMethod(Object* obj,
+ System::String* methSpec,
+ Object* args[]);
+
+ //
+ // Method: InvokeStaticMethod
+ //
+ // Purpose: Invoke a static method, given the fully qualified name
+ // of the method (and its arguments). If found, invoke the
+ // method passing it 'args' as arguments.
+ //
+ // Comments: the format of the method-spec is
+ // "T1.T2.<..>.Tn.methodName(type1,..,typeN)" [N>=0]
+ //
+ Object* InvokeStaticMethod(System::String* assemName,
+ System::String* methSpec,
+ Object* args[]);
+
+ //
+ // Method: GetField
+ //
+ // Purpose: Fetch the (boxed) value of named field of a given object.
+ //
+ Object* GetField(Object* obj, System::String* fieldSpec);
+
+ //
+ // Method: GetField
+ //
+ // Purpose: Fetch the (boxed) value of named static field.
+ //
+ Object* GetStaticField(System::String* clsName,
+ System::String* fieldSpec);
+
+ //
+ // Method: SetField
+ //
+ // Purpose: Replace the (boxed) value of named field of a given object.
+ //
+ void SetField(Object* obj, System::String* fieldSpec, Object* val);
+
+ //
+ // Method: SetStaticField
+ //
+ // Purpose: Replace the (boxed) value of named field of a given object.
+ //
+ void SetStaticField(System::String* clsName,
+ System::String* fieldSpec,
+ Object* val);
+
+
+ //
+ // Method: NewString
+ //
+ // Purpose: construct a System.String object copy in a manner that avoids
+ // COM Interop from deconstructing it to a BSTR.
+ //
+ System::Object* NewString( System::String* s);
+
+ //
+ // Method: NewArgArray
+ //
+ // Purpose: create a new array for holding (boxed) arguments to constructors/
+ // methods.
+ //
+ Array* NewArgArray(int sz);
+
+ //
+ // Method: SetArg
+ //
+ // Purpose: set an entry in the argument vector.
+ //
+ void SetArg(Object* arr[], Object* val, int idx);
+
+ //
+ // Method: GetArg
+ //
+ // Purpose: get an entry in the argument vector.
+ //
+ Object* GetArg(Object* arr[], int idx);
+
+ // Assembly-savvy replacement for Type::GetType().
+ System::Type* InvokeBridge::GetType(System::String* typeName);
+
+protected:
+ // Full names of candidate assemblies, gathered by the constructor.
+ System::Collections::ArrayList __gc* m_assemblies;
+ // Shared resolution/instantiation helper for CreateObject().
+ Object* InvokeBridge::CreateInstance(TypeName* typeName,
+ Object* args[]);
+};
+
+} /* namespace */
diff --git a/rts/dotnet/InvokerClient.h b/rts/dotnet/InvokerClient.h
new file mode 100644
index 0000000000..122f455c01
--- /dev/null
+++ b/rts/dotnet/InvokerClient.h
@@ -0,0 +1,180 @@
+/*
+ * InvokerClient interface defns for use with gcc.
+ *
+ * Note: These declarations mirror those of the InvokeBridge
+ * class declaration.
+ *
+ */
+
+#include <windows.h>
+#include <wtypes.h>
+#include <oaidl.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifndef STDCALL
+#define STDCALL __stdcall
+#endif
+
+extern const CLSID CLSID_InvokeBridge;
+extern const IID IID_IUnknown;
+extern const IID IID_NULL;
+extern const IID IID_InvokeBridge;
+
+#ifdef WANT_UUID_DECLS
+const CLSID CLSID_InvokeBridge = { 0x39D497D9,0x60E0,0x3525,{0xB7,0xF2,0x7B,0xC0,0x96,0xD3,0xA2,0xA3}};
+//const IID IID_NULL = {0x00000000L, 0x0000, 0x0000, {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}};
+//const IID IID_IUnknown = {0x00000000L, 0x0000, 0x0000, {0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46}};
+const IID IID_InvokeBridge = { 0xAFF5FFCA, 0xC5C2, 0x3D5B, {0xAF, 0xD5, 0xED, 0x8E, 0x4B, 0x38, 0xDB, 0x7B}};
+ //0x3A85D703, 0xFAE4,0x3C5E, {0x9F,0x7E,0x20,0x98,0x31,0xCD,0x61,0x7A}};
+#endif
+
+#ifndef __InvokeBridge_INTERFACE_DEFINED__
+#define __InvokeBridge_INTERFACE_DEFINED__
+#undef INTERFACE
+#define INTERFACE InvokeBridge
+/*
+ * C-side mirror of the InvokeBridge COM dual interface: the first
+ * seven entries are the IUnknown/IDispatch slots, followed by the
+ * auto-dual methods in vtable order. The trailing members are spelt
+ * as raw function pointers (equivalent layout in C).
+ */
+DECLARE_INTERFACE(InvokeBridge)
+{
+ STDMETHOD(QueryInterface)(THIS_ REFIID,PVOID*) PURE;
+ STDMETHOD_(ULONG,AddRef)(THIS) PURE;
+ STDMETHOD_(ULONG,Release)(THIS) PURE;
+ STDMETHOD(GetTypeInfoCount)(THIS_ UINT*) PURE;
+ STDMETHOD(GetTypeInfo)(THIS_ UINT,LCID,LPTYPEINFO*) PURE;
+ STDMETHOD(GetIDsOfNames)(THIS_ REFIID,LPOLESTR*,UINT,LCID,DISPID*) PURE;
+ STDMETHOD(Invoke)(THIS_ DISPID,REFIID,LCID,WORD,DISPPARAMS*,VARIANT*,EXCEPINFO*,UINT*) PURE;
+
+ STDMETHOD(ToString)(THIS_ BSTR*) PURE;
+ STDMETHOD(Equals)(THIS_ BSTR*) PURE;
+ STDMETHOD(GetHashCode)(THIS_ long*) PURE;
+ /* BUG FIX: was missing PURE, unlike every sibling declaration --
+ * that left C++ consumers with an undefined virtual. */
+ STDMETHOD(GetType)(THIS_ IUnknown**) PURE;
+ STDMETHOD(CreateObject)(THIS_ BSTR,BSTR,SAFEARRAY*, VARIANT*) PURE;
+ STDMETHOD(InvokeMethod)(THIS_ VARIANT,BSTR,SAFEARRAY*,VARIANT*) PURE;
+ STDMETHOD(InvokeStaticMethod)(THIS_ BSTR,BSTR,SAFEARRAY*,VARIANT*) PURE;
+
+ HRESULT ( STDCALL *GetField )(
+ InvokeBridge * This,
+ /* [in] */ VARIANT obj,
+ /* [in] */ BSTR fieldSpec,
+ /* [retval][out] */ VARIANT *pRetVal);
+
+ HRESULT ( STDCALL *GetStaticField )(
+ InvokeBridge * This,
+ /* [in] */ BSTR clsName,
+ /* [in] */ BSTR fieldSpec,
+ /* [retval][out] */ VARIANT *pRetVal);
+
+ HRESULT ( STDCALL *SetField )(
+ InvokeBridge * This,
+ /* [in] */ VARIANT obj,
+ /* [in] */ BSTR fieldSpec,
+ /* [in] */ VARIANT val);
+
+ HRESULT ( STDCALL *SetStaticField )(
+ InvokeBridge * This,
+ /* [in] */ BSTR clsName,
+ /* [in] */ BSTR fieldSpec,
+ /* [in] */ VARIANT val);
+
+ HRESULT ( STDCALL *NewString )(
+ InvokeBridge * This,
+ /* [in] */ BSTR s,
+ /* [retval][out] */VARIANT* pRetVal);
+
+ HRESULT ( STDCALL *NewArgArray )(
+ InvokeBridge * This,
+ /* [in] */ long sz,
+ /* [retval][out] */IUnknown **pRetVal);
+
+ HRESULT ( STDCALL *SetArg )(
+ InvokeBridge * This,
+ /* [in] */ SAFEARRAY * arr,
+ /* [in] */ VARIANT val,
+ /* [in] */ long idx);
+
+ HRESULT ( STDCALL *GetArg )(
+ InvokeBridge * This,
+ /* [in] */ SAFEARRAY * arr,
+ /* [in] */ long idx,
+ /* [retval][out] */ VARIANT *pRetVal);
+
+ HRESULT ( STDCALL *GetType_2 )(
+ InvokeBridge * This,
+ /* [in] */ BSTR typeName,
+ /* [retval][out] */ IUnknown **pRetVal);
+};
+#endif
+
+/* Convenience wrappers: call through the vtable, C COM style. */
+#define InvokeBridge_QueryInterface(This,riid,ppvObject) \
+ (This)->lpVtbl->QueryInterface(This,riid,ppvObject)
+
+#define InvokeBridge_AddRef(This) \
+ (This)->lpVtbl->AddRef(This)
+
+#define InvokeBridge_Release(This) \
+ (This)->lpVtbl->Release(This)
+
+#define InvokeBridge_GetTypeInfoCount(This,pctinfo) \
+ (This)->lpVtbl->GetTypeInfoCount(This,pctinfo)
+
+#define InvokeBridge_GetTypeInfo(This,iTInfo,lcid,ppTInfo) \
+ (This)->lpVtbl->GetTypeInfo(This,iTInfo,lcid,ppTInfo)
+
+#define InvokeBridge_GetIDsOfNames(This,riid,rgszNames,cNames,lcid,rgDispId) \
+ (This)->lpVtbl->GetIDsOfNames(This,riid,rgszNames,cNames,lcid,rgDispId)
+
+#define InvokeBridge_Invoke(This,dispIdMember,riid,lcid,wFlags,pDispParams,pVarResult,pExcepInfo,puArgErr) \
+ (This)->lpVtbl->Invoke(This,dispIdMember,riid,lcid,wFlags,pDispParams,pVarResult,pExcepInfo,puArgErr)
+
+/* NOTE(review): this dereferences lpVtbl->get_ToString, but the
+ * interface above declares the slot as ToString -- confirm against
+ * the tlbexp-generated type library before using this macro. */
+#define InvokeBridge_get_ToString(This,pRetVal) \
+ (This)->lpVtbl->get_ToString(This,pRetVal)
+
+#define InvokeBridge_Equals(This,obj,pRetVal) \
+ (This)->lpVtbl->Equals(This,obj,pRetVal)
+
+#define InvokeBridge_GetHashCode(This,pRetVal) \
+ (This)->lpVtbl->GetHashCode(This,pRetVal)
+
+#define InvokeBridge_GetType(This,pRetVal) \
+ (This)->lpVtbl->GetType(This,pRetVal)
+
+#define InvokeBridge_CreateObject(This,assemName,objSpec,args,pRetVal) \
+ (This)->lpVtbl->CreateObject(This,assemName,objSpec,args,pRetVal)
+
+#define InvokeBridge_InvokeMethod(This,obj,methSpec,args,pRetVal) \
+ (This)->lpVtbl->InvokeMethod(This,obj,methSpec,args,pRetVal)
+
+#define InvokeBridge_InvokeStaticMethod(This,assemName,methSpec,args,pRetVal) \
+ (This)->lpVtbl->InvokeStaticMethod(This,assemName,methSpec,args,pRetVal)
+
+#define InvokeBridge_GetField(This,obj,fieldSpec,pRetVal) \
+ (This)->lpVtbl->GetField(This,obj,fieldSpec,pRetVal)
+
+#define InvokeBridge_GetStaticField(This,clsName,fieldSpec,pRetVal) \
+ (This)->lpVtbl->GetStaticField(This,clsName,fieldSpec,pRetVal)
+
+#define InvokeBridge_SetField(This,obj,fieldSpec,val) \
+ (This)->lpVtbl->SetField(This,obj,fieldSpec,val)
+
+#define InvokeBridge_SetStaticField(This,clsName,fieldSpec,val) \
+ (This)->lpVtbl->SetStaticField(This,clsName,fieldSpec,val)
+
+#define InvokeBridge_NewString(This,s,pRetVal) \
+ (This)->lpVtbl->NewString(This,s,pRetVal)
+
+#define InvokeBridge_NewArgArray(This,sz,pRetVal) \
+ (This)->lpVtbl->NewArgArray(This,sz,pRetVal)
+
+#define InvokeBridge_SetArg(This,arr,val,idx) \
+ (This)->lpVtbl->SetArg(This,arr,val,idx)
+
+#define InvokeBridge_GetArg(This,arr,idx,pRetVal) \
+ (This)->lpVtbl->GetArg(This,arr,idx,pRetVal)
+
+/* GetType_2: the BSTR-taking overload of GetType (renamed by tlbexp). */
+#define InvokeBridge_GetType_2(This,typeName,pRetVal) \
+ (This)->lpVtbl->GetType_2(This,typeName,pRetVal)
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/rts/dotnet/Makefile b/rts/dotnet/Makefile
new file mode 100644
index 0000000000..95b6c38890
--- /dev/null
+++ b/rts/dotnet/Makefile
@@ -0,0 +1,53 @@
+#
+# .NET interop for GHC.
+#
+# (c) 2003, sof.
+#
+TOP=../..
+include $(TOP)/mk/boilerplate.mk
+
+all :: Invoker.dll Invoke.o
+
+#
+# To compile the dotnet interop bits, you need to have the
+# .NET Framework SDK or VS.NET installed. The following
+# apps are used:
+#
+MCPP=cl
+TLBEXP=tlbexp
+REGASM=regasm
+GACUTIL=gacutil
+
+# Build the managed bridge, export its COM type library, register it
+# for COM interop, then install the assembly into the GAC.
+Invoker.dll : Invoker.obj
+ $(MCPP) /LD /clr /o Invoker.dll Invoker.obj
+ $(TLBEXP) Invoker.dll
+ $(REGASM) Invoker.dll
+ $(GACUTIL) /i Invoker.dll
+
+Invoker.obj : Invoker.cpp Invoker.h
+ $(MCPP) /LD /clr /c Invoker.cpp
+
+CLEAN_FILES += $(wildcard *.obj *.dll *.tlb)
+
+# ToDo:
+# - switch to /ir (i.e., copy it into the GAC.)
+# - sort out installation story.
+
+# drop the assembly
+remove :
+ $(GACUTIL) /u Invoker
+
+#
+# NOTE: For DotnetCc a version of gcc later than gcc-2.95 is
+# required (I'm using the gcc-3.2 snapshot that comes with mingw-2)
+#
+ifeq "$(DotnetCc)" ""
+DotnetCc=$(CC)
+endif
+DotnetCcOpts=$(CC_OPTS) $(DOTNET_EXTRA_CC_OPTS)
+SRC_CC_OPTS += -I$(TOP)/includes
+
+# Unmanaged client side, built with gcc/mingw rather than cl.
+Invoke.o : Invoke.c
+ $(DotnetCc) $(DotnetCcOpts) -c $< -o $@
+
+include $(TOP)/mk/target.mk
diff --git a/rts/dotnet/invoker.snk b/rts/dotnet/invoker.snk
new file mode 100644
index 0000000000..05a222178a
--- /dev/null
+++ b/rts/dotnet/invoker.snk
Binary files differ
diff --git a/rts/ghc-frontpanel.glade b/rts/ghc-frontpanel.glade
new file mode 100644
index 0000000000..9b73afce47
--- /dev/null
+++ b/rts/ghc-frontpanel.glade
@@ -0,0 +1,1622 @@
+<?xml version="1.0" standalone="no"?> <!--*- mode: xml -*-->
+<!DOCTYPE glade-interface SYSTEM "http://glade.gnome.org/glade-2.0.dtd">
+
+<glade-interface>
+
+<widget class="GtkWindow" id="GHC Front Panel">
+ <property name="visible">True</property>
+ <property name="title" translatable="yes">GHC Front Panel</property>
+ <property name="type">GTK_WINDOW_TOPLEVEL</property>
+ <property name="window_position">GTK_WIN_POS_NONE</property>
+ <property name="modal">False</property>
+ <property name="default_width">450</property>
+ <property name="default_height">600</property>
+ <property name="resizable">True</property>
+ <property name="destroy_with_parent">False</property>
+ <property name="decorated">True</property>
+ <property name="skip_taskbar_hint">False</property>
+ <property name="skip_pager_hint">False</property>
+ <property name="type_hint">GDK_WINDOW_TYPE_HINT_NORMAL</property>
+ <property name="gravity">GDK_GRAVITY_NORTH_WEST</property>
+ <property name="focus_on_map">True</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox1">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <widget class="GtkHBox" id="hbox1">
+ <property name="border_width">10</property>
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">10</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox4">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">10</property>
+
+ <child>
+ <widget class="GtkFrame" id="frame3">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkHBox" id="hbox3">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <widget class="GtkLabel" id="label40">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Mb</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkVRuler" id="map_ruler">
+ <property name="visible">True</property>
+ <property name="metric">GTK_PIXELS</property>
+ <property name="lower">0</property>
+ <property name="upper">10</property>
+ <property name="position">1.40845072269</property>
+ <property name="max_size">10</property>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkDrawingArea" id="memmap">
+ <property name="visible">True</property>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label1">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Memory Map</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkFrame" id="frame8">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox14">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <widget class="GtkTable" id="table4">
+ <property name="visible">True</property>
+ <property name="n_rows">2</property>
+ <property name="n_columns">3</property>
+ <property name="homogeneous">False</property>
+ <property name="row_spacing">0</property>
+ <property name="column_spacing">0</property>
+
+ <child>
+ <widget class="GtkVRuler" id="gen_ruler">
+ <property name="visible">True</property>
+ <property name="metric">GTK_PIXELS</property>
+ <property name="lower">0</property>
+ <property name="upper">10</property>
+ <property name="position">1.69934999943</property>
+ <property name="max_size">10</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkHBox" id="gen_hbox">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <placeholder/>
+ </child>
+
+ <child>
+ <placeholder/>
+ </child>
+
+ <child>
+ <placeholder/>
+ </child>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="y_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkDrawingArea" id="generations">
+ <property name="visible">True</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ <property name="y_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label39">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Mb</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label41">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Generations</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkFrame" id="frame7">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkTable" id="table3">
+ <property name="border_width">2</property>
+ <property name="visible">True</property>
+ <property name="n_rows">3</property>
+ <property name="n_columns">3</property>
+ <property name="homogeneous">False</property>
+ <property name="row_spacing">0</property>
+ <property name="column_spacing">0</property>
+
+ <child>
+ <widget class="GtkHRuler" id="res_hruler">
+ <property name="visible">True</property>
+ <property name="metric">GTK_PIXELS</property>
+ <property name="lower">0</property>
+ <property name="upper">10</property>
+ <property name="position">8.35443019867</property>
+ <property name="max_size">10</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="y_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkVRuler" id="res_vruler">
+ <property name="visible">True</property>
+ <property name="metric">GTK_PIXELS</property>
+ <property name="lower">0</property>
+ <property name="upper">10</property>
+ <property name="position">9.69925022125</property>
+ <property name="max_size">10</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkDrawingArea" id="res_drawingarea">
+ <property name="visible">True</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options">fill</property>
+ <property name="y_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label37">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Secs</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label38">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Mb</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label42">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Residency</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkVBox" id="vbox5">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">10</property>
+
+ <child>
+ <widget class="GtkFrame" id="frame5">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox6">
+ <property name="border_width">5</property>
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <placeholder/>
+ </child>
+
+ <child>
+ <placeholder/>
+ </child>
+
+ <child>
+ <widget class="GtkTable" id="table1">
+ <property name="visible">True</property>
+ <property name="n_rows">4</property>
+ <property name="n_columns">3</property>
+ <property name="homogeneous">False</property>
+ <property name="row_spacing">0</property>
+ <property name="column_spacing">7</property>
+
+ <child>
+ <widget class="GtkLabel" id="label12">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Allocated</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_RIGHT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label13">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Live</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_RIGHT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label14">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Allocation Rate</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_RIGHT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">3</property>
+ <property name="bottom_attach">4</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label15">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes"> Footprint</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_RIGHT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label16">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">M/sec</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">3</property>
+ <property name="bottom_attach">4</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label17">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">M</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">7.45058015283e-09</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label18">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">M</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">7.45058015283e-09</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label19">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">M</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">7.45058015283e-09</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="live_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes"></property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="allocated_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes"></property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="footprint_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes"></property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="alloc_rate_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes"></property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">3</property>
+ <property name="bottom_attach">4</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label43">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Stats</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkFrame" id="frame9">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkTable" id="table5">
+ <property name="border_width">6</property>
+ <property name="visible">True</property>
+ <property name="n_rows">9</property>
+ <property name="n_columns">2</property>
+ <property name="homogeneous">False</property>
+ <property name="row_spacing">0</property>
+ <property name="column_spacing">10</property>
+
+ <child>
+ <widget class="GtkLabel" id="label20">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Running</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label21">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Blocked on I/O (Read)</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label22">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Blocked on MVar</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">3</property>
+ <property name="bottom_attach">4</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label24">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Blocked on throwTo</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">4</property>
+ <property name="bottom_attach">5</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label26">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Blocked on Black Hole</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">5</property>
+ <property name="bottom_attach">6</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label25">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Sleeping</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">6</property>
+ <property name="bottom_attach">7</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label27">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Blocked on I/O (Write)</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="running_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label28</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="blockread_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label29</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="blockwrite_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label30</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="blockmvar_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label31</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">3</property>
+ <property name="bottom_attach">4</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="blockthrowto_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label32</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">4</property>
+ <property name="bottom_attach">5</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="blockbh_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label33</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">5</property>
+ <property name="bottom_attach">6</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="sleeping_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label34</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">6</property>
+ <property name="bottom_attach">7</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkHSeparator" id="hseparator1">
+ <property name="visible">True</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">7</property>
+ <property name="bottom_attach">8</property>
+ <property name="x_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkHSeparator" id="hseparator2">
+ <property name="visible">True</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">7</property>
+ <property name="bottom_attach">8</property>
+ <property name="x_options">fill</property>
+ <property name="y_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label35">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Total</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">8</property>
+ <property name="bottom_attach">9</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="total_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label36</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">8</property>
+ <property name="bottom_attach">9</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label44">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Threads</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkFrame" id="frame6">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox7">
+ <property name="border_width">5</property>
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">10</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox9">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <widget class="GtkRadioButton" id="cont_radio">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Continuous</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <property name="active">True</property>
+ <property name="inconsistent">False</property>
+ <property name="draw_indicator">True</property>
+ <signal name="clicked" handler="on_cont_radio_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkRadioButton" id="stop_before_radio">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Stop before GC</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <property name="active">False</property>
+ <property name="inconsistent">False</property>
+ <property name="draw_indicator">True</property>
+ <property name="group">cont_radio</property>
+ <signal name="clicked" handler="on_stop_before_radio_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkRadioButton" id="stop_after_radio">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Stop after GC</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <property name="active">False</property>
+ <property name="inconsistent">False</property>
+ <property name="draw_indicator">True</property>
+ <property name="group">cont_radio</property>
+ <signal name="clicked" handler="on_stop_after_radio_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkRadioButton" id="stop_both_radio">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Stop before &amp; after GC</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <property name="active">False</property>
+ <property name="inconsistent">False</property>
+ <property name="draw_indicator">True</property>
+ <property name="group">cont_radio</property>
+ <signal name="clicked" handler="on_stop_both_radio_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkVBox" id="vbox8">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <widget class="GtkButton" id="stop_but">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Stop</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <signal name="clicked" handler="on_stop_but_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkButton" id="continue_but">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Continue</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <signal name="clicked" handler="on_continue_but_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label45">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Updates</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkButton" id="quit_but">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Quit</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <signal name="clicked" handler="on_quit_but_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ <property name="pack_type">GTK_PACK_END</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ <property name="pack_type">GTK_PACK_END</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkStatusbar" id="statusbar">
+ <property name="visible">True</property>
+ <property name="has_resize_grip">True</property>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+</widget>
+
+</glade-interface>
diff --git a/rts/gmp/.gdbinit b/rts/gmp/.gdbinit
new file mode 100644
index 0000000000..843c109e89
--- /dev/null
+++ b/rts/gmp/.gdbinit
@@ -0,0 +1,34 @@
+# Copyright (C) 1999 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+define pz
+set __gmpz_dump ($)
+end
+
+define pq
+set __gmpz_dump ($->_mp_num)
+echo /
+set __gmpz_dump ($->_mp_den)
+end
+
+define pf
+set __gmpf_dump ($)
+end
+
diff --git a/rts/gmp/AUTHORS b/rts/gmp/AUTHORS
new file mode 100644
index 0000000000..1fa057af6c
--- /dev/null
+++ b/rts/gmp/AUTHORS
@@ -0,0 +1,12 @@
+Authors of GNU MP (in chronological order)
+Torbjörn Granlund
+John Amanatides
+Paul Zimmermann
+Ken Weber
+Bennet Yee
+Andreas Schwab
+Robert Harley
+Linus Nordberg
+Kent Boortz
+Kevin Ryde
+Guillaume Hanrot
diff --git a/rts/gmp/COPYING b/rts/gmp/COPYING
new file mode 100644
index 0000000000..a6d7d0188a
--- /dev/null
+++ b/rts/gmp/COPYING
@@ -0,0 +1,336 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/rts/gmp/COPYING.LIB b/rts/gmp/COPYING.LIB
new file mode 100644
index 0000000000..c4792dd27a
--- /dev/null
+++ b/rts/gmp/COPYING.LIB
@@ -0,0 +1,515 @@
+
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL. It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+ This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it. You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+ When we speak of free software, we are referring to freedom of use,
+not price. Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+ To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+ For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you. You must make sure that they, too, receive or can get the source
+code. If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it. And you must show them these terms so they know their rights.
+
+ We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+ To protect each distributor, we want to make it very clear that
+there is no warranty for the free library. Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+^L
+ Finally, software patents pose a constant threat to the existence of
+any free program. We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder. Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+ Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License. This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License. We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+ When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library. The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom. The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+ We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License. It also provides other free software developers Less
+of an advantage over competing non-free programs. These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries. However, the Lesser license provides advantages in certain
+special circumstances.
+
+ For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes
+a de-facto standard. To achieve this, non-free programs must be
+allowed to use the library. A more frequent case is that a free
+library does the same job as widely used non-free libraries. In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+ In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software. For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+ Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+ The precise terms and conditions for copying, distribution and
+modification follow. Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library". The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+^L
+ GNU LESSER GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+ A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+ The "Library", below, refers to any such software library or work
+which has been distributed under these terms. A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language. (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+ "Source code" for a work means the preferred form of the work for
+making modifications to it. For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation
+and installation of the library.
+
+ Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it). Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+ 1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+ You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+ 2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) The modified work must itself be a software library.
+
+ b) You must cause the files modified to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ c) You must cause the whole of the work to be licensed at no
+ charge to all third parties under the terms of this License.
+
+ d) If a facility in the modified Library refers to a function or a
+ table of data to be supplied by an application program that uses
+ the facility, other than as an argument passed when the facility
+ is invoked, then you must make a good faith effort to ensure that,
+ in the event an application does not supply such function or
+ table, the facility still operates, and performs whatever part of
+ its purpose remains meaningful.
+
+ (For example, a function in a library to compute square roots has
+ a purpose that is entirely well-defined independent of the
+ application. Therefore, Subsection 2d requires that any
+ application-supplied function or table used by this function must
+ be optional: if the application does not supply it, the square
+ root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library. To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License. (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.) Do not make any other change in
+these notices.
+^L
+ Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+ This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+ 4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+ If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library". Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+ However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library". The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+ When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library. The
+threshold for this to be true is not precisely defined by law.
+
+ If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work. (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+ Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+^L
+ 6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+ You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License. You must supply a copy of this License. If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License. Also, you must do one
+of these things:
+
+ a) Accompany the work with the complete corresponding
+ machine-readable source code for the Library including whatever
+ changes were used in the work (which must be distributed under
+ Sections 1 and 2 above); and, if the work is an executable linked
+ with the Library, with the complete machine-readable "work that
+ uses the Library", as object code and/or source code, so that the
+ user can modify the Library and then relink to produce a modified
+ executable containing the modified Library. (It is understood
+ that the user who changes the contents of definitions files in the
+ Library will not necessarily be able to recompile the application
+ to use the modified definitions.)
+
+ b) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (1) uses at run time a
+ copy of the library already present on the user's computer system,
+ rather than copying library functions into the executable, and (2)
+ will operate properly with a modified version of the library, if
+ the user installs one, as long as the modified version is
+ interface-compatible with the version that the work was made with.
+
+ c) Accompany the work with a written offer, valid for at
+ least three years, to give the same user the materials
+ specified in Subsection 6a, above, for a charge no more
+ than the cost of performing this distribution.
+
+ d) If distribution of the work is made by offering access to copy
+ from a designated place, offer equivalent access to copy the above
+ specified materials from the same place.
+
+ e) Verify that the user has already received a copy of these
+ materials or that you have already sent this user a copy.
+
+ For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it. However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+ It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system. Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+^L
+ 7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+ a) Accompany the combined library with a copy of the same work
+ based on the Library, uncombined with any other library
+ facilities. This must be distributed under the terms of the
+ Sections above.
+
+ b) Give prominent notice with the combined library of the fact
+ that part of it is a work based on the Library, and explaining
+ where to find the accompanying uncombined form of the same work.
+
+ 8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License. Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License. However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+ 9. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Library or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+ 10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+^L
+ 11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all. For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+^L
+ 14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission. For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this. Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+ NO WARRANTY
+
+ 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+^L
+ How to Apply These Terms to Your New Libraries
+
+ If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change. You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+ To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+ <one line to give the library's name and a brief idea of what it
+does.>
+ Copyright (C) <year> <name of author>
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+Also add information on how to contact you by electronic and paper
+mail.
+
+You should also get your employer (if you work as a programmer) or
+your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the
+ library `Frob' (a library for tweaking knobs) written by James
+Random Hacker.
+
+ <signature of Ty Coon>, 1 April 1990
+ Ty Coon, President of Vice
+
+That's all there is to it!
+
+
diff --git a/rts/gmp/INSTALL b/rts/gmp/INSTALL
new file mode 100644
index 0000000000..62faa1a2e3
--- /dev/null
+++ b/rts/gmp/INSTALL
@@ -0,0 +1,146 @@
+
+ INSTALLING GNU MP
+ =================
+
+
+These instructions are only for the impatient. Others should read the install
+instructions in the manual, gmp.info. Use
+
+ info -f ./gmp.info
+
+or in emacs
+
+ C-u C-h i gmp.info
+
+
+Here are some brief instructions on how to install GMP, and some examples to
+help you get started using GMP.
+
+First, you need to compile, and optionally install, GMP. Since you're
+impatient, try this:
+
+ ./configure; make
+
+If that fails, or you care about the performance of GMP, you need to read the
+full instructions in the chapter "Installing GMP", in the manual.
+
+Next, try some small test programs, for example the ones below.
+
+In GMP programs, all variables need to be initialized before they are
+assigned, and cleared out before program flow leaves the scope in which they
+were declared. Here is an example program that reads two numbers from the
+command line, multiplies them, and prints the result to stdout.
+
+
+ #include <stdio.h>
+ #include <gmp.h> /* All GMP programs need to include gmp.h */
+
+ main (int argc, char **argv)
+ {
+ mpz_t a, b, p;
+
+ if (argc != 3)
+ { printf ("Usage: %s <number> <number>\n", argv[0]); exit (1); }
+
+ /* Initialize variables */
+ mpz_init (a);
+ mpz_init (b);
+ mpz_init (p);
+
+ /* Assign a and b from base 10 strings in argv */
+ mpz_set_str (a, argv[1], 10);
+ mpz_set_str (b, argv[2], 10);
+
+ /* Multiply a and b and put the result in p */
+ mpz_mul (p, a, b);
+
+ /* Print p in base 10 */
+ mpz_out_str (stdout, 10, p);
+ fputc ('\n', stdout);
+
+ /* Clear out variables */
+ mpz_clear (a);
+ mpz_clear (b);
+ mpz_clear (p);
+ exit (0);
+ }
+
+
+This might look tedious, with all the initializing and clearing. Fortunately
+some of these operations can be combined, and other operations can often be
+avoided. An experienced GMP user might write:
+
+
+ #include <stdio.h>
+ #include <gmp.h>
+
+ main (int argc, char **argv)
+ {
+ mpz_t a, b, p;
+
+ if (argc != 3)
+ { printf ("Usage: %s <number> <number>\n", argv[0]); exit (1); }
+
+ /* Initialize and assign a and b from base 10 strings in argv */
+ mpz_init_set_str (a, argv[1], 10);
+ mpz_init_set_str (b, argv[2], 10);
+ /* Initialize p */
+ mpz_init (p);
+
+ /* Multiply a and b and put the result in p */
+ mpz_mul (p, a, b);
+
+ /* Print p in base 10 */
+ mpz_out_str (stdout, 10, p);
+ fputc ('\n', stdout);
+
+ /* Since we're about to exit, no need to clear out variables */
+ exit (0);
+ }
+
+
+Now you have to compile your test program, and link it with the GMP library.
+Assuming your working directory is still the gmp source directory, and your
+source file is called example.c, enter:
+
+ gcc -g -I. example.c .libs/libgmp.a
+
+After installing, the command becomes: "gcc -g example.c -lgmp". Also, GMP is
+libtool based so you can use that to link if you want.
+
+Now try to run the example:
+
+ ./a.out 98365871231256752134 319378318340103345227
+ 31415926535897932384618573336104570964418
+
+The functions used here all operate on signed integers, and have names
+starting with "mpz_". There are many more such functions than used in these
+examples. See the chapter "Integer Functions" in the manual, for a complete
+list.
+
+There are two other main classes of functions in GMP. They operate on
+rational numbers and floating-point numbers, respectively. The chapters
+"Rational Number Functions", and "Floating-point Functions" document these
+classes.
+
+To run a set of tests, do "make check". This will take a while.
+
+To create the printable documentation from the texinfo source, type "make
+gmp.dvi" or "make gmp.ps". This requires various "tex" commands.
+
+To install the library, do "make install" (then you can use -lgmp instead of
+.libs/libgmp.a).
+
+If you decide to use GMP, it is a good idea you at least read the chapter "GMP
+Basics" in the manual.
+
+Some known build problems are noted in the "Installing GMP" chapter of
+the manual. Please report other problems to bug-gmp@gnu.org.
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 78
+End:
diff --git a/rts/gmp/Makefile.am b/rts/gmp/Makefile.am
new file mode 100644
index 0000000000..b73b805c6e
--- /dev/null
+++ b/rts/gmp/Makefile.am
@@ -0,0 +1,197 @@
+## Process this file with automake to generate Makefile.in
+
+
+# Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software
+# Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# make check
+#
+# It'd be good if "make check" first did a "make all" or whatever to
+# build libgmp.la, but it's not clear how best to do this. Putting a
+# "check:" target is overridden by automake, and a "check-local:" runs
+# too late (due to depth-first subdirectory processing). For now it's
+# necessary to do "make && make check".
+#
+# MPF_OBJECTS etc
+#
+# Libtool needs all the .lo files passed to it if it's going to build
+# both a static and shared library. If a convenience library like
+# mpf/libmpf.la is passed then the resulting libgmp.a gets the PIC .lo
+# objects rather than the non-PIC .o's.
+#
+# Unfortunately this leads to the big lists of objects below. Something
+# like mpz/*.lo would probably work, but might risk missing something
+# out or getting something extra. The source files for each .lo are
+# listed in the Makefile.am's in the subdirectories.
+
+
+# Libtool -version-info for libgmp.la and libmp.la. See (libtool)Versioning
+#
+# 1. No interfaces changed, only implementations (good): Increment REVISION.
+#
+# 2. Interfaces added, none removed (good): Increment CURRENT, increment
+# AGE, set REVISION to 0.
+#
+# 3. Interfaces removed (BAD, breaks upward compatibility): Increment
+# CURRENT, set AGE and REVISION to 0.
+#
+# Do this separately for libgmp and libmp, only do it just before a release.
+#
+# GMP -version-info
+# release libgmp libmp
+# 2.0.x - -
+# 3.0 3:0:0 3:0:0
+# 3.0.1 3:1:0 3:0:0
+# 3.1 4:0:1 4:0:1
+# 3.1.1 4:1:1 4:1:1
+#
+#
+# Starting at 3:0:0 is a slight abuse of the versioning system, but it
+# ensures we're past soname libgmp.so.2, which is what has been used on
+# Debian GNU/Linux packages of gmp 2. Pretend gmp 2 was 2:0:0, so the
+# interface changes for gmp 3 mean 3:0:0 is right.
+
+LIBGMP_LT_CURRENT = 4
+LIBGMP_LT_REVISION = 1
+LIBGMP_LT_AGE = 1
+
+LIBMP_LT_CURRENT = 4
+LIBMP_LT_REVISION = 1
+LIBMP_LT_AGE = 1
+
+
+AUTOMAKE_OPTIONS = gnu check-news no-dependencies ansi2knr
+
+SUBDIRS = mpn mpz mpq mpf mpbsd mpfr tests demos tune
+
+include_HEADERS = gmp.h $(MPBSD_HEADERS_OPTION) $(MPFR_HEADERS_OPTION)
+EXTRA_HEADERS = mp.h
+
+lib_LTLIBRARIES = libgmp.la $(MPBSD_LTLIBRARIES_OPTION)
+
+EXTRA_DIST = .gdbinit gmp-impl.h longlong.h stack-alloc.h urandom.h doc macos
+
+DISTCLEANFILES = asm-syntax.h config.m4 @gmp_srclinks@
+
+
+MPF_OBJECTS = mpf/init.lo mpf/init2.lo mpf/set.lo mpf/set_ui.lo mpf/set_si.lo \
+ mpf/set_str.lo mpf/set_d.lo mpf/set_z.lo mpf/iset.lo mpf/iset_ui.lo \
+ mpf/iset_si.lo mpf/iset_str.lo mpf/iset_d.lo mpf/clear.lo mpf/get_str.lo \
+ mpf/dump.lo mpf/size.lo mpf/eq.lo mpf/reldiff.lo mpf/sqrt.lo mpf/random2.lo \
+ mpf/inp_str.lo mpf/out_str.lo mpf/add.lo mpf/add_ui.lo mpf/sub.lo \
+ mpf/sub_ui.lo mpf/ui_sub.lo mpf/mul.lo mpf/mul_ui.lo mpf/div.lo \
+ mpf/div_ui.lo mpf/cmp.lo mpf/cmp_ui.lo mpf/cmp_si.lo mpf/mul_2exp.lo \
+ mpf/div_2exp.lo mpf/abs.lo mpf/neg.lo mpf/set_q.lo mpf/get_d.lo \
+ mpf/set_dfl_prec.lo mpf/set_prc.lo mpf/set_prc_raw.lo mpf/get_prc.lo \
+ mpf/ui_div.lo mpf/sqrt_ui.lo mpf/floor.lo mpf/ceil.lo mpf/trunc.lo \
+ mpf/pow_ui.lo mpf/urandomb.lo mpf/swap.lo
+MPZ_OBJECTS = mpz/abs.lo mpz/add.lo mpz/add_ui.lo mpz/addmul_ui.lo mpz/and.lo \
+ mpz/array_init.lo mpz/bin_ui.lo mpz/bin_uiui.lo mpz/cdiv_q.lo \
+ mpz/cdiv_q_ui.lo mpz/cdiv_qr.lo mpz/cdiv_qr_ui.lo mpz/cdiv_r.lo \
+ mpz/cdiv_r_ui.lo mpz/cdiv_ui.lo mpz/clear.lo mpz/clrbit.lo mpz/cmp.lo \
+ mpz/cmp_si.lo mpz/cmp_ui.lo mpz/cmpabs.lo mpz/cmpabs_ui.lo mpz/com.lo \
+ mpz/divexact.lo mpz/dump.lo mpz/fac_ui.lo mpz/fdiv_q.lo mpz/fdiv_q_2exp.lo \
+ mpz/fdiv_q_ui.lo mpz/fdiv_qr.lo mpz/fdiv_qr_ui.lo mpz/fdiv_r.lo \
+ mpz/fdiv_r_2exp.lo mpz/fdiv_r_ui.lo mpz/fdiv_ui.lo mpz/fib_ui.lo \
+ mpz/fits_sint_p.lo mpz/fits_slong_p.lo mpz/fits_sshort_p.lo \
+ mpz/fits_uint_p.lo mpz/fits_ulong_p.lo mpz/fits_ushort_p.lo mpz/gcd.lo \
+ mpz/gcd_ui.lo mpz/gcdext.lo mpz/get_d.lo mpz/get_si.lo mpz/get_str.lo \
+ mpz/get_ui.lo mpz/getlimbn.lo mpz/hamdist.lo mpz/init.lo mpz/inp_raw.lo \
+ mpz/inp_str.lo mpz/invert.lo mpz/ior.lo mpz/iset.lo mpz/iset_d.lo \
+ mpz/iset_si.lo mpz/iset_str.lo mpz/iset_ui.lo mpz/jacobi.lo \
+ mpz/kronsz.lo mpz/kronuz.lo mpz/kronzs.lo mpz/kronzu.lo \
+ mpz/lcm.lo mpz/legendre.lo \
+ mpz/mod.lo mpz/mul.lo mpz/mul_2exp.lo mpz/mul_si.lo mpz/mul_ui.lo \
+ mpz/neg.lo mpz/nextprime.lo mpz/out_raw.lo mpz/out_str.lo mpz/perfpow.lo mpz/perfsqr.lo \
+ mpz/popcount.lo mpz/pow_ui.lo mpz/powm.lo mpz/powm_ui.lo mpz/pprime_p.lo \
+ mpz/random.lo mpz/random2.lo mpz/realloc.lo mpz/remove.lo mpz/root.lo \
+ mpz/rrandomb.lo \
+ mpz/scan0.lo mpz/scan1.lo mpz/set.lo mpz/set_d.lo mpz/set_f.lo mpz/set_q.lo \
+ mpz/set_si.lo mpz/set_str.lo mpz/set_ui.lo mpz/setbit.lo mpz/size.lo \
+ mpz/sizeinbase.lo mpz/sqrt.lo mpz/sqrtrem.lo mpz/sub.lo mpz/sub_ui.lo \
+ mpz/swap.lo mpz/tdiv_ui.lo mpz/tdiv_q.lo mpz/tdiv_q_2exp.lo mpz/tdiv_q_ui.lo \
+ mpz/tdiv_qr.lo mpz/tdiv_qr_ui.lo mpz/tdiv_r.lo mpz/tdiv_r_2exp.lo \
+ mpz/tdiv_r_ui.lo mpz/tstbit.lo mpz/ui_pow_ui.lo mpz/urandomb.lo \
+ mpz/urandomm.lo mpz/xor.lo
+MPQ_OBJECTS = mpq/add.lo mpq/canonicalize.lo mpq/clear.lo mpq/cmp.lo \
+ mpq/cmp_ui.lo mpq/div.lo mpq/get_d.lo mpq/get_den.lo mpq/get_num.lo \
+ mpq/init.lo mpq/inv.lo mpq/mul.lo mpq/neg.lo mpq/out_str.lo \
+ mpq/set.lo mpq/set_den.lo \
+ mpq/set_num.lo mpq/set_si.lo mpq/set_ui.lo mpq/sub.lo mpq/equal.lo \
+ mpq/set_z.lo mpq/set_d.lo mpq/swap.lo
+MPN_OBJECTS = @mpn_objs_in_libgmp@
+
+MPBSD_OBJECTS = mpbsd/add.lo mpbsd/tdiv_qr.lo mpbsd/move.lo mpbsd/powm.lo \
+ mpbsd/sub.lo mpbsd/cmp.lo mpbsd/mfree.lo mpbsd/mtox.lo mpbsd/realloc.lo \
+ mpbsd/gcd.lo mpbsd/itom.lo mpbsd/min.lo mpbsd/mul.lo mpbsd/mout.lo \
+ mpbsd/pow_ui.lo mpbsd/sdiv.lo mpbsd/sqrtrem.lo mpbsd/xtom.lo
+
+# FIXME: Add mpfr/rnd_mode.lo when it's clean.
+MPFR_OBJECTS = mpfr/add.lo mpfr/div_2exp.lo mpfr/neg.lo mpfr/set_dfl_prec.lo \
+ mpfr/set_str_raw.lo mpfr/agm.lo mpfr/get_str.lo mpfr/print_raw.lo \
+ mpfr/set_dfl_rnd.lo mpfr/sqrt.lo mpfr/clear.lo mpfr/init.lo \
+ mpfr/set_f.lo mpfr/sub.lo mpfr/cmp.lo mpfr/mul.lo mpfr/round.lo \
+ mpfr/set_prec.lo mpfr/cmp_ui.lo mpfr/mul_2exp.lo mpfr/set.lo mpfr/set_si.lo \
+ mpfr/div.lo mpfr/mul_ui.lo mpfr/set_d.lo mpfr/pow.lo mpfr/out_str.lo \
+ mpfr/pi.lo mpfr/set_z.lo mpfr/add_ulp.lo mpfr/log2.lo mpfr/random.lo \
+ mpfr/log.lo mpfr/exp.lo mpfr/div_ui.lo mpfr/zeta.lo mpfr/karadiv.lo \
+ mpfr/karasqrt.lo mpfr/print_rnd_mode.lo
+
+
+if WANT_MPFR
+MPFR_HEADERS_OPTION = mpfr/mpfr.h
+MPFR_OBJECTS_OPTION = $(MPFR_OBJECTS)
+MPFR_LIBADD_OPTION = -lm
+endif
+libgmp_la_SOURCES = assert.c compat.c errno.c memory.c mp_set_fns.c \
+ mp_clz_tab.c mp_minv_tab.c \
+ rand.c randclr.c randlc.c randlc2x.c randraw.c randsd.c \
+ randsdui.c version.c stack-alloc.c mp_bpl.c extract-dbl.c insert-dbl.c
+libgmp_la_DEPENDENCIES = \
+ $(MPF_OBJECTS) $(MPZ_OBJECTS) $(MPN_OBJECTS) $(MPQ_OBJECTS) \
+ $(MPFR_OBJECTS_OPTION)
+libgmp_la_LIBADD = $(libgmp_la_DEPENDENCIES) $(MPFR_LIBADD_OPTION)
+libgmp_la_LDFLAGS = \
+ -version-info $(LIBGMP_LT_CURRENT):$(LIBGMP_LT_REVISION):$(LIBGMP_LT_AGE)
+
+
+if WANT_MPBSD
+MPBSD_HEADERS_OPTION = mp.h
+MPBSD_LTLIBRARIES_OPTION = libmp.la
+endif
+libmp_la_SOURCES = assert.c errno.c memory.c mp_bpl.c mp_clz_tab.c \
+ mp_minv_tab.c mp_set_fns.c stack-alloc.c
+libmp_la_DEPENDENCIES = $(MPBSD_OBJECTS) $(MPN_OBJECTS) \
+ mpz/add.lo mpz/clear.lo mpz/cmp.lo mpz/init.lo mpz/mod.lo mpz/mul.lo \
+ mpz/mul_2exp.lo mpz/realloc.lo mpz/set.lo mpz/set_ui.lo mpz/tdiv_r.lo \
+ mpz/sub.lo
+libmp_la_LIBADD = $(libmp_la_DEPENDENCIES)
+libmp_la_LDFLAGS = \
+ -version-info $(LIBMP_LT_CURRENT):$(LIBMP_LT_REVISION):$(LIBMP_LT_AGE)
+
+
+info_TEXINFOS = gmp.texi
+
+
+# Don't ship CVS directories or emacs backups.
+dist-hook:
+ -find $(distdir) \( -name CVS -type d \) -o -name "*.~*" \
+ | xargs rm -rf
diff --git a/rts/gmp/Makefile.in b/rts/gmp/Makefile.in
new file mode 100644
index 0000000000..e63383e7a7
--- /dev/null
+++ b/rts/gmp/Makefile.in
@@ -0,0 +1,932 @@
+# Makefile.in generated automatically by automake 1.4a from Makefile.am
+
+# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+SHELL = @SHELL@
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+
+bindir = @bindir@
+sbindir = @sbindir@
+libexecdir = @libexecdir@
+datadir = @datadir@
+sysconfdir = @sysconfdir@
+sharedstatedir = @sharedstatedir@
+localstatedir = @localstatedir@
+libdir = @libdir@
+infodir = @infodir@
+mandir = @mandir@
+includedir = @includedir@
+oldincludedir = /usr/include
+
+DESTDIR =
+
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+
+top_builddir = .
+
+ACLOCAL = @ACLOCAL@
+AUTOCONF = @AUTOCONF@
+AUTOMAKE = @AUTOMAKE@
+AUTOHEADER = @AUTOHEADER@
+
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_FLAG =
+transform = @program_transform_name@
+
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+
+@SET_MAKE@
+build_alias = @build_alias@
+build_triplet = @build@
+host_alias = @host_alias@
+host_triplet = @host@
+target_alias = @target_alias@
+target_triplet = @target@
+AMDEP = @AMDEP@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AWK = @AWK@
+CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@
+CC = @CC@
+CCAS = @CCAS@
+CPP = @CPP@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+EXEEXT = @EXEEXT@
+LIBTOOL = @LIBTOOL@
+LN_S = @LN_S@
+M4 = @M4@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+RANLIB = @RANLIB@
+SPEED_CYCLECOUNTER_OBJS = @SPEED_CYCLECOUNTER_OBJS@
+STRIP = @STRIP@
+U = @U@
+VERSION = @VERSION@
+gmp_srclinks = @gmp_srclinks@
+install_sh = @install_sh@
+mpn_objects = @mpn_objects@
+mpn_objs_in_libgmp = @mpn_objs_in_libgmp@
+
+# Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software
+# Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+# make check
+#
+# It'd be good if "make check" first did a "make all" or whatever to
+# build libgmp.la, but it's not clear how best to do this. Putting a
+# "check:" target is overridden by automake, and a "check-local:" runs
+# too late (due to depth-first subdirectory processing). For now it's
+# necessary to do "make && make check".
+#
+# MPF_OBJECTS etc
+#
+# Libtool needs all the .lo files passed to it if it's going to build
+# both a static and shared library. If a convenience library like
+# mpf/libmpf.la is passed then the resulting libgmp.a gets the PIC .lo
+# objects rather than the non-PIC .o's.
+#
+# Unfortunately this leads to the big lists of objects below. Something
+# like mpz/*.lo would probably work, but might risk missing something
+# out or getting something extra. The source files for each .lo are
+# listed in the Makefile.am's in the subdirectories.
+
+# Libtool -version-info for libgmp.la and libmp.la. See (libtool)Versioning
+#
+# 1. No interfaces changed, only implementations (good): Increment REVISION.
+#
+# 2. Interfaces added, none removed (good): Increment CURRENT, increment
+# AGE, set REVISION to 0.
+#
+# 3. Interfaces removed (BAD, breaks upward compatibility): Increment
+# CURRENT, set AGE and REVISION to 0.
+#
+# Do this separately for libgmp and libmp, only do it just before a release.
+#
+# GMP -version-info
+# release libgmp libmp
+# 2.0.x - -
+# 3.0 3:0:0 3:0:0
+# 3.0.1 3:1:0 3:0:0
+# 3.1 4:0:1 4:0:1
+# 3.1.1 4:1:1 4:1:1
+#
+#
+# Starting at 3:0:0 is a slight abuse of the versioning system, but it
+# ensures we're past soname libgmp.so.2, which is what has been used on
+# Debian GNU/Linux packages of gmp 2. Pretend gmp 2 was 2:0:0, so the
+# interface changes for gmp 3 mean 3:0:0 is right.
+
+
+LIBGMP_LT_CURRENT = 4
+LIBGMP_LT_REVISION = 1
+LIBGMP_LT_AGE = 1
+
+LIBMP_LT_CURRENT = 4
+LIBMP_LT_REVISION = 1
+LIBMP_LT_AGE = 1
+
+AUTOMAKE_OPTIONS = gnu check-news no-dependencies ansi2knr
+
+SUBDIRS = mpn mpz
+
+include_HEADERS = gmp.h $(MPBSD_HEADERS_OPTION) $(MPFR_HEADERS_OPTION)
+EXTRA_HEADERS = mp.h
+
+lib_LTLIBRARIES = libgmp.la $(MPBSD_LTLIBRARIES_OPTION)
+
+EXTRA_DIST = .gdbinit gmp-impl.h longlong.h stack-alloc.h urandom.h doc macos
+
+DISTCLEANFILES = asm-syntax.h config.m4 @gmp_srclinks@
+
+MPZ_OBJECTS = mpz/abs.lo mpz/add.lo mpz/add_ui.lo mpz/addmul_ui.lo mpz/and.lo \
+ mpz/array_init.lo mpz/bin_ui.lo mpz/bin_uiui.lo mpz/cdiv_q.lo \
+ mpz/cdiv_q_ui.lo mpz/cdiv_qr.lo mpz/cdiv_qr_ui.lo mpz/cdiv_r.lo \
+ mpz/cdiv_r_ui.lo mpz/cdiv_ui.lo mpz/clear.lo mpz/clrbit.lo mpz/cmp.lo \
+ mpz/cmp_si.lo mpz/cmp_ui.lo mpz/cmpabs.lo mpz/cmpabs_ui.lo mpz/com.lo \
+ mpz/divexact.lo mpz/dump.lo mpz/fac_ui.lo mpz/fdiv_q.lo mpz/fdiv_q_2exp.lo \
+ mpz/fdiv_q_ui.lo mpz/fdiv_qr.lo mpz/fdiv_qr_ui.lo mpz/fdiv_r.lo \
+ mpz/fdiv_r_2exp.lo mpz/fdiv_r_ui.lo mpz/fdiv_ui.lo mpz/fib_ui.lo \
+ mpz/fits_sint_p.lo mpz/fits_slong_p.lo mpz/fits_sshort_p.lo \
+ mpz/fits_uint_p.lo mpz/fits_ulong_p.lo mpz/fits_ushort_p.lo mpz/gcd.lo \
+ mpz/gcd_ui.lo mpz/gcdext.lo mpz/get_d.lo mpz/get_si.lo mpz/get_str.lo \
+ mpz/get_ui.lo mpz/getlimbn.lo mpz/hamdist.lo mpz/init.lo mpz/inp_raw.lo \
+ mpz/inp_str.lo mpz/invert.lo mpz/ior.lo mpz/iset.lo mpz/iset_d.lo \
+ mpz/iset_si.lo mpz/iset_str.lo mpz/iset_ui.lo mpz/jacobi.lo \
+ mpz/kronsz.lo mpz/kronuz.lo mpz/kronzs.lo mpz/kronzu.lo \
+ mpz/lcm.lo mpz/legendre.lo \
+ mpz/mod.lo mpz/mul.lo mpz/mul_2exp.lo mpz/mul_si.lo mpz/mul_ui.lo \
+ mpz/neg.lo mpz/nextprime.lo mpz/out_raw.lo mpz/out_str.lo mpz/perfpow.lo mpz/perfsqr.lo \
+ mpz/popcount.lo mpz/pow_ui.lo mpz/powm.lo mpz/powm_ui.lo mpz/pprime_p.lo \
+ mpz/random.lo mpz/random2.lo mpz/realloc.lo mpz/remove.lo mpz/root.lo \
+ mpz/rrandomb.lo \
+ mpz/scan0.lo mpz/scan1.lo mpz/set.lo mpz/set_d.lo mpz/set_f.lo mpz/set_q.lo \
+ mpz/set_si.lo mpz/set_str.lo mpz/set_ui.lo mpz/setbit.lo mpz/size.lo \
+ mpz/sizeinbase.lo mpz/sqrt.lo mpz/sqrtrem.lo mpz/sub.lo mpz/sub_ui.lo \
+ mpz/swap.lo mpz/tdiv_ui.lo mpz/tdiv_q.lo mpz/tdiv_q_2exp.lo mpz/tdiv_q_ui.lo \
+ mpz/tdiv_qr.lo mpz/tdiv_qr_ui.lo mpz/tdiv_r.lo mpz/tdiv_r_2exp.lo \
+ mpz/tdiv_r_ui.lo mpz/tstbit.lo mpz/ui_pow_ui.lo mpz/urandomb.lo \
+ mpz/urandomm.lo mpz/xor.lo
+
+MPN_OBJECTS = @mpn_objs_in_libgmp@
+
+MPBSD_OBJECTS = mpbsd/add.lo mpbsd/tdiv_qr.lo mpbsd/move.lo mpbsd/powm.lo \
+ mpbsd/sub.lo mpbsd/cmp.lo mpbsd/mfree.lo mpbsd/mtox.lo mpbsd/realloc.lo \
+ mpbsd/gcd.lo mpbsd/itom.lo mpbsd/min.lo mpbsd/mul.lo mpbsd/mout.lo \
+ mpbsd/pow_ui.lo mpbsd/sdiv.lo mpbsd/sqrtrem.lo mpbsd/xtom.lo
+
+
+
+@WANT_MPFR_TRUE@MPFR_HEADERS_OPTION = @WANT_MPFR_TRUE@mpfr/mpfr.h
+@WANT_MPFR_TRUE@MPFR_OBJECTS_OPTION = @WANT_MPFR_TRUE@$(MPFR_OBJECTS)
+@WANT_MPFR_TRUE@MPFR_LIBADD_OPTION = @WANT_MPFR_TRUE@-lm
+libgmp_la_SOURCES = assert.c compat.c errno.c memory.c mp_set_fns.c \
+ mp_clz_tab.c mp_minv_tab.c \
+ version.c stack-alloc.c mp_bpl.c extract-dbl.c insert-dbl.c
+
+libgmp_la_DEPENDENCIES = \
+ $(MPF_OBJECTS) $(MPZ_OBJECTS) $(MPN_OBJECTS) $(MPQ_OBJECTS) \
+ $(MPFR_OBJECTS_OPTION)
+
+libgmp_la_LIBADD = $(libgmp_la_DEPENDENCIES) $(MPFR_LIBADD_OPTION)
+libgmp_la_LDFLAGS = \
+ -version-info $(LIBGMP_LT_CURRENT):$(LIBGMP_LT_REVISION):$(LIBGMP_LT_AGE)
+
+
+@WANT_MPBSD_TRUE@MPBSD_HEADERS_OPTION = @WANT_MPBSD_TRUE@mp.h
+@WANT_MPBSD_TRUE@MPBSD_LTLIBRARIES_OPTION = @WANT_MPBSD_TRUE@libmp.la
+libmp_la_SOURCES = assert.c errno.c memory.c mp_bpl.c mp_clz_tab.c \
+ mp_minv_tab.c mp_set_fns.c stack-alloc.c
+
+libmp_la_DEPENDENCIES = $(MPBSD_OBJECTS) $(MPN_OBJECTS) \
+ mpz/add.lo mpz/clear.lo mpz/cmp.lo mpz/init.lo mpz/mod.lo mpz/mul.lo \
+ mpz/mul_2exp.lo mpz/realloc.lo mpz/set.lo mpz/set_ui.lo mpz/tdiv_r.lo \
+ mpz/sub.lo
+
+libmp_la_LIBADD = $(libmp_la_DEPENDENCIES)
+libmp_la_LDFLAGS = \
+ -version-info $(LIBMP_LT_CURRENT):$(LIBMP_LT_REVISION):$(LIBMP_LT_AGE)
+
+
+info_TEXINFOS = gmp.texi
+subdir = .
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
+CONFIG_HEADER = config.h
+CONFIG_CLEAN_FILES =
+LTLIBRARIES = $(lib_LTLIBRARIES)
+
+
+DEFS = @DEFS@ -I. -I$(srcdir) -I.
+CPPFLAGS = @CPPFLAGS@
+LDFLAGS = @LDFLAGS@
+LIBS = @LIBS@
+ANSI2KNR = @ANSI2KNR@
+am_libgmp_la_OBJECTS = assert$U.lo compat$U.lo errno$U.lo memory$U.lo \
+mp_set_fns$U.lo mp_clz_tab$U.lo mp_minv_tab$U.lo rand$U.lo randclr$U.lo \
+randlc$U.lo randlc2x$U.lo randraw$U.lo randsd$U.lo randsdui$U.lo \
+version$U.lo stack-alloc$U.lo mp_bpl$U.lo extract-dbl$U.lo \
+insert-dbl$U.lo
+libgmp_la_OBJECTS = $(am_libgmp_la_OBJECTS)
+am_libmp_la_OBJECTS = assert$U.lo errno$U.lo memory$U.lo mp_bpl$U.lo \
+mp_clz_tab$U.lo mp_minv_tab$U.lo mp_set_fns$U.lo stack-alloc$U.lo
+libmp_la_OBJECTS = $(am_libmp_la_OBJECTS)
+COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CFLAGS = @CFLAGS@
+CCLD = $(CC)
+LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+DIST_SOURCES = $(libgmp_la_SOURCES) $(libmp_la_SOURCES)
+TEXI2DVI = texi2dvi
+# INFO_DEPS = gmp.info
+DVIS = gmp.dvi
+TEXINFOS = gmp.texi
+HEADERS = $(include_HEADERS)
+
+DIST_COMMON = README $(EXTRA_HEADERS) $(include_HEADERS) ./stamp-h.in \
+AUTHORS COPYING COPYING.LIB ChangeLog INSTALL Makefile.am Makefile.in \
+NEWS acconfig.h acinclude.m4 aclocal.m4 ansi2knr.1 ansi2knr.c \
+config.guess config.in config.sub configure configure.in depcomp \
+install-sh ltconfig ltmain.sh mdate-sh missing mkinstalldirs stamp-vti \
+texinfo.tex version.texi
+
+
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+
+GZIP_ENV = --best
+depcomp =
+SOURCES = $(libgmp_la_SOURCES) $(libmp_la_SOURCES)
+OBJECTS = $(am_libgmp_la_OBJECTS) $(am_libmp_la_OBJECTS)
+
+all: all-redirect
+.SUFFIXES:
+.SUFFIXES: .c .dvi .info .lo .o .obj .ps .texi .texinfo .txi
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4)
+ cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile
+
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ cd $(top_builddir) \
+ && CONFIG_FILES=$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ configure.in acinclude.m4
+ cd $(srcdir) && $(ACLOCAL)
+
+config.status: $(srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+ $(SHELL) ./config.status --recheck
+$(srcdir)/configure: @MAINTAINER_MODE_TRUE@$(srcdir)/configure.in $(ACLOCAL_M4) $(CONFIGURE_DEPENDENCIES)
+ cd $(srcdir) && $(AUTOCONF)
+
+config.h: stamp-h
+ @if test ! -f $@; then \
+ rm -f stamp-h; \
+ $(MAKE) stamp-h; \
+ else :; fi
+stamp-h: $(srcdir)/config.in $(top_builddir)/config.status
+ @rm -f stamp-h stamp-hT
+ @echo timestamp > stamp-hT 2> /dev/null
+ cd $(top_builddir) \
+ && CONFIG_FILES= CONFIG_HEADERS=config.h:config.in \
+ $(SHELL) ./config.status
+ @mv stamp-hT stamp-h
+$(srcdir)/config.in: @MAINTAINER_MODE_TRUE@$(srcdir)/./stamp-h.in
+ @if test ! -f $@; then \
+ rm -f $(srcdir)/./stamp-h.in; \
+ $(MAKE) $(srcdir)/./stamp-h.in; \
+ else :; fi
+$(srcdir)/./stamp-h.in: $(top_srcdir)/configure.in $(ACLOCAL_M4) acconfig.h
+ @rm -f $(srcdir)/./stamp-h.in $(srcdir)/./stamp-h.inT
+ @echo timestamp > $(srcdir)/./stamp-h.inT 2> /dev/null
+ cd $(top_srcdir) && $(AUTOHEADER)
+ @mv $(srcdir)/./stamp-h.inT $(srcdir)/./stamp-h.in
+
+mostlyclean-hdr:
+
+clean-hdr:
+
+distclean-hdr:
+ -rm -f config.h
+
+maintainer-clean-hdr:
+
+mostlyclean-libLTLIBRARIES:
+
+clean-libLTLIBRARIES:
+ -test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
+
+distclean-libLTLIBRARIES:
+
+maintainer-clean-libLTLIBRARIES:
+
+install-libLTLIBRARIES: $(lib_LTLIBRARIES)
+ @$(NORMAL_INSTALL)
+ $(mkinstalldirs) $(DESTDIR)$(libdir)
+ @list='$(lib_LTLIBRARIES)'; for p in $$list; do \
+ if test -f $$p; then \
+ echo " $(LIBTOOL) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(libdir)/$$p"; \
+ $(LIBTOOL) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(libdir)/$$p; \
+ else :; fi; \
+ done
+
+uninstall-libLTLIBRARIES:
+ @$(NORMAL_UNINSTALL)
+ @list='$(lib_LTLIBRARIES)'; for p in $$list; do \
+ echo " $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(libdir)/$$p"; \
+ $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(libdir)/$$p; \
+ done
+
+mostlyclean-compile:
+ -rm -f *.o core *.core
+ -rm -f *.$(OBJEXT)
+
+clean-compile:
+
+distclean-compile:
+ -rm -f *.tab.c
+
+maintainer-clean-compile:
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+distclean-libtool:
+
+maintainer-clean-libtool:
+
+mostlyclean-krextra:
+
+clean-krextra:
+ -rm -f ansi2knr
+
+distclean-krextra:
+
+maintainer-clean-krextra:
+ansi2knr: ansi2knr.$(OBJEXT)
+ $(LINK) ansi2knr.$(OBJEXT) $(LIBS)
+ansi2knr.$(OBJEXT): $(CONFIG_HEADER)
+
+
+mostlyclean-kr:
+ -rm -f *_.c
+
+clean-kr:
+
+distclean-kr:
+
+maintainer-clean-kr:
+
+gmp.dll: libgmp.a
+ dllwrap -mno-cygwin --target=i386-unknown-mingw32 \
+ --export-all --dllname gmp.dll --output-lib=libgmp_imp.a \
+ -o gmp.dll libgmp.a
+
+libgmp.la: $(libgmp_la_OBJECTS) $(libgmp_la_DEPENDENCIES)
+ $(LINK) -rpath $(libdir) $(libgmp_la_LDFLAGS) $(libgmp_la_OBJECTS) $(libgmp_la_LIBADD) $(LIBS)
+
+libmp.la: $(libmp_la_OBJECTS) $(libmp_la_DEPENDENCIES)
+ $(LINK) -rpath $(libdir) $(libmp_la_LDFLAGS) $(libmp_la_OBJECTS) $(libmp_la_LIBADD) $(LIBS)
+.c.o:
+ $(COMPILE) -c $<
+.c.obj:
+ $(COMPILE) -c `cygpath -w $<`
+.c.lo:
+ $(LTCOMPILE) -c -o $@ $<
+assert_.c: assert.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/assert.c; then echo $(srcdir)/assert.c; else echo assert.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > assert_.c
+compat_.c: compat.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/compat.c; then echo $(srcdir)/compat.c; else echo compat.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > compat_.c
+errno_.c: errno.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/errno.c; then echo $(srcdir)/errno.c; else echo errno.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > errno_.c
+extract-dbl_.c: extract-dbl.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/extract-dbl.c; then echo $(srcdir)/extract-dbl.c; else echo extract-dbl.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > extract-dbl_.c
+insert-dbl_.c: insert-dbl.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/insert-dbl.c; then echo $(srcdir)/insert-dbl.c; else echo insert-dbl.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > insert-dbl_.c
+memory_.c: memory.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/memory.c; then echo $(srcdir)/memory.c; else echo memory.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > memory_.c
+mp_bpl_.c: mp_bpl.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mp_bpl.c; then echo $(srcdir)/mp_bpl.c; else echo mp_bpl.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > mp_bpl_.c
+mp_clz_tab_.c: mp_clz_tab.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mp_clz_tab.c; then echo $(srcdir)/mp_clz_tab.c; else echo mp_clz_tab.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > mp_clz_tab_.c
+mp_minv_tab_.c: mp_minv_tab.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mp_minv_tab.c; then echo $(srcdir)/mp_minv_tab.c; else echo mp_minv_tab.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > mp_minv_tab_.c
+mp_set_fns_.c: mp_set_fns.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mp_set_fns.c; then echo $(srcdir)/mp_set_fns.c; else echo mp_set_fns.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > mp_set_fns_.c
+rand_.c: rand.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/rand.c; then echo $(srcdir)/rand.c; else echo rand.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > rand_.c
+randclr_.c: randclr.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randclr.c; then echo $(srcdir)/randclr.c; else echo randclr.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randclr_.c
+randlc_.c: randlc.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randlc.c; then echo $(srcdir)/randlc.c; else echo randlc.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randlc_.c
+randlc2x_.c: randlc2x.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randlc2x.c; then echo $(srcdir)/randlc2x.c; else echo randlc2x.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randlc2x_.c
+randraw_.c: randraw.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randraw.c; then echo $(srcdir)/randraw.c; else echo randraw.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randraw_.c
+randsd_.c: randsd.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randsd.c; then echo $(srcdir)/randsd.c; else echo randsd.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randsd_.c
+randsdui_.c: randsdui.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randsdui.c; then echo $(srcdir)/randsdui.c; else echo randsdui.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randsdui_.c
+stack-alloc_.c: stack-alloc.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/stack-alloc.c; then echo $(srcdir)/stack-alloc.c; else echo stack-alloc.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > stack-alloc_.c
+version_.c: version.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/version.c; then echo $(srcdir)/version.c; else echo version.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > version_.c
+assert_.$(OBJEXT) assert_.lo compat_.$(OBJEXT) compat_.lo \
+errno_.$(OBJEXT) errno_.lo extract-dbl_.$(OBJEXT) extract-dbl_.lo \
+insert-dbl_.$(OBJEXT) insert-dbl_.lo memory_.$(OBJEXT) memory_.lo \
+mp_bpl_.$(OBJEXT) mp_bpl_.lo mp_clz_tab_.$(OBJEXT) mp_clz_tab_.lo \
+mp_minv_tab_.$(OBJEXT) mp_minv_tab_.lo mp_set_fns_.$(OBJEXT) \
+mp_set_fns_.lo rand_.$(OBJEXT) rand_.lo randclr_.$(OBJEXT) randclr_.lo \
+randlc_.$(OBJEXT) randlc_.lo randlc2x_.$(OBJEXT) randlc2x_.lo \
+randraw_.$(OBJEXT) randraw_.lo randsd_.$(OBJEXT) randsd_.lo \
+randsdui_.$(OBJEXT) randsdui_.lo stack-alloc_.$(OBJEXT) stack-alloc_.lo \
+version_.$(OBJEXT) version_.lo : $(ANSI2KNR)
+
+$(srcdir)/version.texi: @MAINTAINER_MODE_TRUE@stamp-vti
+ @:
+
+$(srcdir)/stamp-vti: gmp.texi $(top_srcdir)/configure.in
+ @echo "@set UPDATED `$(SHELL) $(srcdir)/mdate-sh $(srcdir)/gmp.texi`" > vti.tmp
+ @echo "@set EDITION $(VERSION)" >> vti.tmp
+ @echo "@set VERSION $(VERSION)" >> vti.tmp
+ @cmp -s vti.tmp $(srcdir)/version.texi \
+ || (echo "Updating $(srcdir)/version.texi"; \
+ cp vti.tmp $(srcdir)/version.texi)
+ -@rm -f vti.tmp
+ @cp $(srcdir)/version.texi $@
+
+mostlyclean-vti:
+ -rm -f vti.tmp
+
+clean-vti:
+
+distclean-vti:
+
+maintainer-clean-vti:
+ -@MAINTAINER_MODE_TRUE@rm -f $(srcdir)/stamp-vti $(srcdir)/version.texi
+
+# gmp.info: gmp.texi version.texi
+# gmp.dvi: gmp.texi version.texi
+
+
+DVIPS = dvips
+
+.texi.info:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+
+.texi.dvi:
+ TEXINPUTS=$(srcdir):$$TEXINPUTS \
+ MAKEINFO='$(MAKEINFO) -I $(srcdir)' $(TEXI2DVI) $<
+
+.texi:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+
+.texinfo.info:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+
+.texinfo:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+
+.texinfo.dvi:
+ TEXINPUTS=$(srcdir):$$TEXINPUTS \
+ MAKEINFO='$(MAKEINFO) -I $(srcdir)' $(TEXI2DVI) $<
+
+.txi.info:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+
+.txi.dvi:
+ TEXINPUTS=$(srcdir):$$TEXINPUTS \
+ MAKEINFO='$(MAKEINFO) -I $(srcdir)' $(TEXI2DVI) $<
+
+.txi:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+.dvi.ps:
+ $(DVIPS) $< -o $@
+
+install-info-am: $(INFO_DEPS)
+ @$(NORMAL_INSTALL)
+ $(mkinstalldirs) $(DESTDIR)$(infodir)
+ @list='$(INFO_DEPS)'; \
+ for file in $$list; do \
+ d=$(srcdir); \
+ for ifile in `CDPATH=: && cd $$d && echo $$file $$file-[0-9] $$file-[0-9][0-9]`; do \
+ if test -f $$d/$$ifile; then \
+ echo " $(INSTALL_DATA) $$d/$$ifile $(DESTDIR)$(infodir)/$$ifile"; \
+ $(INSTALL_DATA) $$d/$$ifile $(DESTDIR)$(infodir)/$$ifile; \
+ else : ; fi; \
+ done; \
+ done
+ @$(POST_INSTALL)
+ @if $(SHELL) -c 'install-info --version | sed 1q | fgrep -s -v -i debian' >/dev/null 2>&1; then \
+ list='$(INFO_DEPS)'; \
+ for file in $$list; do \
+ echo " install-info --info-dir=$(DESTDIR)$(infodir) $(DESTDIR)$(infodir)/$$file";\
+ install-info --info-dir=$(DESTDIR)$(infodir) $(DESTDIR)$(infodir)/$$file || :;\
+ done; \
+ else : ; fi
+
+uninstall-info:
+ $(PRE_UNINSTALL)
+ @if $(SHELL) -c 'install-info --version | sed 1q | fgrep -s -v -i debian' >/dev/null 2>&1; then \
+ list='$(INFO_DEPS)'; \
+ for file in $$list; do \
+ echo " install-info --info-dir=$(DESTDIR)$(infodir) --remove $(DESTDIR)$(infodir)/$$file"; \
+ install-info --info-dir=$(DESTDIR)$(infodir) --remove $(DESTDIR)$(infodir)/$$file; \
+ done; \
+ else :; fi
+ @$(NORMAL_UNINSTALL)
+ @list='$(INFO_DEPS)'; \
+ for file in $$list; do \
+ (if cd $(DESTDIR)$(infodir); then \
+	    echo " rm -f $$file $$file-[0-9] $$file-[0-9][0-9]"; \
+ rm -f $$file $$file-[0-9] $$file-[0-9][0-9]; \
+ else :; fi); \
+ done
+
+dist-info: $(INFO_DEPS)
+ list='$(INFO_DEPS)'; \
+ for base in $$list; do \
+ d=$(srcdir); \
+ for file in `CDPATH=: && cd $$d && eval echo $$base*`; do \
+ test -f $(distdir)/$$file \
+ || cp -p $$d/$$file $(distdir)/$$file; \
+ done; \
+ done
+
+mostlyclean-aminfo:
+ -rm -f gmp.aux gmp.cp gmp.cps gmp.dvi gmp.fn gmp.fns gmp.pgs gmp.ky \
+ gmp.kys gmp.ps gmp.log gmp.pg gmp.toc gmp.tp gmp.tps gmp.vr \
+ gmp.vrs gmp.op gmp.tr gmp.cv gmp.cn gmp.cm gmp.ov
+
+clean-aminfo:
+
+distclean-aminfo:
+
+maintainer-clean-aminfo:
+ cd $(srcdir) && for i in $(INFO_DEPS); do \
+ rm -f $$i; \
+ if test "`echo $$i-[0-9]*`" != "$$i-[0-9]*"; then \
+ rm -f $$i-[0-9]*; \
+ fi; \
+ done
+
+install-includeHEADERS: $(include_HEADERS)
+ @$(NORMAL_INSTALL)
+ $(mkinstalldirs) $(DESTDIR)$(includedir)
+ @list='$(include_HEADERS)'; for p in $$list; do \
+ if test -f "$$p"; then d= ; else d="$(srcdir)/"; fi; \
+ f="`echo $$p | sed -e 's|^.*/||'`"; \
+ echo " $(INSTALL_DATA) $$d$$p $(DESTDIR)$(includedir)/$$f"; \
+ $(INSTALL_DATA) $$d$$p $(DESTDIR)$(includedir)/$$f; \
+ done
+
+uninstall-includeHEADERS:
+ @$(NORMAL_UNINSTALL)
+ @list='$(include_HEADERS)'; for p in $$list; do \
+ f="`echo $$p | sed -e 's|^.*/||'`"; \
+ echo " rm -f $(DESTDIR)$(includedir)/$$f"; \
+ rm -f $(DESTDIR)$(includedir)/$$f; \
+ done
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+# (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+
+all-recursive install-data-recursive install-exec-recursive \
+installdirs-recursive install-recursive uninstall-recursive \
+check-recursive installcheck-recursive info-recursive dvi-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ target=`echo $@ | sed s/-recursive//`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ dot_seen=yes; \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done; \
+ if test "$$dot_seen" = "no"; then \
+ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+ fi; test -z "$$fail"
+
+mostlyclean-recursive clean-recursive distclean-recursive \
+maintainer-clean-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ rev=''; list='$(SUBDIRS)'; for subdir in $$list; do \
+ rev="$$subdir $$rev"; \
+ if test "$$subdir" = "."; then dot_seen=yes; else :; fi; \
+ done; \
+ test "$$dot_seen" = "no" && rev=". $$rev"; \
+ target=`echo $@ | sed s/-recursive//`; \
+ for subdir in $$rev; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done && test -z "$$fail"
+tags-recursive:
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+ done
+
+tags: TAGS
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ mkid -f$$here/ID $$unique $(LISP)
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES) config.in $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ tags=; \
+ here=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ if test "$$subdir" = .; then :; else \
+ test -f $$subdir/TAGS && tags="$$tags -i $$here/$$subdir/TAGS"; \
+ fi; \
+ done; \
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ test -z "$(ETAGS_ARGS)config.in$$unique$(LISP)$$tags" \
+ || etags $(ETAGS_ARGS) $$tags config.in $$unique $(LISP)
+
+mostlyclean-tags:
+
+clean-tags:
+
+distclean-tags:
+ -rm -f TAGS ID
+
+maintainer-clean-tags:
+
+distdir = $(PACKAGE)-$(VERSION)
+top_distdir = $(distdir)
+
+
+# This target untars the dist file and tries a VPATH configuration. Then
+# it guarantees that the distribution is self-contained by making another
+# tarfile.
+distcheck: dist
+ -chmod -R a+w $(distdir) > /dev/null 2>&1; rm -rf $(distdir)
+ GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(AMTAR) xf -
+ chmod -R a-w $(distdir); chmod a+w $(distdir)
+ mkdir $(distdir)/=build
+ mkdir $(distdir)/=inst
+ chmod a-w $(distdir)
+ dc_install_base=`CDPATH=: && cd $(distdir)/=inst && pwd` \
+ && cd $(distdir)/=build \
+ && ../configure --srcdir=.. --prefix=$$dc_install_base \
+ && $(MAKE) $(AM_MAKEFLAGS) \
+ && $(MAKE) $(AM_MAKEFLAGS) dvi \
+ && $(MAKE) $(AM_MAKEFLAGS) check \
+ && $(MAKE) $(AM_MAKEFLAGS) install \
+ && $(MAKE) $(AM_MAKEFLAGS) installcheck \
+ && $(MAKE) $(AM_MAKEFLAGS) uninstall \
+ && test `find $$dc_install_base -type f -print | wc -l` -le 1 \
+ && $(MAKE) $(AM_MAKEFLAGS) dist \
+ && $(MAKE) $(AM_MAKEFLAGS) distclean \
+ && rm -f $(distdir).tar.gz \
+ && test `find . -type f -print | wc -l` -eq 0
+ -chmod -R a+w $(distdir) > /dev/null 2>&1; rm -rf $(distdir)
+ @banner="$(distdir).tar.gz is ready for distribution"; \
+ dashes=`echo "$$banner" | sed s/./=/g`; \
+ echo "$$dashes"; \
+ echo "$$banner"; \
+ echo "$$dashes"
+dist: distdir
+ -find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \
+ ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
+ ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
+ ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \
+ || chmod -R a+r $(distdir)
+ $(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c > $(distdir).tar.gz
+ -chmod -R a+w $(distdir) > /dev/null 2>&1; rm -rf $(distdir)
+dist-all: distdir
+ -find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \
+ ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
+ ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
+ ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \
+ || chmod -R a+r $(distdir)
+ $(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c > $(distdir).tar.gz
+ -chmod -R a+w $(distdir) > /dev/null 2>&1; rm -rf $(distdir)
+distdir: $(DISTFILES)
+ @if sed 15q $(srcdir)/NEWS | fgrep -e "$(VERSION)" > /dev/null; then :; else \
+ echo "NEWS not updated; not releasing" 1>&2; \
+ exit 1; \
+ fi
+ -chmod -R a+w $(distdir) > /dev/null 2>&1; rm -rf $(distdir)
+ mkdir $(distdir)
+ $(mkinstalldirs) $(distdir)/mpfr
+ @for file in $(DISTFILES); do \
+ d=$(srcdir); \
+ if test -d $$d/$$file; then \
+ cp -pR $$d/$$file $(distdir); \
+ else \
+ test -f $(distdir)/$$file \
+ || cp -p $$d/$$file $(distdir)/$$file || :; \
+ fi; \
+ done
+ for subdir in $(SUBDIRS); do \
+ if test "$$subdir" = .; then :; else \
+ test -d $(distdir)/$$subdir \
+ || mkdir $(distdir)/$$subdir \
+ || exit 1; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(distdir) distdir=../$(distdir)/$$subdir distdir) \
+ || exit 1; \
+ fi; \
+ done
+ $(MAKE) $(AM_MAKEFLAGS) top_distdir="$(top_distdir)" distdir="$(distdir)" dist-info
+ $(MAKE) $(AM_MAKEFLAGS) top_distdir="$(top_distdir)" distdir="$(distdir)" dist-hook
+info-am: $(INFO_DEPS)
+info: info-recursive
+dvi-am: $(DVIS)
+dvi: dvi-recursive
+check-am: all-am
+check: check-recursive
+installcheck-am:
+installcheck: installcheck-recursive
+all-recursive-am: config.h
+ $(MAKE) $(AM_MAKEFLAGS) all-recursive
+
+install-exec-am: install-libLTLIBRARIES
+install-exec: install-exec-recursive
+
+install-data-am: install-info-am install-includeHEADERS
+install-data: install-data-recursive
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+install: install-recursive
+uninstall-am: uninstall-libLTLIBRARIES uninstall-info \
+ uninstall-includeHEADERS
+uninstall: uninstall-recursive
+all-am: Makefile $(INFO_DEPS) $(ANSI2KNR) $(LTLIBRARIES) $(HEADERS) \
+ config.h
+all-redirect: all-recursive-am
+install-strip:
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_STRIP_FLAG=-s install
+installdirs: installdirs-recursive
+installdirs-am:
+ $(mkinstalldirs) $(DESTDIR)$(libdir) $(DESTDIR)$(infodir) \
+ $(DESTDIR)$(includedir)
+
+
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+ -rm -f Makefile $(CONFIG_CLEAN_FILES)
+ -rm -f config.cache config.log stamp-h stamp-h[0-9]*
+ -test -z "$(DISTCLEANFILES)" || rm -f $(DISTCLEANFILES)
+
+maintainer-clean-generic:
+ -rm -f Makefile.in
+mostlyclean-am: mostlyclean-hdr mostlyclean-libLTLIBRARIES \
+ mostlyclean-compile mostlyclean-libtool \
+ mostlyclean-krextra mostlyclean-kr mostlyclean-vti \
+ mostlyclean-aminfo mostlyclean-tags mostlyclean-generic
+
+mostlyclean: mostlyclean-recursive
+
+clean-am: clean-hdr clean-libLTLIBRARIES clean-compile clean-libtool \
+ clean-krextra clean-kr clean-vti clean-aminfo \
+ clean-tags clean-generic mostlyclean-am
+
+clean: clean-recursive
+
+distclean-am: distclean-hdr distclean-libLTLIBRARIES distclean-compile \
+ distclean-libtool distclean-krextra distclean-kr \
+ distclean-vti distclean-aminfo distclean-tags \
+ distclean-generic clean-am
+ -rm -f libtool
+
+distclean: distclean-recursive
+ -rm -f config.status
+
+maintainer-clean-am: maintainer-clean-hdr \
+ maintainer-clean-libLTLIBRARIES \
+ maintainer-clean-compile maintainer-clean-libtool \
+ maintainer-clean-krextra maintainer-clean-kr \
+ maintainer-clean-vti maintainer-clean-aminfo \
+ maintainer-clean-tags maintainer-clean-generic \
+ distclean-am
+ @echo "This command is intended for maintainers to use;"
+ @echo "it deletes files that may require special tools to rebuild."
+
+maintainer-clean: maintainer-clean-recursive
+ -rm -f config.status
+
+.PHONY: mostlyclean-hdr distclean-hdr clean-hdr maintainer-clean-hdr \
+mostlyclean-libLTLIBRARIES distclean-libLTLIBRARIES \
+clean-libLTLIBRARIES maintainer-clean-libLTLIBRARIES \
+uninstall-libLTLIBRARIES install-libLTLIBRARIES mostlyclean-compile \
+distclean-compile clean-compile maintainer-clean-compile \
+mostlyclean-libtool distclean-libtool clean-libtool \
+maintainer-clean-libtool mostlyclean-krextra distclean-krextra \
+clean-krextra maintainer-clean-krextra mostlyclean-kr distclean-kr \
+clean-kr maintainer-clean-kr mostlyclean-vti distclean-vti clean-vti \
+maintainer-clean-vti install-info-am uninstall-info mostlyclean-aminfo \
+distclean-aminfo clean-aminfo maintainer-clean-aminfo \
+uninstall-includeHEADERS install-includeHEADERS install-recursive \
+uninstall-recursive install-data-recursive uninstall-data-recursive \
+install-exec-recursive uninstall-exec-recursive installdirs-recursive \
+uninstalldirs-recursive all-recursive check-recursive \
+installcheck-recursive info-recursive dvi-recursive \
+mostlyclean-recursive distclean-recursive clean-recursive \
+maintainer-clean-recursive tags tags-recursive mostlyclean-tags \
+distclean-tags clean-tags maintainer-clean-tags distdir info-am info \
+dvi-am dvi check check-am installcheck-am installcheck all-recursive-am \
+install-exec-am install-exec install-data-am install-data install-am \
+install uninstall-am uninstall all-redirect all-am all install-strip \
+installdirs-am installdirs mostlyclean-generic distclean-generic \
+clean-generic maintainer-clean-generic clean mostlyclean distclean \
+maintainer-clean
+
+
+# Don't ship CVS directories or emacs backups.
+dist-hook:
+ -find $(distdir) \( -name CVS -type d \) -o -name "*.~*" \
+ | xargs rm -rf
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/rts/gmp/NEWS b/rts/gmp/NEWS
new file mode 100644
index 0000000000..3b549d59f3
--- /dev/null
+++ b/rts/gmp/NEWS
@@ -0,0 +1,136 @@
+Changes between MP version 3.1 and 3.1.1
+
+* Bug fixes for division (rare), mpf_get_str, FFT, and miscellaneous minor
+ things.
+
+Changes between MP version 3.0 and 3.1
+
+* Bug fixes.
+* Improved `make check' running more tests.
+* Tuned algorithm cutoff points for many machines. This will improve speed for
+ a lot of operations, in some cases by a large amount.
+* Major speed improvements: Alpha 21264.
+* Some speed improvements: Cray vector computers, AMD K6 and Athlon, Intel P5
+ and Pentium Pro/II/III.
+* The mpf_get_prec function now works as it did in GMP 2.
+* New utilities for auto-tuning and speed measuring.
+* Multiplication now optionally uses FFT for very large operands. (To enable
+ it, pass --enable-fft to configure.)
+* Support for new systems: Solaris running on x86, FreeBSD 5, HP-UX 11, Cray
+ vector computers, Rhapsody, Nextstep/Openstep, MacOS.
+* Support for shared libraries on 32-bit HPPA.
+* New integer functions: mpz_mul_si, mpz_odd_p, mpz_even_p.
+* New Kronecker symbol functions: mpz_kronecker_si, mpz_kronecker_ui,
+ mpz_si_kronecker, mpz_ui_kronecker.
+* New rational functions: mpq_out_str, mpq_swap.
+* New float functions: mpf_swap.
+* New mpn functions: mpn_divexact_by3c, mpn_tdiv_qr.
+* New EXPERIMENTAL function layer for accurate floating-point arithmetic, mpfr.
+ To try it, pass --enable-mpfr to configure. See the mpfr subdirectory for
+ more information; it is not documented in the main GMP manual.
+
+Changes between MP version 3.0 and 3.0.1
+
+* Memory leaks in gmp_randinit and mpz_probab_prime_p fixed.
+* Documentation for gmp_randinit fixed. Misc documentation errors fixed.
+
+Changes between MP version 2.0 and 3.0
+
+* Source level compatibility with past releases (except mpn_gcd).
+* Bug fixes.
+* Much improved speed thanks to both host independent and host dependent
+ optimizations.
+* Switch to autoconf/automake/libtool.
+* Support for building libgmp as a shared library.
+* Multiplication and squaring using 3-way Toom-Cook.
+* Division using the Burnikel-Ziegler method.
+* New functions computing binomial coefficients: mpz_bin_ui, mpz_bin_uiui.
+* New function computing Fibonacci numbers: mpz_fib_ui.
+* New random number generators: mpf_urandomb, mpz_rrandomb, mpz_urandomb,
+ mpz_urandomm, gmp_randclear, gmp_randinit, gmp_randinit_lc_2exp, gmp_randseed,
+ gmp_randseed_ui.
+* New function for quickly extracting limbs: mpz_getlimbn.
+* New functions performing integer size tests: mpz_fits_sint_p,
+ mpz_fits_slong_p, mpz_fits_sshort_p, mpz_fits_uint_p, mpz_fits_ulong_p,
+ mpz_fits_ushort_p.
+* New mpf functions: mpf_ceil, mpf_floor, mpf_pow_ui, mpf_trunc.
+* New mpq function: mpq_set_d.
+* New mpz functions: mpz_addmul_ui, mpz_cmpabs, mpz_cmpabs_ui, mpz_lcm,
+ mpz_nextprime, mpz_perfect_power_p, mpz_remove, mpz_root, mpz_swap,
+ mpz_tdiv_ui, mpz_tstbit, mpz_xor.
+* New mpn function: mpn_divexact_by3.
+* New CPU support: DEC Alpha 21264, AMD K6 and Athlon, HPPA 2.0 and 64,
+ Intel Pentium Pro and Pentium-II/III, Sparc 64, PowerPC 64.
+* Almost 10 times faster mpz_invert and mpn_gcdext.
+* The interface of mpn_gcd has changed.
+* Better support for MIPS R4x000 and R5000 under Irix 6.
+* Improved support for SPARCv8 and SPARCv9 processors.
+
+Changes between MP version 2.0 and 2.0.2
+
+* Many bug fixes.
+
+Changes between MP version 1.3.2 and 2.0
+
+* Division routines in the mpz class have changed. There are three classes of
+  functions that round the quotient to -infinity, 0, and +infinity,
+ respectively. The first class of functions have names that begin with
+ mpz_fdiv (f is short for floor), the second class' names begin with mpz_tdiv
+ (t is short for trunc), and the third class' names begin with mpz_cdiv (c is
+ short for ceil).
+
+ The old division routines beginning with mpz_m are similar to the new
+ mpz_fdiv, with the exception that some of the new functions return useful
+ values.
+
+ The old function names can still be used. All the old functions names will
+ now do floor division, not trunc division as some of them used to. This was
+ changed to make the functions more compatible with common mathematical
+ practice.
+
+ The mpz_mod and mpz_mod_ui functions now compute the mathematical mod
+ function. I.e., the sign of the 2nd argument is ignored.
+
+* The mpq assignment functions do not canonicalize their results. A new
+ function, mpq_canonicalize must be called by the user if the result is not
+ known to be canonical.
+* The mpn functions are now documented. These functions are intended for
+ very time critical applications, or applications that need full control over
+ memory allocation. Note that the mpn interface is irregular and hard to
+ use.
+* New functions for arbitrary precision floating point arithmetic. Names
+ begin with `mpf_'. Associated type mpf_t.
+* New and improved mpz functions, including much faster GCD, fast exact
+ division (mpz_divexact), bit scan (mpz_scan0 and mpz_scan1), and number
+ theoretical functions like Jacobi (mpz_jacobi) and multiplicative inverse
+ (mpz_invert).
+* New variable types (mpz_t and mpq_t) are available that makes syntax of
+ mpz and mpq calls nicer (no need for & before variables). The MP_INT and
+ MP_RAT types are still available for compatibility.
+* Uses GNU configure. This makes it possible to choose target architecture
+ and CPU variant, and to compile into a separate object directory.
+* Carefully optimized assembly for important inner loops. Support for DEC
+ Alpha, Amd 29000, HPPA 1.0 and 1.1, Intel Pentium and generic x86, Intel
+ i960, Motorola MC68000, MC68020, MC88100, and MC88110, Motorola/IBM
+ PowerPC, National NS32000, IBM POWER, MIPS R3000, R4000, SPARCv7,
+ SuperSPARC, generic SPARCv8, and DEC VAX. Some support also for ARM,
+ Clipper, IBM ROMP (RT), and Pyramid AP/XP.
+* Faster. Thanks to the assembler code, new algorithms, and general tuning.
+ In particular, the speed on machines without GCC is improved.
+* Support for machines without alloca.
+* Now under the LGPL.
+
+INCOMPATIBILITIES BETWEEN GMP 1 AND GMP 2
+
+* mpq assignment functions do not canonicalize their results.
+* mpz division functions round differently.
+* mpz mod functions now really compute mod.
+* mpz_powm and mpz_powm_ui now really use mod for reduction.
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/README b/rts/gmp/README
new file mode 100644
index 0000000000..177c97eb12
--- /dev/null
+++ b/rts/gmp/README
@@ -0,0 +1,84 @@
+
+ THE GNU MP LIBRARY
+
+
+GNU MP is a library for arbitrary precision arithmetic, operating on signed
+integers, rational numbers, and floating point numbers. It has a rich set of
+functions, and the functions have a regular interface.
+
+GNU MP is designed to be as fast as possible, both for small operands and huge
+operands. The speed is achieved by using fullwords as the basic arithmetic
+type, by using fast algorithms, with carefully optimized assembly code for the
+most common inner loops for lots of CPUs, and by a general emphasis on speed
+(instead of simplicity or elegance).
+
+GNU MP is believed to be faster than any other similar library. Its advantage
+increases with operand sizes for certain operations, since GNU MP in many
+cases has asymptotically faster algorithms.
+
+GNU MP is free software and may be freely copied on the terms contained in the
+files COPYING.LIB and COPYING (most of GNU MP is under the former, some under
+the latter).
+
+
+
+ OVERVIEW OF GNU MP
+
+There are five classes of functions in GNU MP.
+
+ 1. Signed integer arithmetic functions (mpz). These functions are intended
+ to be easy to use, with their regular interface. The associated type is
+ `mpz_t'.
+
+ 2. Rational arithmetic functions (mpq). For now, just a small set of
+     functions necessary for basic rational arithmetic.  The associated type
+ is `mpq_t'.
+
+ 3. Floating-point arithmetic functions (mpf). If the C type `double'
+ doesn't give enough precision for your application, declare your
+ variables as `mpf_t' instead, set the precision to any number desired,
+ and call the functions in the mpf class for the arithmetic operations.
+
+ 4. Positive-integer, hard-to-use, very low overhead functions are in the
+ mpn class. No memory management is performed. The caller must ensure
+ enough space is available for the results. The set of functions is not
+ regular, nor is the calling interface. These functions accept input
+ arguments in the form of pairs consisting of a pointer to the least
+ significant word, and an integral size telling how many limbs (= words)
+ the pointer points to.
+
+ Almost all calculations, in the entire package, are made by calling these
+ low-level functions.
+
+ 5. Berkeley MP compatible functions.
+
+ To use these functions, include the file "mp.h". You can test if you are
+ using the GNU version by testing if the symbol __GNU_MP__ is defined.
+
+For more information on how to use GNU MP, please refer to the documentation.
+It is composed from the file gmp.texi, and can be displayed on the screen or
+printed. How to do that, as well how to build the library, is described in
+the INSTALL file in this directory.
+
+
+
+ REPORTING BUGS
+
+If you find a bug in the library, please make sure to tell us about it!
+
+You should first check the GNU MP web pages at http://www.swox.com/gmp/,
+under "Status of the current release". There will be patches for all known
+serious bugs there.
+
+Report bugs to bug-gmp@gnu.org. What information is needed in a good bug
+report is described in the manual. The same address can be used for
+suggesting modifications and enhancements.
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 78
+End:
diff --git a/rts/gmp/acconfig.h b/rts/gmp/acconfig.h
new file mode 100644
index 0000000000..dfb1b0b039
--- /dev/null
+++ b/rts/gmp/acconfig.h
@@ -0,0 +1,92 @@
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+@TOP@
+
+/* Define if a limb is long long. */
+#undef _LONG_LONG_LIMB
+
+/* Define if we have native implementation of function. */
+#undef HAVE_NATIVE_
+#undef HAVE_NATIVE_mpn_add
+#undef HAVE_NATIVE_mpn_add_1
+#undef HAVE_NATIVE_mpn_add_n
+#undef HAVE_NATIVE_mpn_add_nc
+#undef HAVE_NATIVE_mpn_addmul_1
+#undef HAVE_NATIVE_mpn_addmul_1c
+#undef HAVE_NATIVE_mpn_addsub_n
+#undef HAVE_NATIVE_mpn_addsub_nc
+#undef HAVE_NATIVE_mpn_and_n
+#undef HAVE_NATIVE_mpn_andn_n
+#undef HAVE_NATIVE_mpn_bdivmod
+#undef HAVE_NATIVE_mpn_cmp
+#undef HAVE_NATIVE_mpn_com_n
+#undef HAVE_NATIVE_mpn_copyd
+#undef HAVE_NATIVE_mpn_copyi
+#undef HAVE_NATIVE_mpn_divexact_by3c
+#undef HAVE_NATIVE_mpn_divrem
+#undef HAVE_NATIVE_mpn_divrem_1
+#undef HAVE_NATIVE_mpn_divrem_1c
+#undef HAVE_NATIVE_mpn_divrem_2
+#undef HAVE_NATIVE_mpn_divrem_newton
+#undef HAVE_NATIVE_mpn_divrem_classic
+#undef HAVE_NATIVE_mpn_dump
+#undef HAVE_NATIVE_mpn_gcd
+#undef HAVE_NATIVE_mpn_gcd_1
+#undef HAVE_NATIVE_mpn_gcdext
+#undef HAVE_NATIVE_mpn_get_str
+#undef HAVE_NATIVE_mpn_hamdist
+#undef HAVE_NATIVE_mpn_invert_limb
+#undef HAVE_NATIVE_mpn_ior_n
+#undef HAVE_NATIVE_mpn_iorn_n
+#undef HAVE_NATIVE_mpn_lshift
+#undef HAVE_NATIVE_mpn_mod_1
+#undef HAVE_NATIVE_mpn_mod_1c
+#undef HAVE_NATIVE_mpn_mul
+#undef HAVE_NATIVE_mpn_mul_1
+#undef HAVE_NATIVE_mpn_mul_1c
+#undef HAVE_NATIVE_mpn_mul_basecase
+#undef HAVE_NATIVE_mpn_mul_n
+#undef HAVE_NATIVE_mpn_nand_n
+#undef HAVE_NATIVE_mpn_nior_n
+#undef HAVE_NATIVE_mpn_perfect_square_p
+#undef HAVE_NATIVE_mpn_popcount
+#undef HAVE_NATIVE_mpn_preinv_mod_1
+#undef HAVE_NATIVE_mpn_random2
+#undef HAVE_NATIVE_mpn_random
+#undef HAVE_NATIVE_mpn_rawrandom
+#undef HAVE_NATIVE_mpn_rshift
+#undef HAVE_NATIVE_mpn_scan0
+#undef HAVE_NATIVE_mpn_scan1
+#undef HAVE_NATIVE_mpn_set_str
+#undef HAVE_NATIVE_mpn_sqrtrem
+#undef HAVE_NATIVE_mpn_sqr_basecase
+#undef HAVE_NATIVE_mpn_sub
+#undef HAVE_NATIVE_mpn_sub_1
+#undef HAVE_NATIVE_mpn_sub_n
+#undef HAVE_NATIVE_mpn_sub_nc
+#undef HAVE_NATIVE_mpn_submul_1
+#undef HAVE_NATIVE_mpn_submul_1c
+#undef HAVE_NATIVE_mpn_udiv_w_sdiv
+#undef HAVE_NATIVE_mpn_umul_ppmm
+#undef HAVE_NATIVE_mpn_udiv_qrnnd
+#undef HAVE_NATIVE_mpn_xor_n
+#undef HAVE_NATIVE_mpn_xnor_n
diff --git a/rts/gmp/acinclude.m4 b/rts/gmp/acinclude.m4
new file mode 100644
index 0000000000..a02394a963
--- /dev/null
+++ b/rts/gmp/acinclude.m4
@@ -0,0 +1,835 @@
+dnl GMP specific autoconf macros
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl GMP_HEADER_GETVAL(NAME,FILE)
+dnl ----------------------------
+dnl Expand to the value of a "#define NAME" from the given FILE.
+dnl The regexps here aren't very rugged, but are enough for gmp.
+dnl /dev/null as a parameter prevents a hang if $2 is accidentally omitted.
+
+define(GMP_HEADER_GETVAL,
+[patsubst(patsubst(
+esyscmd([grep "^#define $1 " $2 /dev/null 2>/dev/null]),
+[^.*$1[ ]+],[]),
+[[
+ ]*$],[])])
+
+
+dnl GMP_VERSION
+dnl -----------
+dnl The gmp version number, extracted from the #defines in gmp.h.
+dnl Two digits like 3.0 if patchlevel <= 0, or three digits like 3.0.1 if
+dnl patchlevel > 0.
+
+define(GMP_VERSION,
+[GMP_HEADER_GETVAL(__GNU_MP_VERSION,gmp.h)[]dnl
+.GMP_HEADER_GETVAL(__GNU_MP_VERSION_MINOR,gmp.h)[]dnl
+ifelse(m4_eval(GMP_HEADER_GETVAL(__GNU_MP_VERSION_PATCHLEVEL,gmp.h) > 0),1,
+[.GMP_HEADER_GETVAL(__GNU_MP_VERSION_PATCHLEVEL,gmp.h)])])
+
+
+dnl GMP_PROG_M4()
+dnl -------------
+dnl
+dnl Find a working m4, either in $PATH or likely locations, and setup $M4
+dnl and an AC_SUBST accordingly. If $M4 is already set then it's a user
+dnl choice and is accepted with no checks. GMP_PROG_M4 is like
+dnl AC_PATH_PROG or AC_CHECK_PROG, but it tests each m4 found to see if
+dnl it's good enough.
+dnl
+dnl See mpn/asm-defs.m4 for details on the known bad m4s.
+
+AC_DEFUN(GMP_PROG_M4,
+[AC_CACHE_CHECK([for suitable m4],
+ gmp_cv_prog_m4,
+[if test -n "$M4"; then
+ gmp_cv_prog_m4="$M4"
+else
+ cat >conftest.m4 <<\EOF
+dnl must protect this against being expanded during autoconf m4!
+[define(dollarhash,``$][#'')dnl
+ifelse(dollarhash(x),1,`define(t1,Y)',
+``bad: $][# not supported (SunOS /usr/bin/m4)
+'')dnl
+ifelse(eval(89),89,`define(t2,Y)',
+`bad: eval() doesnt support 8 or 9 in a constant (OpenBSD 2.6 m4)
+')dnl
+ifelse(t1`'t2,YY,`good
+')dnl]
+EOF
+ echo "trying m4" 1>&AC_FD_CC
+ gmp_tmp_val="`(m4 conftest.m4) 2>&AC_FD_CC`"
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="m4"
+ else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word. This closes a longstanding sh security hole.
+ ac_dummy="$PATH:/usr/5bin"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ echo "trying $ac_dir/m4" 1>&AC_FD_CC
+ gmp_tmp_val="`($ac_dir/m4 conftest.m4) 2>&AC_FD_CC`"
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="$ac_dir/m4"
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ if test -z "$gmp_cv_prog_m4"; then
+ AC_MSG_ERROR([No usable m4 in \$PATH or /usr/5bin (see config.log for reasons).])
+ fi
+ fi
+ rm -f conftest.m4
+fi])
+M4="$gmp_cv_prog_m4"
+AC_SUBST(M4)
+])
+
+
+dnl GMP_PROG_CC_FIND([CC_LIST], [REQ_64BIT_CC])
+dnl Find first working compiler in CC_LIST.
+dnl If REQ_64BIT_CC is "yes", the compiler is required to be able to
+dnl produce 64-bit code.
+dnl NOTE: If a compiler needs any special flags for producing 64-bit code,
+dnl these have to be found in shell variable `gmp_cflags64_{cc}', where `{cc}'
+dnl is the name of the compiler.
+dnl Set CC to the name of the first working compiler.
+dnl If a 64-bit compiler is found, set CC64 to the name of the compiler and
+dnl CFLAGS64 to flags to use.
+dnl This macro does not test if any of the compilers found is a GNU compiler.
+dnl To do this, when you have finally made up your mind on which one to use,
+dnl and set CC accordingly, invoke [GMP_PROG_CC_SELECT]. That macro will
+dnl also make sure that your selection of CFLAGS is valid.
+dnl
+AC_DEFUN(GMP_PROG_CC_FIND,
+[AC_BEFORE([$0], [CC_PROG_CPP])
+ifelse([$1], , gmp_cc_list="gcc cc", gmp_cc_list="[$1]")
+ifelse([$2], , gmp_req_64bit_cc="no", gmp_req_64bit_cc="[$2]")
+
+CC32=
+CC64=
+for c in $gmp_cc_list; do
+ # Avoid cache hits.
+ unset CC
+ unset ac_cv_prog_CC
+ AC_CHECK_TOOL(CC, $c, $c)
+ if test -n "$CC"; then
+ eval c_flags=\$gmp_cflags_$c
+ GMP_PROG_CC_WORKS($CC, $c_flags,
+ gmp_prog_cc_works=yes,
+ gmp_prog_cc_works=no)
+
+ if test "$gmp_prog_cc_works" != "yes"; then
+ continue
+ fi
+
+ # Save first working compiler, whether 32- or 64-bit capable.
+ if test -z "$CC32"; then
+ CC32="$CC"
+ fi
+ if test "$gmp_req_64bit_cc" = "yes"; then
+ eval c_flags=\$gmp_cflags64_$c
+
+ # Verify that the compiler works in 64-bit mode as well.
+ # /usr/ucb/cc on Solaris 7 can *compile* in 64-bit mode, but not link.
+ GMP_PROG_CC_WORKS($c, $c_flags,
+ gmp_prog_cc_works=yes,
+ gmp_prog_cc_works=no)
+
+ if test "$gmp_prog_cc_works" = "yes"; then
+ GMP_CHECK_CC_64BIT($c, $c_flags)
+ if test "$gmp_cv_cc_64bit" = "yes"; then
+ test -z "$CC64" && CC64="$c"
+ test -z "$CFLAGS64" && CFLAGS64="$c_flags"
+ # We have CC64 so we're done.
+ break
+ fi
+ fi
+ else
+ # We have CC32, and we don't need a 64-bit compiler so we're done.
+ break
+ fi
+ fi
+done
+CC="$CC32"
+])dnl
+
+dnl GMP_PROG_CC_SELECT
+dnl Check that `CC' works with `CFLAGS'. Check if `CC' is a GNU compiler.
+dnl Cache the result as `ac_cv_prog_CC'.
+AC_DEFUN(GMP_PROG_CC_SELECT,
+[AC_BEFORE([$0], [CC_PROG_CPP])
+AC_PROG_CC_WORKS
+AC_PROG_CC_GNU
+
+if test "$ac_cv_prog_gcc" = "yes"; then
+ GCC=yes
+else
+ GCC=
+fi
+
+# Set CFLAGS if not already set.
+if test -z "$CFLAGS"; then
+ CFLAGS="-g"
+ if test "$GCC" = "yes"; then
+ CFLAGS="$CFLAGS -O2"
+ fi
+fi
+
+AC_SUBST(CC)
+AC_CACHE_VAL(ac_cv_prog_CC, ac_cv_prog_CC="$CC")
+AC_PROVIDE([AC_PROG_CC])
+])dnl
+
+dnl GMP_CHECK_CC_64BIT(cc, cflags64)
+dnl Find out if `CC' can produce 64-bit code.
+dnl Requires NM to be set to nm for target.
+dnl FIXME: Cache result.
+AC_DEFUN(GMP_CHECK_CC_64BIT,
+[
+ gmp_tmp_CC_save="$CC"
+ CC="[$1]"
+ AC_MSG_CHECKING([whether the C compiler ($CC) is 64-bit capable])
+ if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+ fi
+ gmp_tmp_CFLAGS_save="$CFLAGS"
+ CFLAGS="[$2]"
+
+ case "$target" in
+ hppa2.0*-*-*)
+ # FIXME: If gcc is installed under another name than "gcc", we will
+ # test the wrong thing.
+ if test "$CC" != "gcc"; then
+ dnl Let compiler version A.10.32.30 or higher be ok.
+ dnl Bad compiler output:
+ dnl ccom: HP92453-01 G.10.32.05 HP C Compiler
+ dnl Good compiler output:
+ dnl ccom: HP92453-01 A.10.32.30 HP C Compiler
+ echo >conftest.c
+ gmp_tmp_vs=`$CC $CFLAGS -V -c -o conftest.o conftest.c 2>&1 | grep "^ccom:"`
+ rm conftest*
+ gmp_tmp_v1=`echo $gmp_tmp_vs | sed 's/.* .\.\(.*\)\..*\..* HP C.*/\1/'`
+ gmp_tmp_v2=`echo $gmp_tmp_vs | sed 's/.* .\..*\.\(.*\)\..* HP C.*/\1/'`
+ gmp_tmp_v3=`echo $gmp_tmp_vs | sed 's/.* .\..*\..*\.\(.*\) HP C.*/\1/'`
+ gmp_cv_cc_64bit=no
+ test -n "$gmp_tmp_v1" && test "$gmp_tmp_v1" -ge "10" \
+ && test -n "$gmp_tmp_v2" && test "$gmp_tmp_v2" -ge "32" \
+ && test -n "$gmp_tmp_v3" && test "$gmp_tmp_v3" -ge "30" \
+ && gmp_cv_cc_64bit=yes
+ else # gcc
+ # FIXME: Compile a minimal file and determine if the resulting object
+ # file is an ELF file. If so, gcc can produce 64-bit code.
+ # Do we have file(1) for target?
+ gmp_cv_cc_64bit=no
+ fi
+ ;;
+ mips-sgi-irix6.*)
+ # We use `-n32' to cc and `-mabi=n32' to gcc, resulting in 64-bit
+ # arithmetic but not 64-bit pointers, so the general test for sizeof
+ # (void *) is not valid.
+ # Simply try to compile an empty main. If that succeeds return
+ # true.
+ AC_TRY_COMPILE( , ,
+ gmp_cv_cc_64bit=yes, gmp_cv_cc_64bit=no,
+ gmp_cv_cc_64bit=no)
+ ;;
+ *-*-*)
+ # Allocate an array of size sizeof (void *) and use nm to determine its
+ # size. We depend on the first declared variable being put at address 0.
+ cat >conftest.c <<EOF
+[char arr[sizeof (void *)]={0};
+char post=0;]
+EOF
+ gmp_compile="$CC $CFLAGS -c conftest.c 1>&AC_FD_CC"
+ if AC_TRY_EVAL(gmp_compile); then
+ changequote(<,>)dnl
+ gmp_tmp_val=`$NM conftest.o | grep post | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "8"; then
+ gmp_cv_cc_64bit=yes
+ else
+ gmp_cv_cc_64bit=no
+ fi
+ else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.$ac_ext >&AC_FD_CC
+ gmp_cv_cc_64bit=no
+ fi
+ rm -f conftest*
+ ;;
+ esac
+
+ CC="$gmp_tmp_CC_save"
+ CFLAGS="$gmp_tmp_CFLAGS_save"
+ AC_MSG_RESULT($gmp_cv_cc_64bit)
+])dnl
+
+dnl GMP_INIT([M4-DEF-FILE])
+dnl
+AC_DEFUN(GMP_INIT,
+[ifelse([$1], , gmp_configm4=config.m4, gmp_configm4="[$1]")
+gmp_tmpconfigm4=cnfm4.tmp
+gmp_tmpconfigm4i=cnfm4i.tmp
+gmp_tmpconfigm4p=cnfm4p.tmp
+test -f $gmp_tmpconfigm4 && rm $gmp_tmpconfigm4
+test -f $gmp_tmpconfigm4i && rm $gmp_tmpconfigm4i
+test -f $gmp_tmpconfigm4p && rm $gmp_tmpconfigm4p
+])dnl
+
+dnl GMP_FINISH
+dnl ----------
+dnl Create config.m4 from its accumulated parts.
+dnl
+dnl __CONFIG_M4_INCLUDED__ is used so that a second or subsequent include
+dnl of config.m4 is harmless.
+dnl
+dnl A separate ifdef on the angle bracket quoted part ensures the quoting
+dnl style there is respected. The basic defines from gmp_tmpconfigm4 are
+dnl fully quoted but are still put under an ifdef in case any have been
+dnl redefined by one of the m4 include files.
+dnl
+dnl Doing a big ifdef within asm-defs.m4 and/or other macro files wouldn't
+dnl work, since it'd interpret parentheses and quotes in dnl comments, and
+dnl having a whole file as a macro argument would overflow the string space
+dnl on BSD m4.
+
+AC_DEFUN(GMP_FINISH,
+[AC_REQUIRE([GMP_INIT])
+echo "creating $gmp_configm4"
+echo ["dnl $gmp_configm4. Generated automatically by configure."] > $gmp_configm4
+if test -f $gmp_tmpconfigm4; then
+ echo ["changequote(<,>)dnl"] >> $gmp_configm4
+ echo ["ifdef(<__CONFIG_M4_INCLUDED__>,,<"] >> $gmp_configm4
+ cat $gmp_tmpconfigm4 >> $gmp_configm4
+ echo [">)"] >> $gmp_configm4
+ echo ["changequote(\`,')dnl"] >> $gmp_configm4
+ rm $gmp_tmpconfigm4
+fi
+echo ["ifdef(\`__CONFIG_M4_INCLUDED__',,\`"] >> $gmp_configm4
+if test -f $gmp_tmpconfigm4i; then
+ cat $gmp_tmpconfigm4i >> $gmp_configm4
+ rm $gmp_tmpconfigm4i
+fi
+if test -f $gmp_tmpconfigm4p; then
+ cat $gmp_tmpconfigm4p >> $gmp_configm4
+ rm $gmp_tmpconfigm4p
+fi
+echo ["')"] >> $gmp_configm4
+echo ["define(\`__CONFIG_M4_INCLUDED__')"] >> $gmp_configm4
+])dnl
+
+dnl GMP_INCLUDE(FILE)
+AC_DEFUN(GMP_INCLUDE,
+[AC_REQUIRE([GMP_INIT])
+echo ["include(\`$1')"] >> $gmp_tmpconfigm4i
+])dnl
+
+dnl GMP_SINCLUDE(FILE)
+AC_DEFUN(GMP_SINCLUDE,
+[AC_REQUIRE([GMP_INIT])
+echo ["sinclude(\`$1')"] >> $gmp_tmpconfigm4i
+])dnl
+
+dnl GMP_DEFINE(MACRO, DEFINITION [, LOCATION])
+dnl [ Define M4 macro MACRO as DEFINITION in temporary file. ]
+dnl [ If LOCATION is `POST', the definition will appear after any ]
+dnl [ include() directives inserted by GMP_INCLUDE/GMP_SINCLUDE. ]
+dnl [ Mind the quoting! No shell variables will get expanded. ]
+dnl [ Don't forget to invoke GMP_FINISH to create file config.m4. ]
+dnl [ config.m4 uses `<' and '>' as quote characters for all defines. ]
+AC_DEFUN(GMP_DEFINE,
+[AC_REQUIRE([GMP_INIT])
+echo ['define(<$1>, <$2>)'] >> ifelse([$3], [POST], $gmp_tmpconfigm4p, $gmp_tmpconfigm4)
+])dnl
+
+dnl GMP_DEFINE_RAW(STRING, [, LOCATION])
+dnl [ Put STRING in temporary file. ]
+dnl [ If LOCATION is `POST', the definition will appear after any ]
+dnl [ include() directives inserted by GMP_INCLUDE/GMP_SINCLUDE. ]
+dnl [ Don't forget to invoke GMP_FINISH to create file config.m4. ]
+AC_DEFUN(GMP_DEFINE_RAW,
+[AC_REQUIRE([GMP_INIT])
+echo [$1] >> ifelse([$2], [POST], $gmp_tmpconfigm4p, $gmp_tmpconfigm4)
+])dnl
+
+dnl GMP_CHECK_ASM_LABEL_SUFFIX
+dnl Should a label have a colon or not?
+AC_DEFUN(GMP_CHECK_ASM_LABEL_SUFFIX,
+[AC_CACHE_CHECK([what assembly label suffix to use],
+ gmp_cv_check_asm_label_suffix,
+[case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_label_suffix=[""] ;;
+ *) gmp_cv_check_asm_label_suffix=[":"] ;;
+esac
+])
+echo ["define(<LABEL_SUFFIX>, <\$][1$gmp_cv_check_asm_label_suffix>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_UNDERSCORE([ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]])
+dnl Shamelessly borrowed from glibc.
+AC_DEFUN(GMP_CHECK_ASM_UNDERSCORE,
+[AC_CACHE_CHECK([if symbols are prefixed by underscore],
+ gmp_cv_check_asm_underscore,
+[cat > conftest.$ac_ext <<EOF
+dnl This sometimes fails to find confdefs.h, for some reason.
+dnl [#]line __oline__ "[$]0"
+[#]line __oline__ "configure"
+#include "confdefs.h"
+int underscore_test() {
+return; }
+EOF
+if AC_TRY_EVAL(ac_compile); then
+ if grep _underscore_test conftest* >/dev/null; then
+ gmp_cv_check_asm_underscore=yes
+ else
+ gmp_cv_check_asm_underscore=no
+ fi
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.$ac_ext >&AC_FD_CC
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_underscore" = "yes"; then
+ GMP_DEFINE(GSYM_PREFIX, [_])
+ ifelse([$1], , :, [$1])
+else
+ GMP_DEFINE(GSYM_PREFIX, [])
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_CHECK_ASM_ALIGN_LOG([ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]])
+dnl Is parameter to `.align' logarithmic?
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_ALIGN_LOG,
+[AC_REQUIRE([GMP_CHECK_ASM_GLOBL])
+AC_REQUIRE([GMP_CHECK_ASM_DATA])
+AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([if .align assembly directive is logarithmic],
+ gmp_cv_check_asm_align_log,
+[if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+fi
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ .align 4
+ $gmp_cv_check_asm_globl foo
+ .byte 1
+ .align 4
+foo$gmp_cv_check_asm_label_suffix
+ .byte 2
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ changequote(<,>)
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "10" || test "$gmp_tmp_val" = "16"; then
+ gmp_cv_check_asm_align_log=yes
+ else
+ gmp_cv_check_asm_align_log=no
+ fi
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.s >&AC_FD_CC
+fi
+rm -f conftest*
+])
+GMP_DEFINE_RAW(["define(<ALIGN_LOGARITHMIC>,<$gmp_cv_check_asm_align_log>)"])
+if test "$gmp_cv_check_asm_align_log" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+
+dnl GMP_CHECK_ASM_ALIGN_FILL_0x90
+dnl -----------------------------
+dnl Determine whether a ",0x90" suffix works on a .align directive.
+dnl This is only meant for use on x86, where 0x90 is a "nop".
+dnl
+dnl Old gas, eg. 1.92.3 - needs ",0x90" or else the fill is an invalid 0x00.
+dnl New gas, eg. 2.91 - generates the good multibyte nop fills even when
+dnl ",0x90" is given.
+dnl Solaris 2.6 as - doesn't allow ",0x90", gives a fatal error.
+dnl Solaris 2.8 as - gives a warning for ",0x90", no ill effect.
+dnl
+dnl Note that both solaris "as"s only care about ",0x90" if they actually
+dnl have to use it to fill something, hence the .byte in the sample. It's
+dnl only the second .align that provokes an error or warning.
+dnl
+dnl We prefer to suppress the warning from solaris 2.8 to stop anyone
+dnl worrying something might be wrong.
+
+AC_DEFUN(GMP_CHECK_ASM_ALIGN_FILL_0x90,
+[AC_CACHE_CHECK([if the .align directive accepts an 0x90 fill in .text],
+ gmp_cv_check_asm_align_fill_0x90,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ .align 4, 0x90
+ .byte 0
+ .align 4, 0x90
+EOF
+gmp_tmp_val="`$CCAS $CFLAGS conftest.s 2>&1`"
+if test $? = 0; then
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if echo "$gmp_tmp_val" | grep "Warning: Fill parameter ignored for executable section"; then
+    echo "Suppressing this warning by omitting 0x90" 1>&AC_FD_CC
+ gmp_cv_check_asm_align_fill_0x90=no
+ else
+ gmp_cv_check_asm_align_fill_0x90=yes
+ fi
+else
+ echo "Non-zero exit code" 1>&AC_FD_CC
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ gmp_cv_check_asm_align_fill_0x90=no
+fi
+rm -f conftest*
+])
+GMP_DEFINE_RAW(
+["define(<ALIGN_FILL_0x90>,<$gmp_cv_check_asm_align_fill_0x90>)"])
+])
+
+
+dnl GMP_CHECK_ASM_TEXT
+AC_DEFUN(GMP_CHECK_ASM_TEXT,
+[AC_CACHE_CHECK([how to switch to text section], gmp_cv_check_asm_text,
+[case "$target" in
+ *-*-aix*)
+ changequote({, })
+ gmp_cv_check_asm_text={".csect .text[PR]"}
+ changequote([, ])
+ ;;
+ *-*-hpux*) gmp_cv_check_asm_text=[".code"] ;;
+ *) gmp_cv_check_asm_text=[".text"] ;;
+esac
+])
+echo ["define(<TEXT>, <$gmp_cv_check_asm_text>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_DATA
+dnl Can we say `.data'?
+AC_DEFUN(GMP_CHECK_ASM_DATA,
+[AC_CACHE_CHECK([how to switch to data section], gmp_cv_check_asm_data,
+[case "$target" in
+ *-*-aix*)
+ changequote({, })
+ gmp_cv_check_asm_data={".csect .data[RW]"}
+ changequote([, ])
+ ;;
+ *) gmp_cv_check_asm_data=[".data"] ;;
+esac
+])
+echo ["define(<DATA>, <$gmp_cv_check_asm_data>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_GLOBL
+dnl Can we say `.global'?
+AC_DEFUN(GMP_CHECK_ASM_GLOBL,
+[AC_CACHE_CHECK([how to export a symbol], gmp_cv_check_asm_globl,
+[case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_globl=[".export"] ;;
+ *) gmp_cv_check_asm_globl=[".globl"] ;;
+esac
+])
+echo ["define(<GLOBL>, <$gmp_cv_check_asm_globl>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_TYPE
+dnl Can we say `.type'?
+AC_DEFUN(GMP_CHECK_ASM_TYPE,
+[AC_CACHE_CHECK([how the .type assembly directive should be used],
+gmp_cv_check_asm_type,
+[ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+for gmp_tmp_prefix in @ \# %; do
+ echo " .type sym,${gmp_tmp_prefix}function" > conftest.s
+ if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_type="[.type \$][1,${gmp_tmp_prefix}\$][2]"
+ break
+ fi
+done
+if test -z "$gmp_cv_check_asm_type"; then
+ gmp_cv_check_asm_type="[dnl]"
+fi
+])
+echo ["define(<TYPE>, <$gmp_cv_check_asm_type>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_SIZE
+dnl Can we say `.size'?
+AC_DEFUN(GMP_CHECK_ASM_SIZE,
+[AC_CACHE_CHECK([if the .size assembly directive works], gmp_cv_check_asm_size,
+[ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+echo ' .size sym,1' > conftest.s
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_size="[.size \$][1,\$][2]"
+else
+ gmp_cv_check_asm_size="[dnl]"
+fi
+])
+echo ["define(<SIZE>, <$gmp_cv_check_asm_size>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_LSYM_PREFIX
+dnl What is the prefix for a local label?
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_LSYM_PREFIX,
+[AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([what prefix to use for a local label],
+gmp_cv_check_asm_lsym_prefix,
+[if test -z "$NM"; then
+ echo; echo ["$0: fatal: need nm"]
+ exit 1
+fi
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+gmp_cv_check_asm_lsym_prefix="L"
+for gmp_tmp_pre in L .L $ L$; do
+ cat > conftest.s <<EOF
+dummy${gmp_cv_check_asm_label_suffix}
+${gmp_tmp_pre}gurkmacka${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if AC_TRY_EVAL(ac_assemble); then
+ $NM conftest.o >/dev/null 2>&1
+ gmp_rc=$?
+ if test "$gmp_rc" != "0"; then
+ echo "configure: $NM failure, using default"
+ break
+ fi
+ if $NM conftest.o | grep gurkmacka >/dev/null; then true; else
+ gmp_cv_check_asm_lsym_prefix="$gmp_tmp_pre"
+ break
+ fi
+ else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.s >&AC_FD_CC
+ # Use default.
+ fi
+done
+rm -f conftest*
+])
+echo ["define(<LSYM_PREFIX>, <${gmp_cv_check_asm_lsym_prefix}>)"] >> $gmp_tmpconfigm4
+])
+
+dnl GMP_CHECK_ASM_W32
+dnl How to [define] a 32-bit word.
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_W32,
+[AC_REQUIRE([GMP_CHECK_ASM_DATA])
+AC_REQUIRE([GMP_CHECK_ASM_GLOBL])
+AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([how to [define] a 32-bit word],
+ gmp_cv_check_asm_w32,
+[if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+fi
+
+# FIXME: HPUX puts first symbol at 0x40000000, breaking our assumption
+# that it's at 0x0. We'll have to declare another symbol before the
+# .long/.word and look at the distance between the two symbols. The
+# only problem is that the sed expression(s) barfs (on Solaris, for
+# example) for the symbol with value 0. For now, HPUX uses .word.
+
+case "$target" in
+ *-*-hpux*)
+ gmp_cv_check_asm_w32=".word"
+ ;;
+ *-*-*)
+ ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+ for gmp_tmp_op in .long .word; do
+ cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ $gmp_cv_check_asm_globl foo
+ $gmp_tmp_op 0
+foo${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if AC_TRY_EVAL(ac_assemble); then
+ changequote(<,>)
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "4"; then
+ gmp_cv_check_asm_w32="$gmp_tmp_op"
+ break
+ fi
+ fi
+ done
+ ;;
+esac
+
+if test -z "$gmp_cv_check_asm_w32"; then
+ echo; echo ["configure: $0: fatal: do not know how to define a 32-bit word"]
+ exit 1
+fi
+rm -f conftest*
+])
+echo ["define(<W32>, <$gmp_cv_check_asm_w32>)"] >> $gmp_tmpconfigm4
+])
+
+dnl GMP_CHECK_ASM_MMX([ACTION-IF-FOUND, [ACTION-IF-NOT-FOUND]])
+dnl Can we assemble MMX insns?
+AC_DEFUN(GMP_CHECK_ASM_MMX,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+AC_CACHE_CHECK([if the assembler knows about MMX instructions],
+ gmp_cv_check_asm_mmx,
+[cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ por %mm0, %mm0
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_mmx=yes
+else
+ gmp_cv_check_asm_mmx=no
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_mmx" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ AC_MSG_WARN([+----------------------------------------------------------])
+ AC_MSG_WARN([| WARNING WARNING WARNING])
+ AC_MSG_WARN([| Target CPU has MMX code, but it can't be assembled by])
+ AC_MSG_WARN([| $CCAS $CFLAGS])
+ AC_MSG_WARN([| Non-MMX replacements will be used.])
+ AC_MSG_WARN([| This will be an inferior build.])
+ AC_MSG_WARN([+----------------------------------------------------------])
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_CHECK_ASM_SHLDL_CL([ACTION-IF-FOUND, [ACTION-IF-NOT-FOUND]])
+AC_DEFUN(GMP_CHECK_ASM_SHLDL_CL,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+AC_CACHE_CHECK([if the assembler takes cl with shldl],
+ gmp_cv_check_asm_shldl_cl,
+[cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ shldl %cl, %eax, %ebx
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_shldl_cl=yes
+else
+ gmp_cv_check_asm_shldl_cl=no
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_shldl_cl" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_PROG_CC_WORKS(CC, CFLAGS, ACTION-IF-WORKS, [ACTION-IF-NOT-WORKS])
+dnl Check if CC can compile and link. Perform various target specific tests.
+dnl FIXME: Require `$target'.
+AC_DEFUN(GMP_PROG_CC_WORKS,
+[AC_LANG_C dnl Note: Destructive.
+CC="[$1]"
+CFLAGS="[$2]"
+AC_MSG_CHECKING([if the C compiler ($CC) works with flags $CFLAGS])
+
+# Simple test for all targets.
+AC_TRY_COMPILER([int main(){return(0);}],
+ tmp_works, tmp_cross)
+
+# Target specific tests.
+if test "$tmp_works" = "yes"; then
+ case "$target" in
+ *-*-aix*) # Returning a funcptr.
+ AC_TRY_COMPILE( , [} void *g(); void *f() { return g(); } int bar(){],
+ tmp_works=yes, tmp_works=no)
+ ;;
+ esac
+fi
+
+if test "$tmp_works" = "yes"; then
+ [$3]
+else
+ ifelse([$4], , :, [$4])
+fi
+
+AC_MSG_RESULT($tmp_works)
+])dnl
+
+
+dnl GMP_C_ANSI2KNR
+dnl --------------
+dnl Setup to use ansi2knr if necessary.
+dnl
+dnl The test here is simply that if an ANSI style function works then
+dnl ansi2knr isn't needed. The normal tests for whether $CC works mean we
+dnl don't need to worry here about anything badly broken.
+dnl
+dnl AM_C_PROTOTYPES is the normal way to set up ansi2knr, but (in automake
+dnl March 2000) it gives the wrong answer on a C++ compiler because its
+dnl test requires that the compiler accept both ANSI and K&R, or otherwise
+dnl ansi2knr is used. A C++ compiler fails on the K&R part, which makes
+dnl AM_C_PROTOTYPES think it needs ansi2knr! GMP has no bare K&R so we
+dnl only need ANSI or K&R to work, not both.
+
+AC_DEFUN(GMP_C_ANSI2KNR,
+[AC_CACHE_CHECK([if ansi2knr should be used],
+ gmp_cv_c_ansi2knr,
+[cat >conftest.c <<EOF
+int main (int argc, char *argv[]) { return 0; }
+EOF
+if AC_TRY_EVAL(ac_compile); then
+ gmp_cv_c_ansi2knr=no
+else
+ gmp_cv_c_ansi2knr=yes
+fi
+rm -f conftest.*
+])
+if test $gmp_cv_c_ansi2knr = no; then
+ U= ANSI2KNR=
+else
+ U=_ ANSI2KNR=./ansi2knr
+ # Ensure some checks needed by ansi2knr itself.
+ AC_HEADER_STDC
+ AC_CHECK_HEADERS(string.h)
+fi
+AC_SUBST(U)
+AC_SUBST(ANSI2KNR)
+])
+
+
+dnl Deal with bad synchronization of Autoconf with Libtool.
+AC_DEFUN(AC_CANONICAL_BUILD, [_AC_CANONICAL_BUILD])
+AC_DEFUN(AC_CHECK_TOOL_PREFIX, [_AC_CHECK_TOOL_PREFIX])
diff --git a/rts/gmp/aclocal.m4 b/rts/gmp/aclocal.m4
new file mode 100644
index 0000000000..086c77915c
--- /dev/null
+++ b/rts/gmp/aclocal.m4
@@ -0,0 +1,1963 @@
+dnl aclocal.m4 generated automatically by aclocal 1.4a
+
+dnl Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc.
+dnl This file is free software; the Free Software Foundation
+dnl gives unlimited permission to copy and/or distribute it,
+dnl with or without modifications, as long as this notice is preserved.
+
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+dnl even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+dnl PARTICULAR PURPOSE.
+
+dnl GMP specific autoconf macros
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl GMP_HEADER_GETVAL(NAME,FILE)
+dnl ----------------------------
+dnl Expand to the value of a "#define NAME" from the given FILE.
+dnl The regexps here aren't very rugged, but are enough for gmp.
+dnl /dev/null as a parameter prevents a hang if $2 is accidentally omitted.
+
+define(GMP_HEADER_GETVAL,
+[patsubst(patsubst(
+esyscmd([grep "^#define $1 " $2 /dev/null 2>/dev/null]),
+[^.*$1[ ]+],[]),
+[[
+ ]*$],[])])
+
+
+dnl GMP_VERSION
+dnl -----------
+dnl The gmp version number, extracted from the #defines in gmp.h.
+dnl Two digits like 3.0 if patchlevel <= 0, or three digits like 3.0.1 if
+dnl patchlevel > 0.
+
+define(GMP_VERSION,
+[GMP_HEADER_GETVAL(__GNU_MP_VERSION,gmp.h)[]dnl
+.GMP_HEADER_GETVAL(__GNU_MP_VERSION_MINOR,gmp.h)[]dnl
+ifelse(m4_eval(GMP_HEADER_GETVAL(__GNU_MP_VERSION_PATCHLEVEL,gmp.h) > 0),1,
+[.GMP_HEADER_GETVAL(__GNU_MP_VERSION_PATCHLEVEL,gmp.h)])])
+
+
+dnl GMP_PROG_M4()
+dnl -------------
+dnl
+dnl Find a working m4, either in $PATH or likely locations, and setup $M4
+dnl and an AC_SUBST accordingly. If $M4 is already set then it's a user
+dnl choice and is accepted with no checks. GMP_PROG_M4 is like
+dnl AC_PATH_PROG or AC_CHECK_PROG, but it tests each m4 found to see if
+dnl it's good enough.
+dnl
+dnl See mpn/asm-defs.m4 for details on the known bad m4s.
+
+AC_DEFUN(GMP_PROG_M4,
+[AC_CACHE_CHECK([for suitable m4],
+ gmp_cv_prog_m4,
+[if test -n "$M4"; then
+ gmp_cv_prog_m4="$M4"
+else
+ cat >conftest.m4 <<\EOF
+dnl must protect this against being expanded during autoconf m4!
+[define(dollarhash,``$][#'')dnl
+ifelse(dollarhash(x),1,`define(t1,Y)',
+``bad: $][# not supported (SunOS /usr/bin/m4)
+'')dnl
+ifelse(eval(89),89,`define(t2,Y)',
+`bad: eval() doesnt support 8 or 9 in a constant (OpenBSD 2.6 m4)
+')dnl
+ifelse(t1`'t2,YY,`good
+')dnl]
+EOF
+ echo "trying m4" 1>&AC_FD_CC
+ gmp_tmp_val="`(m4 conftest.m4) 2>&AC_FD_CC`"
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="m4"
+ else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word. This closes a longstanding sh security hole.
+ ac_dummy="$PATH:/usr/5bin"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ echo "trying $ac_dir/m4" 1>&AC_FD_CC
+ gmp_tmp_val="`($ac_dir/m4 conftest.m4) 2>&AC_FD_CC`"
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="$ac_dir/m4"
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ if test -z "$gmp_cv_prog_m4"; then
+ AC_MSG_ERROR([No usable m4 in \$PATH or /usr/5bin (see config.log for reasons).])
+ fi
+ fi
+ rm -f conftest.m4
+fi])
+M4="$gmp_cv_prog_m4"
+AC_SUBST(M4)
+])
+
+
+dnl GMP_PROG_CC_FIND([CC_LIST], [REQ_64BIT_CC])
+dnl Find first working compiler in CC_LIST.
+dnl If REQ_64BIT_CC is "yes", the compiler is required to be able to
+dnl produce 64-bit code.
+dnl NOTE: If a compiler needs any special flags for producing 64-bit code,
+dnl these have to be found in shell variable `gmp_cflags64_{cc}', where `{cc}'
+dnl is the name of the compiler.
+dnl Set CC to the name of the first working compiler.
+dnl If a 64-bit compiler is found, set CC64 to the name of the compiler and
+dnl CFLAGS64 to flags to use.
+dnl This macro does not test if any of the compilers found is a GNU compiler.
+dnl To do this, when you have finally made up your mind on which one to use,
+dnl and set CC accordingly, invoke [GMP_PROG_CC_SELECT]. That macro will
+dnl also make sure that your selection of CFLAGS is valid.
+dnl
+AC_DEFUN(GMP_PROG_CC_FIND,
+[AC_BEFORE([$0], [CC_PROG_CPP])
+ifelse([$1], , gmp_cc_list="gcc cc", gmp_cc_list="[$1]")
+ifelse([$2], , gmp_req_64bit_cc="no", gmp_req_64bit_cc="[$2]")
+
+CC32=
+CC64=
+for c in $gmp_cc_list; do
+ # Avoid cache hits.
+ unset CC
+ unset ac_cv_prog_CC
+ AC_CHECK_TOOL(CC, $c, $c)
+ if test -n "$CC"; then
+ eval c_flags=\$gmp_cflags_$c
+ GMP_PROG_CC_WORKS($CC, $c_flags,
+ gmp_prog_cc_works=yes,
+ gmp_prog_cc_works=no)
+
+ if test "$gmp_prog_cc_works" != "yes"; then
+ continue
+ fi
+
+ # Save first working compiler, whether 32- or 64-bit capable.
+ if test -z "$CC32"; then
+ CC32="$CC"
+ fi
+ if test "$gmp_req_64bit_cc" = "yes"; then
+ eval c_flags=\$gmp_cflags64_$c
+
+ # Verify that the compiler works in 64-bit mode as well.
+ # /usr/ucb/cc on Solaris 7 can *compile* in 64-bit mode, but not link.
+ GMP_PROG_CC_WORKS($c, $c_flags,
+ gmp_prog_cc_works=yes,
+ gmp_prog_cc_works=no)
+
+ if test "$gmp_prog_cc_works" = "yes"; then
+ GMP_CHECK_CC_64BIT($c, $c_flags)
+ if test "$gmp_cv_cc_64bit" = "yes"; then
+ test -z "$CC64" && CC64="$c"
+ test -z "$CFLAGS64" && CFLAGS64="$c_flags"
+ # We have CC64 so we're done.
+ break
+ fi
+ fi
+ else
+ # We have CC32, and we don't need a 64-bit compiler so we're done.
+ break
+ fi
+ fi
+done
+CC="$CC32"
+])dnl
+
+dnl GMP_PROG_CC_SELECT
+dnl Check that `CC' works with `CFLAGS'. Check if `CC' is a GNU compiler.
+dnl Cache the result as `ac_cv_prog_CC'.
+AC_DEFUN(GMP_PROG_CC_SELECT,
+[AC_BEFORE([$0], [CC_PROG_CPP])
+AC_PROG_CC_WORKS
+AC_PROG_CC_GNU
+
+if test "$ac_cv_prog_gcc" = "yes"; then
+ GCC=yes
+else
+ GCC=
+fi
+
+# Set CFLAGS if not already set.
+if test -z "$CFLAGS"; then
+ CFLAGS="-g"
+ if test "$GCC" = "yes"; then
+ CFLAGS="$CFLAGS -O2"
+ fi
+fi
+
+AC_SUBST(CC)
+AC_CACHE_VAL(ac_cv_prog_CC, ac_cv_prog_CC="$CC")
+AC_PROVIDE([AC_PROG_CC])
+])dnl
+
+dnl GMP_CHECK_CC_64BIT(cc, cflags64)
+dnl Find out if `CC' can produce 64-bit code.
+dnl Requires NM to be set to nm for target.
+dnl FIXME: Cache result.
+AC_DEFUN(GMP_CHECK_CC_64BIT,
+[
+ gmp_tmp_CC_save="$CC"
+ CC="[$1]"
+ AC_MSG_CHECKING([whether the C compiler ($CC) is 64-bit capable])
+ if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+ fi
+ gmp_tmp_CFLAGS_save="$CFLAGS"
+ CFLAGS="[$2]"
+
+ case "$target" in
+ hppa2.0*-*-*)
+ # FIXME: If gcc is installed under another name than "gcc", we will
+ # test the wrong thing.
+ if test "$CC" != "gcc"; then
+ dnl Let compiler version A.10.32.30 or higher be ok.
+ dnl Bad compiler output:
+ dnl ccom: HP92453-01 G.10.32.05 HP C Compiler
+ dnl Good compiler output:
+ dnl ccom: HP92453-01 A.10.32.30 HP C Compiler
+ echo >conftest.c
+ gmp_tmp_vs=`$CC $CFLAGS -V -c -o conftest.o conftest.c 2>&1 | grep "^ccom:"`
+ rm conftest*
+ gmp_tmp_v1=`echo $gmp_tmp_vs | sed 's/.* .\.\(.*\)\..*\..* HP C.*/\1/'`
+ gmp_tmp_v2=`echo $gmp_tmp_vs | sed 's/.* .\..*\.\(.*\)\..* HP C.*/\1/'`
+ gmp_tmp_v3=`echo $gmp_tmp_vs | sed 's/.* .\..*\..*\.\(.*\) HP C.*/\1/'`
+ gmp_cv_cc_64bit=no
+ test -n "$gmp_tmp_v1" && test "$gmp_tmp_v1" -ge "10" \
+ && test -n "$gmp_tmp_v2" && test "$gmp_tmp_v2" -ge "32" \
+ && test -n "$gmp_tmp_v3" && test "$gmp_tmp_v3" -ge "30" \
+ && gmp_cv_cc_64bit=yes
+ else # gcc
+ # FIXME: Compile a minimal file and determine if the resulting object
+ # file is an ELF file. If so, gcc can produce 64-bit code.
+ # Do we have file(1) for target?
+ gmp_cv_cc_64bit=no
+ fi
+ ;;
+ mips-sgi-irix6.*)
+ # We use `-n32' to cc and `-mabi=n32' to gcc, resulting in 64-bit
+ # arithmetic but not 64-bit pointers, so the general test for sizeof
+ # (void *) is not valid.
+ # Simply try to compile an empty main. If that succeeds return
+ # true.
+ AC_TRY_COMPILE( , ,
+ gmp_cv_cc_64bit=yes, gmp_cv_cc_64bit=no,
+ gmp_cv_cc_64bit=no)
+ ;;
+ *-*-*)
+ # Allocate an array of size sizeof (void *) and use nm to determine its
+ # size. We depend on the first declared variable being put at address 0.
+ cat >conftest.c <<EOF
+[char arr[sizeof (void *)]={0};
+char post=0;]
+EOF
+ gmp_compile="$CC $CFLAGS -c conftest.c 1>&AC_FD_CC"
+ if AC_TRY_EVAL(gmp_compile); then
+ changequote(<,>)dnl
+ gmp_tmp_val=`$NM conftest.o | grep post | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "8"; then
+ gmp_cv_cc_64bit=yes
+ else
+ gmp_cv_cc_64bit=no
+ fi
+ else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.$ac_ext >&AC_FD_CC
+ gmp_cv_cc_64bit=no
+ fi
+ rm -f conftest*
+ ;;
+ esac
+
+ CC="$gmp_tmp_CC_save"
+ CFLAGS="$gmp_tmp_CFLAGS_save"
+ AC_MSG_RESULT($gmp_cv_cc_64bit)
+])dnl
+
+dnl GMP_INIT([M4-DEF-FILE])
+dnl
+AC_DEFUN(GMP_INIT,
+[ifelse([$1], , gmp_configm4=config.m4, gmp_configm4="[$1]")
+gmp_tmpconfigm4=cnfm4.tmp
+gmp_tmpconfigm4i=cnfm4i.tmp
+gmp_tmpconfigm4p=cnfm4p.tmp
+test -f $gmp_tmpconfigm4 && rm $gmp_tmpconfigm4
+test -f $gmp_tmpconfigm4i && rm $gmp_tmpconfigm4i
+test -f $gmp_tmpconfigm4p && rm $gmp_tmpconfigm4p
+])dnl
+
+dnl GMP_FINISH
+dnl ----------
+dnl Create config.m4 from its accumulated parts.
+dnl
+dnl __CONFIG_M4_INCLUDED__ is used so that a second or subsequent include
+dnl of config.m4 is harmless.
+dnl
+dnl A separate ifdef on the angle bracket quoted part ensures the quoting
+dnl style there is respected. The basic defines from gmp_tmpconfigm4 are
+dnl fully quoted but are still put under an ifdef in case any have been
+dnl redefined by one of the m4 include files.
+dnl
+dnl Doing a big ifdef within asm-defs.m4 and/or other macro files wouldn't
+dnl work, since it'd interpret parentheses and quotes in dnl comments, and
+dnl having a whole file as a macro argument would overflow the string space
+dnl on BSD m4.
+
+AC_DEFUN(GMP_FINISH,
+[AC_REQUIRE([GMP_INIT])
+echo "creating $gmp_configm4"
+echo ["dnl $gmp_configm4. Generated automatically by configure."] > $gmp_configm4
+if test -f $gmp_tmpconfigm4; then
+ echo ["changequote(<,>)dnl"] >> $gmp_configm4
+ echo ["ifdef(<__CONFIG_M4_INCLUDED__>,,<"] >> $gmp_configm4
+ cat $gmp_tmpconfigm4 >> $gmp_configm4
+ echo [">)"] >> $gmp_configm4
+ echo ["changequote(\`,')dnl"] >> $gmp_configm4
+ rm $gmp_tmpconfigm4
+fi
+echo ["ifdef(\`__CONFIG_M4_INCLUDED__',,\`"] >> $gmp_configm4
+if test -f $gmp_tmpconfigm4i; then
+ cat $gmp_tmpconfigm4i >> $gmp_configm4
+ rm $gmp_tmpconfigm4i
+fi
+if test -f $gmp_tmpconfigm4p; then
+ cat $gmp_tmpconfigm4p >> $gmp_configm4
+ rm $gmp_tmpconfigm4p
+fi
+echo ["')"] >> $gmp_configm4
+echo ["define(\`__CONFIG_M4_INCLUDED__')"] >> $gmp_configm4
+])dnl
+
+dnl GMP_INCLUDE(FILE)
+AC_DEFUN(GMP_INCLUDE,
+[AC_REQUIRE([GMP_INIT])
+echo ["include(\`$1')"] >> $gmp_tmpconfigm4i
+])dnl
+
+dnl GMP_SINCLUDE(FILE)
+AC_DEFUN(GMP_SINCLUDE,
+[AC_REQUIRE([GMP_INIT])
+echo ["sinclude(\`$1')"] >> $gmp_tmpconfigm4i
+])dnl
+
+dnl GMP_DEFINE(MACRO, DEFINITION [, LOCATION])
+dnl [ Define M4 macro MACRO as DEFINITION in temporary file. ]
+dnl [ If LOCATION is `POST', the definition will appear after any ]
+dnl [ include() directives inserted by GMP_INCLUDE/GMP_SINCLUDE. ]
+dnl [ Mind the quoting! No shell variables will get expanded. ]
+dnl [ Don't forget to invoke GMP_FINISH to create file config.m4. ]
+dnl [ config.m4 uses `<' and '>' as quote characters for all defines. ]
+AC_DEFUN(GMP_DEFINE,
+[AC_REQUIRE([GMP_INIT])
+echo ['define(<$1>, <$2>)'] >> ifelse([$3], [POST], $gmp_tmpconfigm4p, $gmp_tmpconfigm4)
+])dnl
+
+dnl GMP_DEFINE_RAW(STRING, [, LOCATION])
+dnl [ Put STRING in temporary file. ]
+dnl [ If LOCATION is `POST', the definition will appear after any ]
+dnl [ include() directives inserted by GMP_INCLUDE/GMP_SINCLUDE. ]
+dnl [ Don't forget to invoke GMP_FINISH to create file config.m4. ]
+AC_DEFUN(GMP_DEFINE_RAW,
+[AC_REQUIRE([GMP_INIT])
+echo [$1] >> ifelse([$2], [POST], $gmp_tmpconfigm4p, $gmp_tmpconfigm4)
+])dnl
+
+dnl GMP_CHECK_ASM_LABEL_SUFFIX
+dnl Should a label have a colon or not?
+AC_DEFUN(GMP_CHECK_ASM_LABEL_SUFFIX,
+[AC_CACHE_CHECK([what assembly label suffix to use],
+ gmp_cv_check_asm_label_suffix,
+[case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_label_suffix=[""] ;;
+ *) gmp_cv_check_asm_label_suffix=[":"] ;;
+esac
+])
+echo ["define(<LABEL_SUFFIX>, <\$][1$gmp_cv_check_asm_label_suffix>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_UNDERSCORE([ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]])
+dnl Shamelessly borrowed from glibc.
+AC_DEFUN(GMP_CHECK_ASM_UNDERSCORE,
+[AC_CACHE_CHECK([if symbols are prefixed by underscore],
+ gmp_cv_check_asm_underscore,
+[cat > conftest.$ac_ext <<EOF
+dnl This sometimes fails to find confdefs.h, for some reason.
+dnl [#]line __oline__ "[$]0"
+[#]line __oline__ "configure"
+#include "confdefs.h"
+int underscore_test() {
+return; }
+EOF
+if AC_TRY_EVAL(ac_compile); then
+ if grep _underscore_test conftest* >/dev/null; then
+ gmp_cv_check_asm_underscore=yes
+ else
+ gmp_cv_check_asm_underscore=no
+ fi
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.$ac_ext >&AC_FD_CC
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_underscore" = "yes"; then
+ GMP_DEFINE(GSYM_PREFIX, [_])
+ ifelse([$1], , :, [$1])
+else
+ GMP_DEFINE(GSYM_PREFIX, [])
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_CHECK_ASM_ALIGN_LOG([ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]])
+dnl Is parameter to `.align' logarithmic?
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_ALIGN_LOG,
+[AC_REQUIRE([GMP_CHECK_ASM_GLOBL])
+AC_REQUIRE([GMP_CHECK_ASM_DATA])
+AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([if .align assembly directive is logarithmic],
+ gmp_cv_check_asm_align_log,
+[if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+fi
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ .align 4
+ $gmp_cv_check_asm_globl foo
+ .byte 1
+ .align 4
+foo$gmp_cv_check_asm_label_suffix
+ .byte 2
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ changequote(<,>)
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "10" || test "$gmp_tmp_val" = "16"; then
+ gmp_cv_check_asm_align_log=yes
+ else
+ gmp_cv_check_asm_align_log=no
+ fi
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.s >&AC_FD_CC
+fi
+rm -f conftest*
+])
+GMP_DEFINE_RAW(["define(<ALIGN_LOGARITHMIC>,<$gmp_cv_check_asm_align_log>)"])
+if test "$gmp_cv_check_asm_align_log" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+
+dnl GMP_CHECK_ASM_ALIGN_FILL_0x90
+dnl -----------------------------
+dnl Determine whether a ",0x90" suffix works on a .align directive.
+dnl This is only meant for use on x86, where 0x90 is a "nop".
+dnl
+dnl Old gas, eg. 1.92.3 - needs ",0x90" or else the fill is an invalid 0x00.
+dnl New gas, eg. 2.91 - generates the good multibyte nop fills even when
+dnl ",0x90" is given.
+dnl Solaris 2.6 as - doesn't allow ",0x90", gives a fatal error.
+dnl Solaris 2.8 as - gives a warning for ",0x90", no ill effect.
+dnl
+dnl Note that both solaris "as"s only care about ",0x90" if they actually
+dnl have to use it to fill something, hence the .byte in the sample. It's
+dnl only the second .align that provokes an error or warning.
+dnl
+dnl We prefer to suppress the warning from solaris 2.8 to stop anyone
+dnl worrying something might be wrong.
+
+AC_DEFUN(GMP_CHECK_ASM_ALIGN_FILL_0x90,
+[AC_CACHE_CHECK([if the .align directive accepts an 0x90 fill in .text],
+ gmp_cv_check_asm_align_fill_0x90,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ .align 4, 0x90
+ .byte 0
+ .align 4, 0x90
+EOF
+gmp_tmp_val="`$CCAS $CFLAGS conftest.s 2>&1`"
+if test $? = 0; then
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if echo "$gmp_tmp_val" | grep "Warning: Fill parameter ignored for executable section"; then
+ echo "Supressing this warning by omitting 0x90" 1>&AC_FD_CC
+ gmp_cv_check_asm_align_fill_0x90=no
+ else
+ gmp_cv_check_asm_align_fill_0x90=yes
+ fi
+else
+ echo "Non-zero exit code" 1>&AC_FD_CC
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ gmp_cv_check_asm_align_fill_0x90=no
+fi
+rm -f conftest*
+])
+GMP_DEFINE_RAW(
+["define(<ALIGN_FILL_0x90>,<$gmp_cv_check_asm_align_fill_0x90>)"])
+])
+
+
+dnl GMP_CHECK_ASM_TEXT
+AC_DEFUN(GMP_CHECK_ASM_TEXT,
+[AC_CACHE_CHECK([how to switch to text section], gmp_cv_check_asm_text,
+[case "$target" in
+ *-*-aix*)
+ changequote({, })
+ gmp_cv_check_asm_text={".csect .text[PR]"}
+ changequote([, ])
+ ;;
+ *-*-hpux*) gmp_cv_check_asm_text=[".code"] ;;
+ *) gmp_cv_check_asm_text=[".text"] ;;
+esac
+])
+echo ["define(<TEXT>, <$gmp_cv_check_asm_text>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_DATA
+dnl Can we say `.data'?
+AC_DEFUN(GMP_CHECK_ASM_DATA,
+[AC_CACHE_CHECK([how to switch to data section], gmp_cv_check_asm_data,
+[case "$target" in
+ *-*-aix*)
+ changequote({, })
+ gmp_cv_check_asm_data={".csect .data[RW]"}
+ changequote([, ])
+ ;;
+ *) gmp_cv_check_asm_data=[".data"] ;;
+esac
+])
+echo ["define(<DATA>, <$gmp_cv_check_asm_data>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_GLOBL
+dnl Can we say `.global'?
+AC_DEFUN(GMP_CHECK_ASM_GLOBL,
+[AC_CACHE_CHECK([how to export a symbol], gmp_cv_check_asm_globl,
+[case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_globl=[".export"] ;;
+ *) gmp_cv_check_asm_globl=[".globl"] ;;
+esac
+])
+echo ["define(<GLOBL>, <$gmp_cv_check_asm_globl>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_TYPE
+dnl Can we say `.type'?
+AC_DEFUN(GMP_CHECK_ASM_TYPE,
+[AC_CACHE_CHECK([how the .type assembly directive should be used],
+gmp_cv_check_asm_type,
+[ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+for gmp_tmp_prefix in @ \# %; do
+ echo " .type sym,${gmp_tmp_prefix}function" > conftest.s
+ if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_type="[.type \$][1,${gmp_tmp_prefix}\$][2]"
+ break
+ fi
+done
+if test -z "$gmp_cv_check_asm_type"; then
+ gmp_cv_check_asm_type="[dnl]"
+fi
+])
+echo ["define(<TYPE>, <$gmp_cv_check_asm_type>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_SIZE
+dnl Can we say `.size'?
+AC_DEFUN(GMP_CHECK_ASM_SIZE,
+[AC_CACHE_CHECK([if the .size assembly directive works], gmp_cv_check_asm_size,
+[ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+echo ' .size sym,1' > conftest.s
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_size="[.size \$][1,\$][2]"
+else
+ gmp_cv_check_asm_size="[dnl]"
+fi
+])
+echo ["define(<SIZE>, <$gmp_cv_check_asm_size>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_LSYM_PREFIX
+dnl What is the prefix for a local label?
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_LSYM_PREFIX,
+[AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([what prefix to use for a local label],
+gmp_cv_check_asm_lsym_prefix,
+[if test -z "$NM"; then
+ echo; echo ["$0: fatal: need nm"]
+ exit 1
+fi
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+gmp_cv_check_asm_lsym_prefix="L"
+for gmp_tmp_pre in L .L $ L$; do
+ cat > conftest.s <<EOF
+dummy${gmp_cv_check_asm_label_suffix}
+${gmp_tmp_pre}gurkmacka${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if AC_TRY_EVAL(ac_assemble); then
+ $NM conftest.o >/dev/null 2>&1
+ gmp_rc=$?
+ if test "$gmp_rc" != "0"; then
+ echo "configure: $NM failure, using default"
+ break
+ fi
+ if $NM conftest.o | grep gurkmacka >/dev/null; then true; else
+ gmp_cv_check_asm_lsym_prefix="$gmp_tmp_pre"
+ break
+ fi
+ else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.s >&AC_FD_CC
+ # Use default.
+ fi
+done
+rm -f conftest*
+])
+echo ["define(<LSYM_PREFIX>, <${gmp_cv_check_asm_lsym_prefix}>)"] >> $gmp_tmpconfigm4
+])
+
+dnl GMP_CHECK_ASM_W32
+dnl How to [define] a 32-bit word.
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_W32,
+[AC_REQUIRE([GMP_CHECK_ASM_DATA])
+AC_REQUIRE([GMP_CHECK_ASM_GLOBL])
+AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([how to [define] a 32-bit word],
+ gmp_cv_check_asm_w32,
+[if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+fi
+
+# FIXME: HPUX puts first symbol at 0x40000000, breaking our assumption
+# that it's at 0x0. We'll have to declare another symbol before the
+# .long/.word and look at the distance between the two symbols. The
+# only problem is that the sed expression(s) barfs (on Solaris, for
+# example) for the symbol with value 0. For now, HPUX uses .word.
+
+case "$target" in
+ *-*-hpux*)
+ gmp_cv_check_asm_w32=".word"
+ ;;
+ *-*-*)
+ ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+ for gmp_tmp_op in .long .word; do
+ cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ $gmp_cv_check_asm_globl foo
+ $gmp_tmp_op 0
+foo${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if AC_TRY_EVAL(ac_assemble); then
+ changequote(<,>)
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "4"; then
+ gmp_cv_check_asm_w32="$gmp_tmp_op"
+ break
+ fi
+ fi
+ done
+ ;;
+esac
+
+if test -z "$gmp_cv_check_asm_w32"; then
+ echo; echo ["configure: $0: fatal: do not know how to define a 32-bit word"]
+ exit 1
+fi
+rm -f conftest*
+])
+echo ["define(<W32>, <$gmp_cv_check_asm_w32>)"] >> $gmp_tmpconfigm4
+])
+
+dnl GMP_CHECK_ASM_MMX([ACTION-IF-FOUND, [ACTION-IF-NOT-FOUND]])
+dnl Can we assemble MMX insns?
+AC_DEFUN(GMP_CHECK_ASM_MMX,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+AC_CACHE_CHECK([if the assembler knows about MMX instructions],
+ gmp_cv_check_asm_mmx,
+[cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ por %mm0, %mm0
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_mmx=yes
+else
+ gmp_cv_check_asm_mmx=no
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_mmx" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ AC_MSG_WARN([+----------------------------------------------------------])
+ AC_MSG_WARN([| WARNING WARNING WARNING])
+ AC_MSG_WARN([| Target CPU has MMX code, but it can't be assembled by])
+ AC_MSG_WARN([| $CCAS $CFLAGS])
+ AC_MSG_WARN([| Non-MMX replacements will be used.])
+ AC_MSG_WARN([| This will be an inferior build.])
+ AC_MSG_WARN([+----------------------------------------------------------])
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_CHECK_ASM_SHLDL_CL([ACTION-IF-FOUND, [ACTION-IF-NOT-FOUND]])
+AC_DEFUN(GMP_CHECK_ASM_SHLDL_CL,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+AC_CACHE_CHECK([if the assembler takes cl with shldl],
+ gmp_cv_check_asm_shldl_cl,
+[cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ shldl %cl, %eax, %ebx
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_shldl_cl=yes
+else
+ gmp_cv_check_asm_shldl_cl=no
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_shldl_cl" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_PROG_CC_WORKS(CC, CFLAGS, ACTION-IF-WORKS, [ACTION-IF-NOT-WORKS])
+dnl Check if CC can compile and link. Perform various target specific tests.
+dnl FIXME: Require `$target'.
+AC_DEFUN(GMP_PROG_CC_WORKS,
+[AC_LANG_C dnl Note: Destructive.
+CC="[$1]"
+CFLAGS="[$2]"
+AC_MSG_CHECKING([if the C compiler ($CC) works with flags $CFLAGS])
+
+# Simple test for all targets.
+AC_TRY_COMPILER([int main(){return(0);}],
+ tmp_works, tmp_cross)
+
+# Target specific tests.
+if test "$tmp_works" = "yes"; then
+ case "$target" in
+ *-*-aix*) # Returning a funcptr.
+ AC_TRY_COMPILE( , [} void *g(); void *f() { return g(); } int bar(){],
+ tmp_works=yes, tmp_works=no)
+ ;;
+ esac
+fi
+
+if test "$tmp_works" = "yes"; then
+ [$3]
+else
+ ifelse([$4], , :, [$4])
+fi
+
+AC_MSG_RESULT($tmp_works)
+])dnl
+
+
+dnl GMP_C_ANSI2KNR
+dnl --------------
+dnl Setup to use ansi2knr if necessary.
+dnl
+dnl The test here is simply that if an ANSI style function works then
+dnl ansi2knr isn't needed. The normal tests for whether $CC works mean we
+dnl don't need to worry here about anything badly broken.
+dnl
+dnl AM_C_PROTOTYPES is the normal way to set up ansi2knr, but (in automake
+dnl March 2000) it gives the wrong answer on a C++ compiler because its
+dnl test requires that the compiler accept both ANSI and K&R, or otherwise
+dnl ansi2knr is used. A C++ compiler fails on the K&R part, which makes
+dnl AM_C_PROTOTYPES think it needs ansi2knr! GMP has no bare K&R so we
+dnl only need ANSI or K&R to work, not both.
+
+AC_DEFUN(GMP_C_ANSI2KNR,
+[AC_CACHE_CHECK([if ansi2knr should be used],
+ gmp_cv_c_ansi2knr,
+[cat >conftest.c <<EOF
+int main (int argc, char *argv[]) { return 0; }
+EOF
+if AC_TRY_EVAL(ac_compile); then
+ gmp_cv_c_ansi2knr=no
+else
+ gmp_cv_c_ansi2knr=yes
+fi
+rm -f conftest.*
+])
+if test $gmp_cv_c_ansi2knr = no; then
+ U= ANSI2KNR=
+else
+ U=_ ANSI2KNR=./ansi2knr
+ # Ensure some checks needed by ansi2knr itself.
+ AC_HEADER_STDC
+ AC_CHECK_HEADERS(string.h)
+fi
+AC_SUBST(U)
+AC_SUBST(ANSI2KNR)
+])
+
+
+dnl Deal with bad synchronization of Autoconf with Libtool.
+AC_DEFUN(AC_CANONICAL_BUILD, [_AC_CANONICAL_BUILD])
+AC_DEFUN(AC_CHECK_TOOL_PREFIX, [_AC_CHECK_TOOL_PREFIX])
+
+
+# serial 1
+
+AC_DEFUN(AM_C_PROTOTYPES,
+[AC_REQUIRE([AM_PROG_CC_STDC])
+AC_REQUIRE([AC_PROG_CPP])
+AC_MSG_CHECKING([for function prototypes])
+if test "$am_cv_prog_cc_stdc" != no; then
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(PROTOTYPES,1,[Define if compiler has function prototypes])
+ U= ANSI2KNR=
+else
+ AC_MSG_RESULT(no)
+ U=_ ANSI2KNR=./ansi2knr
+ # Ensure some checks needed by ansi2knr itself.
+ AC_HEADER_STDC
+ AC_CHECK_HEADERS(string.h)
+fi
+AC_SUBST(U)dnl
+AC_SUBST(ANSI2KNR)dnl
+])
+
+
+# serial 1
+
+# @defmac AC_PROG_CC_STDC
+# @maindex PROG_CC_STDC
+# @ovindex CC
+# If the C compiler in not in ANSI C mode by default, try to add an option
+# to output variable @code{CC} to make it so. This macro tries various
+# options that select ANSI C on some system or another. It considers the
+# compiler to be in ANSI C mode if it handles function prototypes correctly.
+#
+# If you use this macro, you should check after calling it whether the C
+# compiler has been set to accept ANSI C; if not, the shell variable
+# @code{am_cv_prog_cc_stdc} is set to @samp{no}. If you wrote your source
+# code in ANSI C, you can make an un-ANSIfied copy of it by using the
+# program @code{ansi2knr}, which comes with Ghostscript.
+# @end defmac
+
+AC_DEFUN(AM_PROG_CC_STDC,
+[AC_REQUIRE([AC_PROG_CC])
+AC_BEFORE([$0], [AC_C_INLINE])
+AC_BEFORE([$0], [AC_C_CONST])
+dnl Force this before AC_PROG_CPP. Some cpp's, eg on HPUX, require
+dnl a magic option to avoid problems with ANSI preprocessor commands
+dnl like #elif.
+dnl FIXME: can't do this because then AC_AIX won't work due to a
+dnl circular dependency.
+dnl AC_BEFORE([$0], [AC_PROG_CPP])
+AC_MSG_CHECKING(for ${CC-cc} option to accept ANSI C)
+AC_CACHE_VAL(am_cv_prog_cc_stdc,
+[am_cv_prog_cc_stdc=no
+ac_save_CC="$CC"
+# Don't try gcc -ansi; that turns off useful extensions and
+# breaks some systems' header files.
+# AIX -qlanglvl=ansi
+# Ultrix and OSF/1 -std1
+# HP-UX 10.20 and later -Ae
+# HP-UX older versions -Aa -D_HPUX_SOURCE
+# SVR4 -Xc -D__EXTENSIONS__
+for ac_arg in "" -qlanglvl=ansi -std1 -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+do
+ CC="$ac_save_CC $ac_arg"
+ AC_TRY_COMPILE(
+[#include <stdarg.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */
+struct buf { int x; };
+FILE * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (p, i)
+ char **p;
+ int i;
+{
+ return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+ char *s;
+ va_list v;
+ va_start (v,p);
+ s = g (p, va_arg (v,int));
+ va_end (v);
+ return s;
+}
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
+int argc;
+char **argv;
+], [
+return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1];
+],
+[am_cv_prog_cc_stdc="$ac_arg"; break])
+done
+CC="$ac_save_CC"
+])
+if test -z "$am_cv_prog_cc_stdc"; then
+ AC_MSG_RESULT([none needed])
+else
+ AC_MSG_RESULT($am_cv_prog_cc_stdc)
+fi
+case "x$am_cv_prog_cc_stdc" in
+ x|xno) ;;
+ *) CC="$CC $am_cv_prog_cc_stdc" ;;
+esac
+])
+
+# Do all the work for Automake. This macro actually does too much --
+# some checks are only needed if your package does certain things.
+# But this isn't really a big deal.
+
+# serial 1
+
+dnl Usage:
+dnl AM_INIT_AUTOMAKE(package,version, [no-define])
+
+AC_DEFUN(AM_INIT_AUTOMAKE,
+[AC_REQUIRE([AC_PROG_INSTALL])
+dnl We require 2.13 because we rely on SHELL being computed by configure.
+AC_PREREQ([2.13])
+PACKAGE=[$1]
+AC_SUBST(PACKAGE)
+VERSION=[$2]
+AC_SUBST(VERSION)
+dnl test to see if srcdir already configured
+if test "`CDPATH=: && cd $srcdir && pwd`" != "`pwd`" &&
+ test -f $srcdir/config.status; then
+ AC_MSG_ERROR([source directory already configured; run "make distclean" there first])
+fi
+ifelse([$3],,
+AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
+AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package]))
+AC_REQUIRE([AM_SANITY_CHECK])
+AC_REQUIRE([AC_ARG_PROGRAM])
+AM_MISSING_PROG(ACLOCAL, aclocal)
+AM_MISSING_PROG(AUTOCONF, autoconf)
+AM_MISSING_PROG(AUTOMAKE, automake)
+AM_MISSING_PROG(AUTOHEADER, autoheader)
+AM_MISSING_PROG(MAKEINFO, makeinfo)
+AM_MISSING_PROG(AMTAR, tar)
+AM_MISSING_INSTALL_SH
+dnl We need awk for the "check" target. The system "awk" is bad on
+dnl some platforms.
+AC_REQUIRE([AC_PROG_AWK])
+AC_REQUIRE([AC_PROG_MAKE_SET])
+AC_REQUIRE([AM_DEP_TRACK])
+AC_REQUIRE([AM_SET_DEPDIR])
+ifdef([AC_PROVIDE_AC_PROG_CC], [AM_DEPENDENCIES(CC)], [
+ define([AC_PROG_CC], defn([AC_PROG_CC])[AM_DEPENDENCIES(CC)])])
+ifdef([AC_PROVIDE_AC_PROG_CXX], [AM_DEPENDENCIES(CXX)], [
+ define([AC_PROG_CXX], defn([AC_PROG_CXX])[AM_DEPENDENCIES(CXX)])])
+])
+
+#
+# Check to make sure that the build environment is sane.
+#
+
+AC_DEFUN(AM_SANITY_CHECK,
+[AC_MSG_CHECKING([whether build environment is sane])
+# Just in case
+sleep 1
+echo timestamp > conftestfile
+# Do `set' in a subshell so we don't clobber the current shell's
+# arguments. Must try -L first in case configure is actually a
+# symlink; some systems play weird games with the mod time of symlinks
+# (eg FreeBSD returns the mod time of the symlink's containing
+# directory).
+if (
+ set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null`
+ if test "[$]*" = "X"; then
+ # -L didn't work.
+ set X `ls -t $srcdir/configure conftestfile`
+ fi
+ if test "[$]*" != "X $srcdir/configure conftestfile" \
+ && test "[$]*" != "X conftestfile $srcdir/configure"; then
+
+ # If neither matched, then we have a broken ls. This can happen
+ # if, for instance, CONFIG_SHELL is bash and it inherits a
+ # broken ls alias from the environment. This has actually
+ # happened. Such a system could not be considered "sane".
+ AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken
+alias in your environment])
+ fi
+
+ test "[$]2" = conftestfile
+ )
+then
+ # Ok.
+ :
+else
+ AC_MSG_ERROR([newly created file is older than distributed files!
+Check your system clock])
+fi
+rm -f conftest*
+AC_MSG_RESULT(yes)])
+
+dnl AM_MISSING_PROG(NAME, PROGRAM)
+AC_DEFUN(AM_MISSING_PROG, [
+AC_REQUIRE([AM_MISSING_HAS_RUN])
+$1=${$1-"${am_missing_run}$2"}
+AC_SUBST($1)])
+
+dnl Like AM_MISSING_PROG, but only looks for install-sh.
+dnl AM_MISSING_INSTALL_SH()
+AC_DEFUN(AM_MISSING_INSTALL_SH, [
+AC_REQUIRE([AM_MISSING_HAS_RUN])
+if test -z "$install_sh"; then
+ install_sh="$ac_aux_dir/install-sh"
+ test -f "$install_sh" || install_sh="$ac_aux_dir/install.sh"
+ test -f "$install_sh" || install_sh="${am_missing_run}${ac_auxdir}/install-sh"
+ dnl FIXME: an evil hack: we remove the SHELL invocation from
+ dnl install_sh because automake adds it back in. Sigh.
+ install_sh="`echo $install_sh | sed -e 's/\${SHELL}//'`"
+fi
+AC_SUBST(install_sh)])
+
+dnl AM_MISSING_HAS_RUN.
+dnl Define MISSING if not defined so far and test if it supports --run.
+dnl If it does, set am_missing_run to use it, otherwise, to nothing.
+AC_DEFUN([AM_MISSING_HAS_RUN], [
+test x"${MISSING+set}" = xset || \
+ MISSING="\${SHELL} `CDPATH=: && cd $ac_aux_dir && pwd`/missing"
+dnl Use eval to expand $SHELL
+if eval "$MISSING --run :"; then
+ am_missing_run="$MISSING --run "
+else
+ am_missing_run=
+ am_backtick='`'
+ AC_MSG_WARN([${am_backtick}missing' script is too old or missing])
+fi
+])
+
+dnl See how the compiler implements dependency checking.
+dnl Usage:
+dnl AM_DEPENDENCIES(NAME)
+dnl NAME is "CC", "CXX" or "OBJC".
+
+dnl We try a few techniques and use that to set a single cache variable.
+
+AC_DEFUN(AM_DEPENDENCIES,[
+AC_REQUIRE([AM_SET_DEPDIR])
+AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])
+ifelse([$1],CC,[
+AC_REQUIRE([AC_PROG_CC])
+AC_REQUIRE([AC_PROG_CPP])
+depcc="$CC"
+depcpp="$CPP"],[$1],CXX,[
+AC_REQUIRE([AC_PROG_CXX])
+AC_REQUIRE([AC_PROG_CXXCPP])
+depcc="$CXX"
+depcpp="$CXXCPP"],[$1],OBJC,[
+am_cv_OBJC_dependencies_compiler_type=gcc],[
+AC_REQUIRE([AC_PROG_][$1])
+depcc="$[$1]"
+depcpp=""])
+AC_MSG_CHECKING([dependency style of $depcc])
+AC_CACHE_VAL(am_cv_[$1]_dependencies_compiler_type,[
+if test -z "$AMDEP"; then
+ echo '#include "conftest.h"' > conftest.c
+ echo 'int i;' > conftest.h
+
+ am_cv_[$1]_dependencies_compiler_type=none
+ for depmode in `sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < "$am_depcomp"`; do
+ case "$depmode" in
+ nosideeffect)
+ # after this tag, mechanisms are not by side-effect, so they'll
+ # only be used when explicitly requested
+ if test "x$enable_dependency_tracking" = xyes; then
+ continue
+ else
+ break
+ fi
+ ;;
+ none) break ;;
+ esac
+ if depmode="$depmode" \
+ source=conftest.c object=conftest.o \
+ depfile=conftest.Po tmpdepfile=conftest.TPo \
+ $SHELL $am_depcomp $depcc -c conftest.c 2>/dev/null &&
+ grep conftest.h conftest.Po > /dev/null 2>&1; then
+ am_cv_[$1]_dependencies_compiler_type="$depmode"
+ break
+ fi
+ done
+
+ rm -f conftest.*
+else
+ am_cv_[$1]_dependencies_compiler_type=none
+fi
+])
+AC_MSG_RESULT($am_cv_[$1]_dependencies_compiler_type)
+[$1]DEPMODE="depmode=$am_cv_[$1]_dependencies_compiler_type"
+AC_SUBST([$1]DEPMODE)
+])
+
+dnl Choose a directory name for dependency files.
+dnl This macro is AC_REQUIREd in AM_DEPENDENCIES
+
+AC_DEFUN(AM_SET_DEPDIR,[
+if test -d .deps || mkdir .deps 2> /dev/null || test -d .deps; then
+ DEPDIR=.deps
+else
+ DEPDIR=_deps
+fi
+AC_SUBST(DEPDIR)
+])
+
+AC_DEFUN(AM_DEP_TRACK,[
+AC_ARG_ENABLE(dependency-tracking,
+[ --disable-dependency-tracking Speeds up one-time builds
+ --enable-dependency-tracking Do not reject slow dependency extractors])
+if test "x$enable_dependency_tracking" = xno; then
+ AMDEP="#"
+else
+ am_depcomp="$ac_aux_dir/depcomp"
+ if test ! -f "$am_depcomp"; then
+ AMDEP="#"
+ else
+ AMDEP=
+ fi
+fi
+AC_SUBST(AMDEP)
+if test -z "$AMDEP"; then
+ AMDEPBACKSLASH='\'
+else
+ AMDEPBACKSLASH=
+fi
+pushdef([subst], defn([AC_SUBST]))
+subst(AMDEPBACKSLASH)
+popdef([subst])
+])
+
+dnl Generate code to set up dependency tracking.
+dnl This macro should only be invoked once -- use via AC_REQUIRE.
+dnl Usage:
+dnl AM_OUTPUT_DEPENDENCY_COMMANDS
+
+dnl
+dnl This code is only required when automatic dependency tracking
+dnl is enabled. FIXME. This creates each `.P' file that we will
+dnl need in order to bootstrap the dependency handling code.
+AC_DEFUN(AM_OUTPUT_DEPENDENCY_COMMANDS,[
+AC_OUTPUT_COMMANDS([
+test x"$AMDEP" != x"" ||
+for mf in $CONFIG_FILES; do
+ case "$mf" in
+ Makefile) dirpart=.;;
+ */Makefile) dirpart=`echo "$mf" | sed -e 's|/[^/]*$||'`;;
+ *) continue;;
+ esac
+ grep '^DEP_FILES *= *[^ #]' < "$mf" > /dev/null || continue
+ # Extract the definition of DEP_FILES from the Makefile without
+ # running `make'.
+ DEPDIR=`sed -n -e '/^DEPDIR = / s///p' < "$mf"`
+ test -z "$DEPDIR" && continue
+ # When using ansi2knr, U may be empty or an underscore; expand it
+ U=`sed -n -e '/^U = / s///p' < "$mf"`
+ test -d "$dirpart/$DEPDIR" || mkdir "$dirpart/$DEPDIR"
+ # We invoke sed twice because it is the simplest approach to
+ # changing $(DEPDIR) to its actual value in the expansion.
+ for file in `sed -n -e '
+ /^DEP_FILES = .*\\\\$/ {
+ s/^DEP_FILES = //
+ :loop
+ s/\\\\$//
+ p
+ n
+ /\\\\$/ b loop
+ p
+ }
+ /^DEP_FILES = / s/^DEP_FILES = //p' < "$mf" | \
+ sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
+ # Make sure the directory exists.
+ test -f "$dirpart/$file" && continue
+ fdir=`echo "$file" | sed -e 's|/[^/]*$||'`
+ $ac_aux_dir/mkinstalldirs "$dirpart/$fdir" > /dev/null 2>&1
+ # echo "creating $dirpart/$file"
+ echo '# dummy' > "$dirpart/$file"
+ done
+done
+], [AMDEP="$AMDEP"
+ac_aux_dir="$ac_aux_dir"])])
+
+# Like AC_CONFIG_HEADER, but automatically create stamp file.
+
+AC_DEFUN(AM_CONFIG_HEADER,
+[AC_PREREQ([2.12])
+AC_CONFIG_HEADER([$1])
+dnl When config.status generates a header, we must update the stamp-h file.
+dnl This file resides in the same directory as the config header
+dnl that is generated. We must strip everything past the first ":",
+dnl and everything past the last "/".
+AC_OUTPUT_COMMANDS(changequote(<<,>>)dnl
+ifelse(patsubst(<<$1>>, <<[^ ]>>, <<>>), <<>>,
+<<test -z "<<$>>CONFIG_HEADERS" || echo timestamp > patsubst(<<$1>>, <<^\([^:]*/\)?.*>>, <<\1>>)stamp-h<<>>dnl>>,
+<<am_indx=1
+for am_file in <<$1>>; do
+ case " <<$>>CONFIG_HEADERS " in
+ *" <<$>>am_file "*<<)>>
+ echo timestamp > `echo <<$>>am_file | sed -e 's%:.*%%' -e 's%[^/]*$%%'`stamp-h$am_indx
+ ;;
+ esac
+ am_indx=`expr "<<$>>am_indx" + 1`
+done<<>>dnl>>)
+changequote([,]))])
+
+# Add --enable-maintainer-mode option to configure.
+# From Jim Meyering
+
+# serial 1
+
+AC_DEFUN(AM_MAINTAINER_MODE,
+[AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
+ dnl maintainer-mode is disabled by default
+ AC_ARG_ENABLE(maintainer-mode,
+[ --enable-maintainer-mode enable make rules and dependencies not useful
+ (and sometimes confusing) to the casual installer],
+ USE_MAINTAINER_MODE=$enableval,
+ USE_MAINTAINER_MODE=no)
+ AC_MSG_RESULT($USE_MAINTAINER_MODE)
+ AM_CONDITIONAL(MAINTAINER_MODE, test $USE_MAINTAINER_MODE = yes)
+ MAINT=$MAINTAINER_MODE_TRUE
+ AC_SUBST(MAINT)dnl
+]
+)
+
+# Define a conditional.
+
+AC_DEFUN(AM_CONDITIONAL,
+[AC_SUBST($1_TRUE)
+AC_SUBST($1_FALSE)
+if $2; then
+ $1_TRUE=
+ $1_FALSE='#'
+else
+ $1_TRUE='#'
+ $1_FALSE=
+fi])
+
+
+# serial 42 AC_PROG_LIBTOOL
+AC_DEFUN(AC_PROG_LIBTOOL,
+[AC_REQUIRE([AC_LIBTOOL_SETUP])dnl
+
+# Save cache, so that ltconfig can load it
+AC_CACHE_SAVE
+
+# Actually configure libtool. ac_aux_dir is where install-sh is found.
+AR="$AR" CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC="$MAGIC" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig --no-reexec \
+$libtool_flags --no-verify --build="$build" $ac_aux_dir/ltmain.sh $lt_target \
+|| AC_MSG_ERROR([libtool configure failed])
+
+# Reload cache, that may have been modified by ltconfig
+AC_CACHE_LOAD
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltconfig $ac_aux_dir/ltmain.sh"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+AC_SUBST(LIBTOOL)dnl
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+AC_DEFUN(AC_LIBTOOL_SETUP,
+[AC_PREREQ(2.13)dnl
+AC_REQUIRE([AC_ENABLE_SHARED])dnl
+AC_REQUIRE([AC_ENABLE_STATIC])dnl
+AC_REQUIRE([AC_ENABLE_FAST_INSTALL])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_PROG_LD])dnl
+AC_REQUIRE([AC_PROG_LD_RELOAD_FLAG])dnl
+AC_REQUIRE([AC_PROG_NM])dnl
+AC_REQUIRE([AC_PROG_LN_S])dnl
+AC_REQUIRE([AC_DEPLIBS_CHECK_METHOD])dnl
+AC_REQUIRE([AC_OBJEXT])dnl
+AC_REQUIRE([AC_EXEEXT])dnl
+dnl
+
+# Only perform the check for file, if the check method requires it
+case "$deplibs_check_method" in
+file_magic*)
+ if test "$file_magic_cmd" = '${MAGIC}'; then
+ AC_PATH_MAGIC
+ fi
+ ;;
+esac
+
+case "$target" in
+NONE) lt_target="$host" ;;
+*) lt_target="$target" ;;
+esac
+
+AC_CHECK_TOOL(RANLIB, ranlib, :)
+AC_CHECK_TOOL(STRIP, strip, :)
+
+# Check for any special flags to pass to ltconfig.
+libtool_flags="--cache-file=$cache_file"
+test "$enable_shared" = no && libtool_flags="$libtool_flags --disable-shared"
+test "$enable_static" = no && libtool_flags="$libtool_flags --disable-static"
+test "$enable_fast_install" = no && libtool_flags="$libtool_flags --disable-fast-install"
+test "$ac_cv_prog_gcc" = yes && libtool_flags="$libtool_flags --with-gcc"
+test "$ac_cv_prog_gnu_ld" = yes && libtool_flags="$libtool_flags --with-gnu-ld"
+ifdef([AC_PROVIDE_AC_LIBTOOL_DLOPEN],
+[libtool_flags="$libtool_flags --enable-dlopen"])
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[libtool_flags="$libtool_flags --enable-win32-dll"])
+AC_ARG_ENABLE(libtool-lock,
+ [ --disable-libtool-lock avoid locking (might break parallel builds)])
+test "x$enable_libtool_lock" = xno && libtool_flags="$libtool_flags --disable-lock"
+test x"$silent" = xyes && libtool_flags="$libtool_flags --silent"
+
+AC_ARG_WITH(pic,
+ [ --with-pic try to use only PIC/non-PIC objects [default=use both]],
+ pic_mode="$withval", pic_mode=default)
+test x"$pic_mode" = xyes && libtool_flags="$libtool_flags --prefer-pic"
+test x"$pic_mode" = xno && libtool_flags="$libtool_flags --prefer-non-pic"
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case "$lt_target" in
+*-*-irix6*)
+ # Find out which ABI we are using.
+ echo '[#]line __oline__ "configure"' > conftest.$ac_ext
+ if AC_TRY_EVAL(ac_compile); then
+ case "`/usr/bin/file conftest.o`" in
+ *32-bit*)
+ LD="${LD-ld} -32"
+ ;;
+ *N32*)
+ LD="${LD-ld} -n32"
+ ;;
+ *64-bit*)
+ LD="${LD-ld} -64"
+ ;;
+ esac
+ fi
+ rm -rf conftest*
+ ;;
+
+*-*-sco3.2v5*)
+ # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+ SAVE_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -belf"
+ AC_CACHE_CHECK([whether the C compiler needs -belf], lt_cv_cc_needs_belf,
+ [AC_LANG_SAVE
+ AC_LANG_C
+ AC_TRY_LINK([],[],[lt_cv_cc_needs_belf=yes],[lt_cv_cc_needs_belf=no])
+ AC_LANG_RESTORE])
+ if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+ # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+ CFLAGS="$SAVE_CFLAGS"
+ fi
+ ;;
+
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[*-*-cygwin* | *-*-mingw*)
+ AC_CHECK_TOOL(DLLTOOL, dlltool, false)
+ AC_CHECK_TOOL(AS, as, false)
+ AC_CHECK_TOOL(OBJDUMP, objdump, false)
+
+ # recent cygwin and mingw systems supply a stub DllMain which the user
+ # can override, but on older systems we have to supply one
+ AC_CACHE_CHECK([if libtool should supply DllMain function], lt_cv_need_dllmain,
+ [AC_TRY_LINK([],
+ [extern int __attribute__((__stdcall__)) DllMain(void*, int, void*);
+ DllMain (0, 0, 0);],
+ [lt_cv_need_dllmain=no],[lt_cv_need_dllmain=yes])])
+
+ case "$lt_target/$CC" in
+ *-*-cygwin*/gcc*-mno-cygwin*|*-*-mingw*)
+ # old mingw systems require "-dll" to link a DLL, while more recent ones
+ # require "-mdll"
+ SAVE_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -mdll"
+ AC_CACHE_CHECK([how to link DLLs], lt_cv_cc_dll_switch,
+ [AC_TRY_LINK([], [], [lt_cv_cc_dll_switch=-mdll],[lt_cv_cc_dll_switch=-dll])])
+ CFLAGS="$SAVE_CFLAGS" ;;
+ *-*-cygwin*)
+ # cygwin systems need to pass --dll to the linker, and not link
+ # crt.o which will require a WinMain@16 definition.
+ lt_cv_cc_dll_switch="-Wl,--dll -nostartfiles" ;;
+ esac
+ ;;
+ ])
+esac
+])
+
+# AC_LIBTOOL_DLOPEN - enable checks for dlopen support
+AC_DEFUN(AC_LIBTOOL_DLOPEN, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])])
+
+# AC_LIBTOOL_WIN32_DLL - declare package support for building win32 dll's
+AC_DEFUN(AC_LIBTOOL_WIN32_DLL, [AC_BEFORE([$0], [AC_LIBTOOL_SETUP])])
+
+# AC_ENABLE_SHARED - implement the --enable-shared flag
+# Usage: AC_ENABLE_SHARED[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+AC_DEFUN(AC_ENABLE_SHARED, [dnl
+define([AC_ENABLE_SHARED_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(shared,
+changequote(<<, >>)dnl
+<< --enable-shared[=PKGS] build shared libraries [default=>>AC_ENABLE_SHARED_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)
+ enable_shared=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_shared=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_shared=AC_ENABLE_SHARED_DEFAULT)dnl
+])
+
+# AC_DISABLE_SHARED - set the default shared flag to --disable-shared
+AC_DEFUN(AC_DISABLE_SHARED, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_SHARED(no)])
+
+# AC_ENABLE_STATIC - implement the --enable-static flag
+# Usage: AC_ENABLE_STATIC[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+AC_DEFUN(AC_ENABLE_STATIC, [dnl
+define([AC_ENABLE_STATIC_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(static,
+changequote(<<, >>)dnl
+<< --enable-static[=PKGS] build static libraries [default=>>AC_ENABLE_STATIC_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)
+ enable_static=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_static=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_static=AC_ENABLE_STATIC_DEFAULT)dnl
+])
+
+# AC_DISABLE_STATIC - set the default static flag to --disable-static
+AC_DEFUN(AC_DISABLE_STATIC, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_STATIC(no)])
+
+
+# AC_ENABLE_FAST_INSTALL - implement the --enable-fast-install flag
+# Usage: AC_ENABLE_FAST_INSTALL[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+AC_DEFUN(AC_ENABLE_FAST_INSTALL, [dnl
+define([AC_ENABLE_FAST_INSTALL_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(fast-install,
+changequote(<<, >>)dnl
+<< --enable-fast-install[=PKGS] optimize for fast installation [default=>>AC_ENABLE_FAST_INSTALL_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_fast_install=yes ;;
+no) enable_fast_install=no ;;
+*)
+ enable_fast_install=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_fast_install=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_fast_install=AC_ENABLE_FAST_INSTALL_DEFAULT)dnl
+])
+
+# AC_ENABLE_FAST_INSTALL - set the default to --disable-fast-install
+AC_DEFUN(AC_DISABLE_FAST_INSTALL, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_FAST_INSTALL(no)])
+
+
+# AC_PATH_TOOL_PREFIX - find a file program which can recognise shared library
+AC_DEFUN(AC_PATH_TOOL_PREFIX,
+[AC_MSG_CHECKING([for $1])
+AC_CACHE_VAL(lt_cv_path_MAGIC,
+[case "$MAGIC" in
+ /*)
+ lt_cv_path_MAGIC="$MAGIC" # Let the user override the test with a path.
+ ;;
+ ?:/*)
+ ac_cv_path_MAGIC="$MAGIC" # Let the user override the test with a dos path.
+ ;;
+ *)
+ ac_save_MAGIC="$MAGIC"
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word. This closes a longstanding sh security hole.
+ ac_dummy="ifelse([$2], , $PATH, [$2])"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$1; then
+ lt_cv_path_MAGIC="$ac_dir/$1"
+ if test -n "$file_magic_test_file"; then
+ case "$deplibs_check_method" in
+ "file_magic "*)
+ file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+ MAGIC="$lt_cv_path_MAGIC"
+ if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+ egrep "$file_magic_regex" > /dev/null; then
+ :
+ else
+ cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such. This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem. Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+ fi ;;
+ esac
+ fi
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ MAGIC="$ac_save_MAGIC"
+ ;;
+esac])
+MAGIC="$lt_cv_path_MAGIC"
+if test -n "$MAGIC"; then
+ AC_MSG_RESULT($MAGIC)
+else
+ AC_MSG_RESULT(no)
+fi
+])
+
+
+# AC_PATH_MAGIC - find a file program which can recognise a shared library
+AC_DEFUN(AC_PATH_MAGIC,
+[AC_REQUIRE([AC_CHECK_TOOL_PREFIX])dnl
+AC_PATH_TOOL_PREFIX(${ac_tool_prefix}file, /usr/bin:$PATH)
+if test -z "$lt_cv_path_MAGIC"; then
+ if test -n "$ac_tool_prefix"; then
+ AC_PATH_TOOL_PREFIX(file, /usr/bin:$PATH)
+ else
+ MAGIC=:
+ fi
+fi
+])
+
+
+# AC_PROG_LD - find the path to the GNU or non-GNU linker
+AC_DEFUN(AC_PROG_LD,
+[AC_ARG_WITH(gnu-ld,
+[ --with-gnu-ld assume the C compiler uses GNU ld [default=no]],
+test "$withval" = no || with_gnu_ld=yes, with_gnu_ld=no)
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+ac_prog=ld
+if test "$ac_cv_prog_gcc" = yes; then
+ # Check if gcc -print-prog-name=ld gives a path.
+ AC_MSG_CHECKING([for ld used by GCC])
+ case $lt_target in
+ *-*-mingw*)
+ # gcc leaves a trailing carriage return which upsets mingw
+ ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+ *)
+ ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+ esac
+ case "$ac_prog" in
+ # Accept absolute paths.
+changequote(,)dnl
+ [\\/]* | [A-Za-z]:[\\/]*)
+ re_direlt='/[^/][^/]*/\.\./'
+changequote([,])dnl
+ # Canonicalize the path of ld
+ ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+ while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+ ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+ done
+ test -z "$LD" && LD="$ac_prog"
+ ;;
+ "")
+ # If it fails, then pretend we aren't using GCC.
+ ac_prog=ld
+ ;;
+ *)
+ # If it is relative, then search for the first ld in PATH.
+ with_gnu_ld=unknown
+ ;;
+ esac
+elif test "$with_gnu_ld" = yes; then
+ AC_MSG_CHECKING([for GNU ld])
+else
+ AC_MSG_CHECKING([for non-GNU ld])
+fi
+AC_CACHE_VAL(ac_cv_path_LD,
+[if test -z "$LD"; then
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+ ac_cv_path_LD="$ac_dir/$ac_prog"
+ # Check to see if the program is GNU ld. I'd rather use --version,
+ # but apparently some GNU ld's only accept -v.
+ # Break only if it was the GNU/non-GNU ld that we prefer.
+ if "$ac_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+ test "$with_gnu_ld" != no && break
+ else
+ test "$with_gnu_ld" != yes && break
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+else
+ ac_cv_path_LD="$LD" # Let the user override the test with a path.
+fi])
+LD="$ac_cv_path_LD"
+if test -n "$LD"; then
+ AC_MSG_RESULT($LD)
+else
+ AC_MSG_RESULT(no)
+fi
+test -z "$LD" && AC_MSG_ERROR([no acceptable ld found in \$PATH])
+AC_PROG_LD_GNU
+])
+
+AC_DEFUN(AC_PROG_LD_GNU,
+[AC_CACHE_CHECK([if the linker ($LD) is GNU ld], ac_cv_prog_gnu_ld,
+[# I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+ ac_cv_prog_gnu_ld=yes
+else
+ ac_cv_prog_gnu_ld=no
+fi])
+with_gnu_ld=$ac_cv_prog_gnu_ld
+])
+
+# AC_PROG_LD_RELOAD_FLAG - find reload flag for linker
+# -- PORTME Some linkers may need a different reload flag.
+AC_DEFUN(AC_PROG_LD_RELOAD_FLAG,
+[AC_CACHE_CHECK([for $LD option to reload object files], lt_cv_ld_reload_flag,
+[lt_cv_ld_reload_flag='-r'])
+reload_flag=$lt_cv_ld_reload_flag
+test -n "$reload_flag" && reload_flag=" $reload_flag"
+])
+
+# AC_DEPLIBS_CHECK_METHOD - how to check for library dependencies
+# -- PORTME fill in with the dynamic library characteristics
+AC_DEFUN(AC_DEPLIBS_CHECK_METHOD,
+[AC_CACHE_CHECK([how to recognise dependant libraries],
+lt_cv_deplibs_check_method,
+[lt_cv_file_magic_cmd='${MAGIC}'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [regex]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case "$host_os" in
+aix4* | beos*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+bsdi4*)
+ changequote(,)dnl
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)'
+ changequote([, ])dnl
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ ;;
+
+cygwin* | mingw*)
+ lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+ lt_cv_file_magic_cmd='${OBJDUMP} -f'
+ ;;
+
+freebsd*)
+ case "$version_type" in
+ freebsd-elf*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+ esac
+ ;;
+
+gnu*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+irix5* | irix6*)
+ case "$host_os" in
+ irix5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+ ;;
+ *)
+ case "$LD" in
+ *-32|*"-32 ") libmagic=32-bit;;
+ *-n32|*"-n32 ") libmagic=N32;;
+ *-64|*"-64 ") libmagic=64-bit;;
+ *) libmagic=never-match;;
+ esac
+ # this will be overridden with pass_all, but let us keep it just in case
+ changequote(,)dnl
+ lt_cv_deplibs_check_method="file_magic ELF ${libmagic} MSB mips-[1234] dynamic lib MIPS - version 1"
+ changequote([, ])dnl
+ ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+ case "$host_cpu" in
+ alpha* | i*86 | powerpc* | sparc* )
+ lt_cv_deplibs_check_method=pass_all ;;
+ *)
+ # glibc up to 2.1.1 does not perform some relocations on ARM
+ changequote(,)dnl
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )' ;;
+ changequote([, ])dnl
+ esac
+ lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+ ;;
+
+osf3* | osf4* | osf5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+sco3.2v5*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+solaris*)
+ lt_cv_deplibs_check_method=pass_all
+ lt_cv_file_magic_test_file=/lib/libc.so
+ ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ case "$host_vendor" in
+ ncr)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+ motorola)
+ changequote(,)dnl
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]'
+ changequote([, ])dnl
+ lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+ ;;
+ esac
+ ;;
+esac
+])
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+])
+
+
+# AC_PROG_NM - find the path to a BSD-compatible name lister
+AC_DEFUN(AC_PROG_NM,
+[AC_MSG_CHECKING([for BSD-compatible nm])
+AC_CACHE_VAL(ac_cv_path_NM,
+[if test -n "$NM"; then
+ # Let the user override the test.
+ ac_cv_path_NM="$NM"
+else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/nm || test -f $ac_dir/nm$ac_exeext ; then
+ # Check to see if the nm accepts a BSD-compat flag.
+ # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+ # nm: unknown option "B" ignored
+ if ($ac_dir/nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+ ac_cv_path_NM="$ac_dir/nm -B"
+ break
+ elif ($ac_dir/nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+ ac_cv_path_NM="$ac_dir/nm -p"
+ break
+ else
+ ac_cv_path_NM=${ac_cv_path_NM="$ac_dir/nm"} # keep the first match, but
+ continue # so that we can try to find one that supports BSD flags
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+ test -z "$ac_cv_path_NM" && ac_cv_path_NM=nm
+fi])
+NM="$ac_cv_path_NM"
+AC_MSG_RESULT([$NM])
+])
+
+# AC_CHECK_LIBM - check for math library
+AC_DEFUN(AC_CHECK_LIBM,
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+LIBM=
+case "$lt_target" in
+*-*-beos* | *-*-cygwin*)
+ # These system don't have libm
+ ;;
+*-ncr-sysv4.3*)
+ AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM="-lmw")
+ AC_CHECK_LIB(m, main, LIBM="$LIBM -lm")
+ ;;
+*)
+ AC_CHECK_LIB(m, main, LIBM="-lm")
+ ;;
+esac
+])
+
+# AC_LIBLTDL_CONVENIENCE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl convenience library, adds --enable-ltdl-convenience to
+# the configure arguments. Note that LIBLTDL is not AC_SUBSTed, nor
+# is AC_CONFIG_SUBDIRS called. If DIR is not provided, it is assumed
+# to be `${top_builddir}/libltdl'. Make sure you start DIR with
+# '${top_builddir}/' (note the single quotes!) if your package is not
+# flat, and, if you're not using automake, define top_builddir as
+# appropriate in the Makefiles.
+AC_DEFUN(AC_LIBLTDL_CONVENIENCE, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+ case "$enable_ltdl_convenience" in
+ no) AC_MSG_ERROR([this package needs a convenience libltdl]) ;;
+ "") enable_ltdl_convenience=yes
+ ac_configure_args="$ac_configure_args --enable-ltdl-convenience" ;;
+ esac
+ LIBLTDL=ifelse($#,1,$1,['${top_builddir}/libltdl'])/libltdlc.la
+ INCLTDL=ifelse($#,1,-I$1,['-I${top_srcdir}/libltdl'])
+])
+
+# AC_LIBLTDL_INSTALLABLE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl installable library, and adds --enable-ltdl-install to
+# the configure arguments. Note that LIBLTDL is not AC_SUBSTed, nor
+# is AC_CONFIG_SUBDIRS called. If DIR is not provided, it is assumed
+# to be `${top_builddir}/libltdl'. Make sure you start DIR with
+# '${top_builddir}/' (note the single quotes!) if your package is not
+# flat, and, if you're not using automake, define top_builddir as
+# appropriate in the Makefiles.
+# In the future, this macro may have to be called after AC_PROG_LIBTOOL.
+AC_DEFUN(AC_LIBLTDL_INSTALLABLE, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+ AC_CHECK_LIB(ltdl, main,
+ [test x"$enable_ltdl_install" != xyes && enable_ltdl_install=no],
+ [if test x"$enable_ltdl_install" = xno; then
+ AC_MSG_WARN([libltdl not installed, but installation disabled])
+ else
+ enable_ltdl_install=yes
+ fi
+ ])
+ if test x"$enable_ltdl_install" = x"yes"; then
+ ac_configure_args="$ac_configure_args --enable-ltdl-install"
+ LIBLTDL=ifelse($#,1,$1,['${top_builddir}/libltdl'])/libltdl.la
+ INCLTDL=ifelse($#,1,-I$1,['-I${top_srcdir}/libltdl'])
+ else
+ ac_configure_args="$ac_configure_args --enable-ltdl-install=no"
+ LIBLTDL="-lltdl"
+ INCLTDL=
+ fi
+])
+
+dnl old names
+AC_DEFUN(AM_PROG_LIBTOOL, [indir([AC_PROG_LIBTOOL])])dnl
+AC_DEFUN(AM_ENABLE_SHARED, [indir([AC_ENABLE_SHARED], $@)])dnl
+AC_DEFUN(AM_ENABLE_STATIC, [indir([AC_ENABLE_STATIC], $@)])dnl
+AC_DEFUN(AM_DISABLE_SHARED, [indir([AC_DISABLE_SHARED], $@)])dnl
+AC_DEFUN(AM_DISABLE_STATIC, [indir([AC_DISABLE_STATIC], $@)])dnl
+AC_DEFUN(AM_PROG_LD, [indir([AC_PROG_LD])])dnl
+AC_DEFUN(AM_PROG_NM, [indir([AC_PROG_NM])])dnl
+
+dnl This is just to silence aclocal about the macro not being used
+ifelse([AC_DISABLE_FAST_INSTALL])dnl
+
diff --git a/rts/gmp/ansi2knr.1 b/rts/gmp/ansi2knr.1
new file mode 100644
index 0000000000..f9ee5a631c
--- /dev/null
+++ b/rts/gmp/ansi2knr.1
@@ -0,0 +1,36 @@
+.TH ANSI2KNR 1 "19 Jan 1996"
+.SH NAME
+ansi2knr \- convert ANSI C to Kernighan & Ritchie C
+.SH SYNOPSIS
+.I ansi2knr
+[--varargs] input_file [output_file]
+.SH DESCRIPTION
+If no output_file is supplied, output goes to stdout.
+.br
+There are no error messages.
+.sp
+.I ansi2knr
+recognizes function definitions by seeing a non-keyword identifier at the left
+margin, followed by a left parenthesis, with a right parenthesis as the last
+character on the line, and with a left brace as the first token on the
+following line (ignoring possible intervening comments). It will recognize a
+multi-line header provided that no intervening line ends with a left or right
+brace or a semicolon. These algorithms ignore whitespace and comments, except
+that the function name must be the first thing on the line.
+.sp
+The following constructs will confuse it:
+.br
+ - Any other construct that starts at the left margin and follows the
+above syntax (such as a macro or function call).
+.br
+ - Some macros that tinker with the syntax of the function header.
+.sp
+The --varargs switch is obsolete, and is recognized only for
+backwards compatibility. The present version of
+.I ansi2knr
+will always attempt to convert a ... argument to va_alist and va_dcl.
+.SH AUTHOR
+L. Peter Deutsch <ghost@aladdin.com> wrote the original ansi2knr and
+continues to maintain the current version; most of the code in the current
+version is his work. ansi2knr also includes contributions by Francois
+Pinard <pinard@iro.umontreal.ca> and Jim Avera <jima@netcom.com>.
diff --git a/rts/gmp/ansi2knr.c b/rts/gmp/ansi2knr.c
new file mode 100644
index 0000000000..937c731886
--- /dev/null
+++ b/rts/gmp/ansi2knr.c
@@ -0,0 +1,677 @@
+/* Copyright (C) 1989, 1997, 1998, 1999 Aladdin Enterprises. All rights reserved. */
+
+/* Convert ANSI C function definitions to K&R ("traditional C") syntax */
+
+/*
+ansi2knr is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY. No author or distributor accepts responsibility to anyone for the
+consequences of using it or for whether it serves any particular purpose or
+works at all, unless he says so in writing. Refer to the GNU General Public
+License (the "GPL") for full details.
+
+Everyone is granted permission to copy, modify and redistribute ansi2knr,
+but only under the conditions described in the GPL. A copy of this license
+is supposed to have been given to you along with ansi2knr so you can know
+your rights and responsibilities. It should be in a file named COPYLEFT,
+or, if there is no file named COPYLEFT, a file named COPYING. Among other
+things, the copyright notice and this notice must be preserved on all
+copies.
+
+We explicitly state here what we believe is already implied by the GPL: if
+the ansi2knr program is distributed as a separate set of sources and a
+separate executable file which are aggregated on a storage medium together
+with another program, this in itself does not bring the other program under
+the GPL, nor does the mere fact that such a program or the procedures for
+constructing it invoke the ansi2knr executable bring any other part of the
+program under the GPL.
+*/
+
+/*
+ * Usage:
+ ansi2knr [--filename FILENAME] [INPUT_FILE [OUTPUT_FILE]]
+ * --filename provides the file name for the #line directive in the output,
+ * overriding input_file (if present).
+ * If no input_file is supplied, input is read from stdin.
+ * If no output_file is supplied, output goes to stdout.
+ * There are no error messages.
+ *
+ * ansi2knr recognizes function definitions by seeing a non-keyword
+ * identifier at the left margin, followed by a left parenthesis,
+ * with a right parenthesis as the last character on the line,
+ * and with a left brace as the first token on the following line
+ * (ignoring possible intervening comments), except that a line
+ * consisting of only
+ * identifier1(identifier2)
+ * will not be considered a function definition unless identifier2 is
+ * the word "void", and a line consisting of
+ * identifier1(identifier2, <<arbitrary>>)
+ * will not be considered a function definition.
+ * ansi2knr will recognize a multi-line header provided
+ * that no intervening line ends with a left or right brace or a semicolon.
+ * These algorithms ignore whitespace and comments, except that
+ * the function name must be the first thing on the line.
+ * The following constructs will confuse it:
+ * - Any other construct that starts at the left margin and
+ * follows the above syntax (such as a macro or function call).
+ * - Some macros that tinker with the syntax of function headers.
+ */
+
+/*
+ * The original and principal author of ansi2knr is L. Peter Deutsch
+ * <ghost@aladdin.com>. Other authors are noted in the change history
+ * that follows (in reverse chronological order):
+ lpd 1999-04-12 added minor fixes from Pavel Roskin
+ <pavel_roskin@geocities.com> for clean compilation with
+ gcc -W -Wall
+ lpd 1999-03-22 added hack to recognize lines consisting of
+ identifier1(identifier2, xxx) as *not* being procedures
+ lpd 1999-02-03 made indentation of preprocessor commands consistent
+ lpd 1999-01-28 fixed two bugs: a '/' in an argument list caused an
+ endless loop; quoted strings within an argument list
+ confused the parser
+ lpd 1999-01-24 added a check for write errors on the output,
+ suggested by Jim Meyering <meyering@ascend.com>
+ lpd 1998-11-09 added further hack to recognize identifier(void)
+ as being a procedure
+ lpd 1998-10-23 added hack to recognize lines consisting of
+ identifier1(identifier2) as *not* being procedures
+ lpd 1997-12-08 made input_file optional; only closes input and/or
+ output file if not stdin or stdout respectively; prints
+ usage message on stderr rather than stdout; adds
+ --filename switch (changes suggested by
+ <ceder@lysator.liu.se>)
+ lpd 1996-01-21 added code to cope with not HAVE_CONFIG_H and with
+ compilers that don't understand void, as suggested by
+ Tom Lane
+ lpd 1996-01-15 changed to require that the first non-comment token
+ on the line following a function header be a left brace,
+ to reduce sensitivity to macros, as suggested by Tom Lane
+ <tgl@sss.pgh.pa.us>
+ lpd 1995-06-22 removed #ifndefs whose sole purpose was to define
+ undefined preprocessor symbols as 0; changed all #ifdefs
+ for configuration symbols to #ifs
+ lpd 1995-04-05 changed copyright notice to make it clear that
+ including ansi2knr in a program does not bring the entire
+ program under the GPL
+ lpd 1994-12-18 added conditionals for systems where ctype macros
+ don't handle 8-bit characters properly, suggested by
+ Francois Pinard <pinard@iro.umontreal.ca>;
+ removed --varargs switch (this is now the default)
+ lpd 1994-10-10 removed CONFIG_BROKETS conditional
+ lpd 1994-07-16 added some conditionals to help GNU `configure',
+ suggested by Francois Pinard <pinard@iro.umontreal.ca>;
+ properly erase prototype args in function parameters,
+ contributed by Jim Avera <jima@netcom.com>;
+ correct error in writeblanks (it shouldn't erase EOLs)
+ lpd 1989-xx-xx original version
+ */
+
+/* Most of the conditionals here are to make ansi2knr work with */
+/* or without the GNU configure machinery. */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <ctype.h>
+
+#if HAVE_CONFIG_H
+
+/*
+ For properly autoconfiguring ansi2knr, use AC_CONFIG_HEADER(config.h).
+ This will define HAVE_CONFIG_H and so, activate the following lines.
+ */
+
+# if STDC_HEADERS || HAVE_STRING_H
+# include <string.h>
+# else
+# include <strings.h>
+# endif
+
+#else /* not HAVE_CONFIG_H */
+
+/* Otherwise do it the hard way */
+
+# ifdef BSD
+# include <strings.h>
+# else
+# ifdef VMS
+ extern int strlen(), strncmp();
+# else
+# include <string.h>
+# endif
+# endif
+
+#endif /* not HAVE_CONFIG_H */
+
+#if STDC_HEADERS
+# include <stdlib.h>
+#else
+/*
+ malloc and free should be declared in stdlib.h,
+ but if you've got a K&R compiler, they probably aren't.
+ */
+# ifdef MSDOS
+# include <malloc.h>
+# else
+# ifdef VMS
+ extern char *malloc();
+ extern void free();
+# else
+ extern char *malloc();
+ extern int free();
+# endif
+# endif
+
+#endif
+
+/* Define NULL (for *very* old compilers). */
+#ifndef NULL
+# define NULL (0)
+#endif
+
+/*
+ * The ctype macros don't always handle 8-bit characters correctly.
+ * Compensate for this here.
+ */
+#ifdef isascii
+# undef HAVE_ISASCII /* just in case */
+# define HAVE_ISASCII 1
+#else
+#endif
+#if STDC_HEADERS || !HAVE_ISASCII
+# define is_ascii(c) 1
+#else
+# define is_ascii(c) isascii(c)
+#endif
+
+#define is_space(c) (is_ascii(c) && isspace(c))
+#define is_alpha(c) (is_ascii(c) && isalpha(c))
+#define is_alnum(c) (is_ascii(c) && isalnum(c))
+
+/* Scanning macros */
+#define isidchar(ch) (is_alnum(ch) || (ch) == '_')
+#define isidfirstchar(ch) (is_alpha(ch) || (ch) == '_')
+
+/* Forward references */
+char *skipspace();
+char *scanstring();
+int writeblanks();
+int test1();
+int convert1();
+
+/* The main program */
+int
+main(argc, argv)
+ int argc;
+ char *argv[];
+{ FILE *in = stdin;
+ FILE *out = stdout;
+ char *filename = 0;
+ char *program_name = argv[0];
+ char *output_name = 0;
+#define bufsize 5000 /* arbitrary size */
+ char *buf;
+ char *line;
+ char *more;
+ char *usage =
+ "Usage: ansi2knr [--filename FILENAME] [INPUT_FILE [OUTPUT_FILE]]\n";
+ /*
+ * In previous versions, ansi2knr recognized a --varargs switch.
+ * If this switch was supplied, ansi2knr would attempt to convert
+ * a ... argument to va_alist and va_dcl; if this switch was not
+ * supplied, ansi2knr would simply drop any such arguments.
+ * Now, ansi2knr always does this conversion, and we only
+ * check for this switch for backward compatibility.
+ */
+ int convert_varargs = 1;
+ int output_error;
+
+ while ( argc > 1 && argv[1][0] == '-' ) {
+ if ( !strcmp(argv[1], "--varargs") ) {
+ convert_varargs = 1;
+ argc--;
+ argv++;
+ continue;
+ }
+ if ( !strcmp(argv[1], "--filename") && argc > 2 ) {
+ filename = argv[2];
+ argc -= 2;
+ argv += 2;
+ continue;
+ }
+ fprintf(stderr, "%s: Unrecognized switch: %s\n", program_name,
+ argv[1]);
+ fprintf(stderr, usage);
+ exit(1);
+ }
+ switch ( argc )
+ {
+ default:
+ fprintf(stderr, usage);
+ exit(0);
+ case 3:
+ output_name = argv[2];
+ out = fopen(output_name, "w");
+ if ( out == NULL ) {
+ fprintf(stderr, "%s: Cannot open output file %s\n",
+ program_name, output_name);
+ exit(1);
+ }
+ /* falls through */
+ case 2:
+ in = fopen(argv[1], "r");
+ if ( in == NULL ) {
+ fprintf(stderr, "%s: Cannot open input file %s\n",
+ program_name, argv[1]);
+ exit(1);
+ }
+ if ( filename == 0 )
+ filename = argv[1];
+ /* falls through */
+ case 1:
+ break;
+ }
+ if ( filename )
+ fprintf(out, "#line 1 \"%s\"\n", filename);
+ buf = malloc(bufsize);
+ if ( buf == NULL )
+ {
+ fprintf(stderr, "Unable to allocate read buffer!\n");
+ exit(1);
+ }
+ line = buf;
+ while ( fgets(line, (unsigned)(buf + bufsize - line), in) != NULL )
+ {
+test: line += strlen(line);
+ switch ( test1(buf) )
+ {
+ case 2: /* a function header */
+ convert1(buf, out, 1, convert_varargs);
+ break;
+ case 1: /* a function */
+ /* Check for a { at the start of the next line. */
+ more = ++line;
+f: if ( line >= buf + (bufsize - 1) ) /* overflow check */
+ goto wl;
+ if ( fgets(line, (unsigned)(buf + bufsize - line), in) == NULL )
+ goto wl;
+ switch ( *skipspace(more, 1) )
+ {
+ case '{':
+ /* Definitely a function header. */
+ convert1(buf, out, 0, convert_varargs);
+ fputs(more, out);
+ break;
+ case 0:
+ /* The next line was blank or a comment: */
+ /* keep scanning for a non-comment. */
+ line += strlen(line);
+ goto f;
+ default:
+ /* buf isn't a function header, but */
+ /* more might be. */
+ fputs(buf, out);
+ strcpy(buf, more);
+ line = buf;
+ goto test;
+ }
+ break;
+ case -1: /* maybe the start of a function */
+ if ( line != buf + (bufsize - 1) ) /* overflow check */
+ continue;
+ /* falls through */
+ default: /* not a function */
+wl: fputs(buf, out);
+ break;
+ }
+ line = buf;
+ }
+ if ( line != buf )
+ fputs(buf, out);
+ free(buf);
+ if ( output_name ) {
+ output_error = ferror(out);
+ output_error |= fclose(out);
+ } else { /* out == stdout */
+ fflush(out);
+ output_error = ferror(out);
+ }
+ if ( output_error ) {
+ fprintf(stderr, "%s: error writing to %s\n", program_name,
+ (output_name ? output_name : "stdout"));
+ exit(1);
+ }
+ if ( in != stdin )
+ fclose(in);
+ return 0;
+}
+
+/* Skip over whitespace and comments, in either direction. */
+char *
+skipspace(p, dir)
+ register char *p;
+ register int dir; /* 1 for forward, -1 for backward */
+{ for ( ; ; )
+ { while ( is_space(*p) )
+ p += dir;
+ if ( !(*p == '/' && p[dir] == '*') )
+ break;
+ p += dir; p += dir;
+ while ( !(*p == '*' && p[dir] == '/') )
+ { if ( *p == 0 )
+ return p; /* multi-line comment?? */
+ p += dir;
+ }
+ p += dir; p += dir;
+ }
+ return p;
+}
+
+/* Scan over a quoted string, in either direction. */
+char *
+scanstring(p, dir)
+ register char *p;
+ register int dir;
+{
+ for (p += dir; ; p += dir)
+ if (*p == '"' && p[-dir] != '\\')
+ return p + dir;
+}
+
+/*
+ * Write blanks over part of a string.
+ * Don't overwrite end-of-line characters.
+ */
+int
+writeblanks(start, end)
+ char *start;
+ char *end;
+{ char *p;
+ for ( p = start; p < end; p++ )
+ if ( *p != '\r' && *p != '\n' )
+ *p = ' ';
+ return 0;
+}
+
+/*
+ * Test whether the string in buf is a function definition.
+ * The string may contain and/or end with a newline.
+ * Return as follows:
+ * 0 - definitely not a function definition;
+ * 1 - definitely a function definition;
+ * 2 - definitely a function prototype (NOT USED);
+ * -1 - may be the beginning of a function definition,
+ * append another line and look again.
+ * The reason we don't attempt to convert function prototypes is that
+ * Ghostscript's declaration-generating macros look too much like
+ * prototypes, and confuse the algorithms.
+ */
+int
+test1(buf)
+ char *buf;
+{ register char *p = buf;
+ char *bend;
+ char *endfn;
+ int contin;
+
+ if ( !isidfirstchar(*p) )
+ return 0; /* no name at left margin */
+ bend = skipspace(buf + strlen(buf) - 1, -1);
+ switch ( *bend )
+ {
+ case ';': contin = 0 /*2*/; break;
+ case ')': contin = 1; break;
+ case '{': return 0; /* not a function */
+ case '}': return 0; /* not a function */
+ default: contin = -1;
+ }
+ while ( isidchar(*p) )
+ p++;
+ endfn = p;
+ p = skipspace(p, 1);
+ if ( *p++ != '(' )
+ return 0; /* not a function */
+ p = skipspace(p, 1);
+ if ( *p == ')' )
+ return 0; /* no parameters */
+ /* Check that the apparent function name isn't a keyword. */
+ /* We only need to check for keywords that could be followed */
+ /* by a left parenthesis (which, unfortunately, is most of them). */
+ { static char *words[] =
+ { "asm", "auto", "case", "char", "const", "double",
+ "extern", "float", "for", "if", "int", "long",
+ "register", "return", "short", "signed", "sizeof",
+ "static", "switch", "typedef", "unsigned",
+ "void", "volatile", "while", 0
+ };
+ char **key = words;
+ char *kp;
+ unsigned len = endfn - buf;
+
+ while ( (kp = *key) != 0 )
+ { if ( strlen(kp) == len && !strncmp(kp, buf, len) )
+ return 0; /* name is a keyword */
+ key++;
+ }
+ }
+ {
+ char *id = p;
+ int len;
+ /*
+ * Check for identifier1(identifier2) and not
+ * identifier1(void), or identifier1(identifier2, xxxx).
+ */
+
+ while ( isidchar(*p) )
+ p++;
+ len = p - id;
+ p = skipspace(p, 1);
+ if (*p == ',' ||
+ (*p == ')' && (len != 4 || strncmp(id, "void", 4)))
+ )
+ return 0; /* not a function */
+ }
+ /*
+ * If the last significant character was a ), we need to count
+ * parentheses, because it might be part of a formal parameter
+ * that is a procedure.
+ */
+ if (contin > 0) {
+ int level = 0;
+
+ for (p = skipspace(buf, 1); *p; p = skipspace(p + 1, 1))
+ level += (*p == '(' ? 1 : *p == ')' ? -1 : 0);
+ if (level > 0)
+ contin = -1;
+ }
+ return contin;
+}
+
+/* Convert a recognized function definition or header to K&R syntax. */
+int
+convert1(buf, out, header, convert_varargs)
+ char *buf;
+ FILE *out;
+ int header; /* Boolean */
+ int convert_varargs; /* Boolean */
+{ char *endfn;
+ register char *p;
+ /*
+ * The breaks table contains pointers to the beginning and end
+ * of each argument.
+ */
+ char **breaks;
+ unsigned num_breaks = 2; /* for testing */
+ char **btop;
+ char **bp;
+ char **ap;
+ char *vararg = 0;
+
+ /* Pre-ANSI implementations don't agree on whether strchr */
+ /* is called strchr or index, so we open-code it here. */
+ for ( endfn = buf; *(endfn++) != '('; )
+ ;
+top: p = endfn;
+ breaks = (char **)malloc(sizeof(char *) * num_breaks * 2);
+ if ( breaks == NULL )
+ { /* Couldn't allocate break table, give up */
+ fprintf(stderr, "Unable to allocate break table!\n");
+ fputs(buf, out);
+ return -1;
+ }
+ btop = breaks + num_breaks * 2 - 2;
+ bp = breaks;
+ /* Parse the argument list */
+ do
+ { int level = 0;
+ char *lp = NULL;
+ char *rp = NULL;
+ char *end = NULL;
+
+ if ( bp >= btop )
+ { /* Filled up break table. */
+ /* Allocate a bigger one and start over. */
+ free((char *)breaks);
+ num_breaks <<= 1;
+ goto top;
+ }
+ *bp++ = p;
+ /* Find the end of the argument */
+ for ( ; end == NULL; p++ )
+ { switch(*p)
+ {
+ case ',':
+ if ( !level ) end = p;
+ break;
+ case '(':
+ if ( !level ) lp = p;
+ level++;
+ break;
+ case ')':
+ if ( --level < 0 ) end = p;
+ else rp = p;
+ break;
+ case '/':
+ if (p[1] == '*')
+ p = skipspace(p, 1) - 1;
+ break;
+ case '"':
+ p = scanstring(p, 1) - 1;
+ break;
+ default:
+ ;
+ }
+ }
+ /* Erase any embedded prototype parameters. */
+ if ( lp && rp )
+ writeblanks(lp + 1, rp);
+ p--; /* back up over terminator */
+ /* Find the name being declared. */
+ /* This is complicated because of procedure and */
+ /* array modifiers. */
+ for ( ; ; )
+ { p = skipspace(p - 1, -1);
+ switch ( *p )
+ {
+ case ']': /* skip array dimension(s) */
+ case ')': /* skip procedure args OR name */
+ { int level = 1;
+ while ( level )
+ switch ( *--p )
+ {
+ case ']': case ')':
+ level++;
+ break;
+ case '[': case '(':
+ level--;
+ break;
+ case '/':
+ if (p > buf && p[-1] == '*')
+ p = skipspace(p, -1) + 1;
+ break;
+ case '"':
+ p = scanstring(p, -1) + 1;
+ break;
+ default: ;
+ }
+ }
+ if ( *p == '(' && *skipspace(p + 1, 1) == '*' )
+ { /* We found the name being declared */
+ while ( !isidfirstchar(*p) )
+ p = skipspace(p, 1) + 1;
+ goto found;
+ }
+ break;
+ default:
+ goto found;
+ }
+ }
+found: if ( *p == '.' && p[-1] == '.' && p[-2] == '.' )
+ { if ( convert_varargs )
+ { *bp++ = "va_alist";
+ vararg = p-2;
+ }
+ else
+ { p++;
+ if ( bp == breaks + 1 ) /* sole argument */
+ writeblanks(breaks[0], p);
+ else
+ writeblanks(bp[-1] - 1, p);
+ bp--;
+ }
+ }
+ else
+ { while ( isidchar(*p) ) p--;
+ *bp++ = p+1;
+ }
+ p = end;
+ }
+ while ( *p++ == ',' );
+ *bp = p;
+ /* Make a special check for 'void' arglist */
+ if ( bp == breaks+2 )
+ { p = skipspace(breaks[0], 1);
+ if ( !strncmp(p, "void", 4) )
+ { p = skipspace(p+4, 1);
+ if ( p == breaks[2] - 1 )
+ { bp = breaks; /* yup, pretend arglist is empty */
+ writeblanks(breaks[0], p + 1);
+ }
+ }
+ }
+ /* Put out the function name and left parenthesis. */
+ p = buf;
+ while ( p != endfn ) putc(*p, out), p++;
+ /* Put out the declaration. */
+ if ( header )
+ { fputs(");", out);
+ for ( p = breaks[0]; *p; p++ )
+ if ( *p == '\r' || *p == '\n' )
+ putc(*p, out);
+ }
+ else
+ { for ( ap = breaks+1; ap < bp; ap += 2 )
+ { p = *ap;
+ while ( isidchar(*p) )
+ putc(*p, out), p++;
+ if ( ap < bp - 1 )
+ fputs(", ", out);
+ }
+ fputs(") ", out);
+ /* Put out the argument declarations */
+ for ( ap = breaks+2; ap <= bp; ap += 2 )
+ (*ap)[-1] = ';';
+ if ( vararg != 0 )
+ { *vararg = 0;
+ fputs(breaks[0], out); /* any prior args */
+ fputs("va_dcl", out); /* the final arg */
+ fputs(bp[0], out);
+ }
+ else
+ fputs(breaks[0], out);
+ }
+ free((char *)breaks);
+ return 0;
+}
diff --git a/rts/gmp/assert.c b/rts/gmp/assert.c
new file mode 100644
index 0000000000..65eccfa30b
--- /dev/null
+++ b/rts/gmp/assert.c
@@ -0,0 +1,52 @@
+/* GMP assertion failure handler. */
+
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+int
+#if __STDC__
+__gmp_assert_fail (const char *filename, int linenum,
+ const char *expr)
+#else
+__gmp_assert_fail (filename, linenum, expr)
+char *filename;
+int linenum;
+char *expr;
+#endif
+{
+ if (filename != NULL && filename[0] != '\0')
+ {
+ fprintf (stderr, "%s:", filename);
+ if (linenum != -1)
+ fprintf (stderr, "%d: ", linenum);
+ }
+
+ fprintf (stderr, "GNU MP assertion failed: %s\n", expr);
+ abort();
+
+ /*NOTREACHED*/
+ return 0;
+}
diff --git a/rts/gmp/compat.c b/rts/gmp/compat.c
new file mode 100644
index 0000000000..ab7529f52f
--- /dev/null
+++ b/rts/gmp/compat.c
@@ -0,0 +1,46 @@
+/* Old function entrypoints retained for binary compatibility. */
+
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* mpn_divexact_by3 was a function in gmp 3.0, but as of gmp 3.1 it's a
+ macro calling mpn_divexact_by3c. */
+int
+__MPN (divexact_by3) (mp_ptr dst, mp_srcptr src, mp_size_t size)
+{
+ mpn_divexact_by3 (dst, src, size);
+}
+
+
+/* mpn_divmod_1 was a function in gmp 3.0 and earlier, but marked obsolete
+ in gmp 2 and 3. As of gmp 3.1 it's a macro calling mpn_divrem_1. */
+int
+__MPN (divmod_1) (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
+{
+ mpn_divmod_1 (dst, src, size, divisor);
+}
+
+
diff --git a/rts/gmp/config.guess b/rts/gmp/config.guess
new file mode 100644
index 0000000000..08018f497d
--- /dev/null
+++ b/rts/gmp/config.guess
@@ -0,0 +1,1373 @@
+#! /bin/sh
+# Attempt to guess a canonical system name.
+#
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000
+# Free Software Foundation, Inc.
+#
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Written by Per Bothner <bothner@cygnus.com>.
+# Please send patches to <config-patches@gnu.org>.
+#
+# This script attempts to guess a canonical system name similar to
+# config.sub. If it succeeds, it prints the system name on stdout, and
+# exits with 0. Otherwise, it exits with 1.
+#
+# The plan is that this can be called by configure scripts if you
+# don't specify an explicit system type (host/target name).
+#
+# Only a few systems have been added to this list; please add others
+# (but try to keep the structure clean).
+#
+
+
+# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
+# (ghazi@noc.rutgers.edu 8/24/94.)
+if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
+ PATH=$PATH:/.attbin ; export PATH
+fi
+
+UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
+UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
+UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
+
+dummy=dummy-$$
+trap 'rm -f $dummy.c $dummy.o $dummy ${dummy}1.s ${dummy}2.c ; exit 1' 1 2 15
+
+# Use $HOST_CC if defined. $CC may point to a cross-compiler
+if test x"$CC_FOR_BUILD" = x; then
+ if test x"$HOST_CC" != x; then
+ CC_FOR_BUILD="$HOST_CC"
+ else
+ if test x"$CC" != x; then
+ CC_FOR_BUILD="$CC"
+ else
+ echo 'dummy(){}' >$dummy.c
+ for c in cc c89 gcc; do
+ ($c $dummy.c -c) >/dev/null 2>&1
+ if test $? = 0; then
+ CC_FOR_BUILD="$c"; break
+ fi
+ done
+ rm -f $dummy.c $dummy.o
+ if test x"$CC_FOR_BUILD" = x; then
+ CC_FOR_BUILD=no_compiler_found
+ fi
+ fi
+ fi
+fi
+
+
+# First make a best effort at recognizing x86 CPU type and leave it in X86CPU.
+# If we fail, set X86CPU to UNAME_MACHINE
+#
+# DJGPP v2 (or 2.03 at least) always gives "pc" for uname -m, and the
+# OEM for uname -s. Eg. pc:MS-DOS:6:2 on MS-DOS 6.21. The list of
+# possible OEMs is in src/libc/dos/dos/getdos_v.c of djlsr203.zip, but
+# just pc:*:*:* seems ok.
+
+case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+ i?86:*:*:* | i86pc:*:*:* | pc:*:*:*)
+ case "${UNAME_MACHINE}" in
+ i86pc | pc) UNAME_MACHINE=i386 ;;
+ esac
+ cat <<EOF >${dummy}1.s
+ .globl cpuid
+ .globl _cpuid
+cpuid:
+_cpuid:
+ pushl %esi
+ pushl %ebx
+ movl 16(%esp),%eax
+ .byte 0x0f
+ .byte 0xa2
+ movl 12(%esp),%esi
+ movl %ebx,(%esi)
+ movl %edx,4(%esi)
+ movl %ecx,8(%esi)
+ popl %ebx
+ popl %esi
+ ret
+EOF
+ cat <<EOF >${dummy}2.c
+main ()
+{
+ char vendor_string[13];
+ char dummy_string[12];
+ long fms;
+ int family, model;
+ char *modelstr;
+
+ cpuid (vendor_string, 0);
+ vendor_string[12] = 0;
+
+ fms = cpuid (dummy_string, 1);
+
+ family = (fms >> 8) & 15;
+ model = (fms >> 4) & 15;
+
+ modelstr = "i486";
+ if (strcmp (vendor_string, "GenuineIntel") == 0)
+ {
+ switch (family)
+ {
+ case 5:
+ if (model <= 2)
+ modelstr = "pentium";
+ else if (model >= 4)
+ modelstr = "pentiummmx";
+ break;
+ case 6:
+ if (model == 1)
+ modelstr = "pentiumpro";
+ else if (model <= 6)
+ modelstr = "pentium2";
+ else
+ modelstr = "pentium3";
+ break;
+ }
+ }
+ else if (strcmp (vendor_string, "AuthenticAMD") == 0)
+ {
+ switch (family)
+ {
+ case 5:
+ if (model <= 3)
+ modelstr = "k5";
+ else if (model <= 7)
+ modelstr = "k6";
+ else if (model <= 8)
+ modelstr = "k62";
+ else if (model <= 9)
+ modelstr = "k63";
+ break;
+ case 6:
+ modelstr = "athlon";
+ break;
+ }
+ }
+ else if (strcmp (vendor_string, "CyrixInstead") == 0)
+ {
+ /* Should recognize Cyrix' processors too. */
+ }
+
+ printf ("%s\n", modelstr);
+ return 0;
+}
+EOF
+ $CC_FOR_BUILD ${dummy}1.s ${dummy}2.c -o $dummy >/dev/null 2>&1
+ if test "$?" = 0 ; then
+ X86CPU=`./$dummy`
+ fi
+
+
+ # Default to believing uname -m if the program fails to compile or
+ # run. Will fail to run on 386 since cpuid was only added on 486.
+ if test -z "$X86CPU"
+ then
+ X86CPU="$UNAME_MACHINE"
+ fi
+ rm -f ${dummy}1.s ${dummy}2.c $dummy
+ ;;
+esac
+
+# Note: order is significant - the case branches are not exclusive.
+
+case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+ alpha:OSF1:*:*)
+ if test $UNAME_RELEASE = "V4.0"; then
+ UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
+ fi
+ # A Vn.n version is a released version.
+ # A Tn.n version is a released field test version.
+ # A Xn.n version is an unreleased experimental baselevel.
+ # 1.2 uses "1.2" for uname -r.
+ cat <<EOF >$dummy.s
+ .data
+\$Lformat:
+ .byte 37,100,45,37,120,10,0 # "%d-%x\n"
+
+ .text
+ .globl main
+ .align 4
+ .ent main
+main:
+ .frame \$30,16,\$26,0
+ ldgp \$29,0(\$27)
+ .prologue 1
+ .long 0x47e03d80 # implver \$0
+ lda \$2,-1
+ .long 0x47e20c21 # amask \$2,\$1
+ lda \$16,\$Lformat
+ mov \$0,\$17
+ not \$1,\$18
+ jsr \$26,printf
+ ldgp \$29,0(\$26)
+ mov 0,\$16
+ jsr \$26,exit
+ .end main
+EOF
+ $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null
+ if test "$?" = 0 ; then
+ case `./$dummy` in
+ 0-0)
+ UNAME_MACHINE="alpha"
+ ;;
+ 1-0)
+ UNAME_MACHINE="alphaev5"
+ ;;
+ 1-1)
+ UNAME_MACHINE="alphaev56"
+ ;;
+ 1-101)
+ UNAME_MACHINE="alphapca56"
+ ;;
+ 2-303)
+ UNAME_MACHINE="alphaev6"
+ ;;
+ 2-307)
+ UNAME_MACHINE="alphaev67"
+ ;;
+ esac
+ fi
+ rm -f $dummy.s $dummy
+ echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[VTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+ exit 0 ;;
+ alpha:NetBSD:*:* | alpha:FreeBSD:*:*)
+ cat <<EOF >$dummy.s
+ .globl main
+ .ent main
+main:
+ .frame \$30,0,\$26,0
+ .prologue 0
+ .long 0x47e03d80 # implver $0
+ lda \$2,259
+ .long 0x47e20c21 # amask $2,$1
+ srl \$1,8,\$2
+ sll \$2,2,\$2
+ sll \$0,3,\$0
+ addl \$1,\$0,\$0
+ addl \$2,\$0,\$0
+ ret \$31,(\$26),1
+ .end main
+EOF
+ $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null
+ if test "$?" = 0 ; then
+ ./$dummy
+ case "$?" in
+ 7)
+ UNAME_MACHINE="alpha"
+ ;;
+ 15)
+ UNAME_MACHINE="alphaev5"
+ ;;
+ 14)
+ UNAME_MACHINE="alphaev56"
+ ;;
+ 10)
+ UNAME_MACHINE="alphapca56"
+ ;;
+ 16)
+ UNAME_MACHINE="alphaev6"
+ ;;
+ esac
+ fi
+ rm -f $dummy.s $dummy
+ echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM}${UNAME_RELEASE} | sed -e 's/^[VTX]//' -e 's/[-(].*//' | tr [[A-Z]] [[a-z]]`
+ exit 0 ;;
+ Alpha\ *:Windows_NT*:*)
+ # How do we know it's Interix rather than the generic POSIX subsystem?
+ # Should we change UNAME_MACHINE based on the output of uname instead
+ # of the specific Alpha model?
+ echo alpha-pc-interix
+ exit 0 ;;
+ 21064:Windows_NT:50:3)
+ echo alpha-dec-winnt3.5
+ exit 0 ;;
+ Amiga*:UNIX_System_V:4.0:*)
+ echo m68k-cbm-sysv4
+ exit 0;;
+ amiga:NetBSD:*:*)
+ echo m68k-cbm-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ amiga:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ *:[Aa]miga[Oo][Ss]:*:*)
+ echo ${UNAME_MACHINE}-unknown-amigaos
+ exit 0 ;;
+ arc64:OpenBSD:*:*)
+ echo mips64el-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ arc:OpenBSD:*:*)
+ echo mipsel-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ hkmips:OpenBSD:*:*)
+ echo mips-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ pmax:OpenBSD:*:*)
+ echo mipsel-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ sgi:OpenBSD:*:*)
+ echo mips-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ wgrisc:OpenBSD:*:*)
+ echo mipsel-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ *:OS/390:*:*)
+ echo i370-ibm-openedition
+ exit 0 ;;
+ arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
+ echo arm-acorn-riscix${UNAME_RELEASE}
+ exit 0;;
+ arm32:NetBSD:*:*)
+ echo arm-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+ exit 0 ;;
+ SR2?01:HI-UX/MPP:*:*)
+ echo hppa1.1-hitachi-hiuxmpp
+ exit 0;;
+ Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
+ # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
+ if test "`(/bin/universe) 2>/dev/null`" = att ; then
+ echo pyramid-pyramid-sysv3
+ else
+ echo pyramid-pyramid-bsd
+ fi
+ exit 0 ;;
+ NILE*:*:*:dcosx)
+ echo pyramid-pyramid-svr4
+ exit 0 ;;
+ sun4H:SunOS:5.*:*)
+ echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ sun4[md]:SunOS:5.*:*)
+ echo sparcv8-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ sun4u:SunOS:5.*:*)
+ echo sparcv9-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
+ echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ i386:SunOS:5.*:*)
+ echo ${X86CPU}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ sun4[md]:SunOS:*:*)
+ case "`/usr/bin/arch -k`" in
+ Series*|S4*)
+ UNAME_RELEASE=`uname -v`
+ ;;
+ esac
+ # Japanese Language versions have a version number like `4.1.3-JL'.
+ echo sparcv8-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
+ exit 0 ;;
+ sun4*:SunOS:*:*)
+ case "`/usr/bin/arch -k`" in
+ Series*|S4*)
+ UNAME_RELEASE=`uname -v`
+ ;;
+ esac
+ # Japanese Language versions have a version number like `4.1.3-JL'.
+ echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
+ exit 0 ;;
+ sun3*:SunOS:*:*)
+ echo m68k-sun-sunos${UNAME_RELEASE}
+ exit 0 ;;
+ sun*:*:4.2BSD:*)
+ UNAME_RELEASE=`(head -1 /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
+ test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
+ case "`/bin/arch`" in
+ sun3)
+ echo m68k-sun-sunos${UNAME_RELEASE}
+ ;;
+ sun4)
+ echo sparc-sun-sunos${UNAME_RELEASE}
+ ;;
+ esac
+ exit 0 ;;
+ aushp:SunOS:*:*)
+ echo sparc-auspex-sunos${UNAME_RELEASE}
+ exit 0 ;;
+ atari*:NetBSD:*:*)
+ echo m68k-atari-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ atari*:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ # The situation for MiNT is a little confusing. The machine name
+ # can be virtually everything (everything which is not
+ # "atarist" or "atariste" at least should have a processor
+ # > m68000). The system name ranges from "MiNT" over "FreeMiNT"
+ # to the lowercase version "mint" (or "freemint"). Finally
+ # the system name "TOS" denotes a system which is actually not
+ # MiNT. But MiNT is downward compatible to TOS, so this should
+ # be no problem.
+ atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
+ echo m68k-atari-mint${UNAME_RELEASE}
+ exit 0 ;;
+ atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
+ echo m68k-atari-mint${UNAME_RELEASE}
+ exit 0 ;;
+ *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
+ echo m68k-atari-mint${UNAME_RELEASE}
+ exit 0 ;;
+ milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
+ echo m68k-milan-mint${UNAME_RELEASE}
+ exit 0 ;;
+ hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
+ echo m68k-hades-mint${UNAME_RELEASE}
+ exit 0 ;;
+ *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
+ echo m68k-unknown-mint${UNAME_RELEASE}
+ exit 0 ;;
+ sun3*:NetBSD:*:*)
+ echo m68k-sun-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ sun3*:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ mac68k:NetBSD:*:*)
+ echo m68k-apple-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ mac68k:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ macppc:NetBSD:*:*)
+ echo powerpc-apple-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ mvme68k:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ mvme88k:OpenBSD:*:*)
+ echo m88k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ powerpc:machten:*:*)
+ echo powerpc-apple-machten${UNAME_RELEASE}
+ exit 0 ;;
+ RISC*:Mach:*:*)
+ echo mips-dec-mach_bsd4.3
+ exit 0 ;;
+ RISC*:ULTRIX:*:*)
+ echo mips-dec-ultrix${UNAME_RELEASE}
+ exit 0 ;;
+ VAX*:ULTRIX*:*:*)
+ echo vax-dec-ultrix${UNAME_RELEASE}
+ exit 0 ;;
+ 2020:CLIX:*:* | 2430:CLIX:*:*)
+ echo clipper-intergraph-clix${UNAME_RELEASE}
+ exit 0 ;;
+ mips:*:*:UMIPS | mips:*:*:RISCos)
+ sed 's/^ //' << EOF >$dummy.c
+#ifdef __cplusplus
+#include <stdio.h> /* for printf() prototype */
+ int main (int argc, char *argv[]) {
+#else
+ int main (argc, argv) int argc; char *argv[]; {
+#endif
+ #if defined (host_mips) && defined (MIPSEB)
+ #if defined (SYSTYPE_SYSV)
+ printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
+ #endif
+ #if defined (SYSTYPE_SVR4)
+ printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
+ #endif
+ #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
+ printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
+ #endif
+ #endif
+ exit (-1);
+ }
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy \
+ && ./$dummy `echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` \
+ && rm $dummy.c $dummy && exit 0
+ rm -f $dummy.c $dummy
+ echo mips-mips-riscos${UNAME_RELEASE}
+ exit 0 ;;
+ Night_Hawk:Power_UNIX:*:*)
+ echo powerpc-harris-powerunix
+ exit 0 ;;
+ m88k:CX/UX:7*:*)
+ echo m88k-harris-cxux7
+ exit 0 ;;
+ m88k:*:4*:R4*)
+ echo m88k-motorola-sysv4
+ exit 0 ;;
+ m88k:*:3*:R3*)
+ echo m88k-motorola-sysv3
+ exit 0 ;;
+ AViiON:dgux:*:*)
+ # DG/UX returns AViiON for all architectures
+ UNAME_PROCESSOR=`/usr/bin/uname -p`
+ if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
+ then
+ if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
+ [ ${TARGET_BINARY_INTERFACE}x = x ]
+ then
+ echo m88k-dg-dgux${UNAME_RELEASE}
+ else
+ echo m88k-dg-dguxbcs${UNAME_RELEASE}
+ fi
+ else
+ echo i586-dg-dgux${UNAME_RELEASE}
+ fi
+ exit 0 ;;
+ M88*:DolphinOS:*:*) # DolphinOS (SVR3)
+ echo m88k-dolphin-sysv3
+ exit 0 ;;
+ M88*:*:R3*:*)
+ # Delta 88k system running SVR3
+ echo m88k-motorola-sysv3
+ exit 0 ;;
+ XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
+ echo m88k-tektronix-sysv3
+ exit 0 ;;
+ Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
+ echo m68k-tektronix-bsd
+ exit 0 ;;
+ *:IRIX*:*:*)
+ echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
+ exit 0 ;;
+ ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
+ echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id
+ exit 0 ;; # Note that: echo "'`uname -s`'" gives 'AIX '
+ i?86:AIX:*:*)
+ echo i386-ibm-aix
+ exit 0 ;;
+ *:AIX:2:3)
+ if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
+ sed 's/^ //' << EOF >$dummy.c
+ #include <sys/systemcfg.h>
+
+ main()
+ {
+ if (!__power_pc())
+ exit(1);
+ puts("powerpc-ibm-aix3.2.5");
+ exit(0);
+ }
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy && ./$dummy && rm $dummy.c $dummy && exit 0
+ rm -f $dummy.c $dummy
+ echo rs6000-ibm-aix3.2.5
+ elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
+ echo rs6000-ibm-aix3.2.4
+ else
+ echo rs6000-ibm-aix3.2
+ fi
+ exit 0 ;;
+ *:AIX:*:4)
+ sed 's/^ //' << EOF >$dummy.c
+ #include <stdio.h>
+ #include <sys/systemcfg.h>
+ main ()
+ {
+ if (_system_configuration.architecture == POWER_RS
+ || _system_configuration.implementation == POWER_601)
+ puts ("power");
+ else
+ {
+ if (_system_configuration.width == 64)
+ puts ("powerpc64");
+ else
+ puts ("powerpc");
+ }
+ exit (0);
+ }
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy
+ IBM_ARCH=`./$dummy`
+ rm -f $dummy.c $dummy
+ if [ -x /usr/bin/oslevel ] ; then
+ IBM_REV=`/usr/bin/oslevel`
+ else
+ IBM_REV=4.${UNAME_RELEASE}
+ fi
+ echo ${IBM_ARCH}-ibm-aix${IBM_REV}
+ exit 0 ;;
+ *:AIX:*:*)
+ echo rs6000-ibm-aix
+ exit 0 ;;
+ ibmrt:4.4BSD:*|romp-ibm:BSD:*)
+ echo romp-ibm-bsd4.4
+ exit 0 ;;
+ ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and
+ echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to
+ exit 0 ;; # report: romp-ibm BSD 4.3
+ *:BOSX:*:*)
+ echo rs6000-bull-bosx
+ exit 0 ;;
+ DPX/2?00:B.O.S.:*:*)
+ echo m68k-bull-sysv3
+ exit 0 ;;
+ 9000/[34]??:4.3bsd:1.*:*)
+ echo m68k-hp-bsd
+ exit 0 ;;
+ hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
+ echo m68k-hp-bsd4.4
+ exit 0 ;;
+ 9000/[34678]??:HP-UX:*:*)
+ case "${UNAME_MACHINE}" in
+ 9000/31? ) HP_ARCH=m68000 ;;
+ 9000/[34]?? ) HP_ARCH=m68k ;;
+ 9000/[678][0-9][0-9])
+ sed 's/^ //' << EOF >$dummy.c
+
+ #define _HPUX_SOURCE
+ #include <stdlib.h>
+ #include <unistd.h>
+
+ int main ()
+ {
+ #if defined(_SC_KERNEL_BITS)
+ long bits = sysconf(_SC_KERNEL_BITS);
+ #endif
+ long cpu = sysconf (_SC_CPU_VERSION);
+
+ switch (cpu)
+ {
+ case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+ case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+ case CPU_PA_RISC2_0:
+ #if defined(_SC_KERNEL_BITS)
+ switch (bits)
+ {
+ case 64: puts ("hppa2.0w"); break;
+ case 32: puts ("hppa2.0n"); break;
+ default: puts ("hppa2.0"); break;
+ } break;
+ #else /* !defined(_SC_KERNEL_BITS) */
+ puts ("hppa2.0"); break;
+ #endif
+ default: puts ("hppa1.0"); break;
+ }
+ exit (0);
+ }
+EOF
+ (CCOPTS= $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null ) && HP_ARCH=`./$dummy`
+ rm -f $dummy.c $dummy
+ esac
+ HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+ echo ${HP_ARCH}-hp-hpux${HPUX_REV}
+ exit 0 ;;
+ 3050*:HI-UX:*:*)
+ sed 's/^ //' << EOF >$dummy.c
+ #include <unistd.h>
+ int
+ main ()
+ {
+ long cpu = sysconf (_SC_CPU_VERSION);
+ /* The order matters, because CPU_IS_HP_MC68K erroneously returns
+ true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct
+ results, however. */
+ if (CPU_IS_PA_RISC (cpu))
+ {
+ switch (cpu)
+ {
+ case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
+ case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
+ case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
+ default: puts ("hppa-hitachi-hiuxwe2"); break;
+ }
+ }
+ else if (CPU_IS_HP_MC68K (cpu))
+ puts ("m68k-hitachi-hiuxwe2");
+ else puts ("unknown-hitachi-hiuxwe2");
+ exit (0);
+ }
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy && ./$dummy && rm $dummy.c $dummy && exit 0
+ rm -f $dummy.c $dummy
+ echo unknown-hitachi-hiuxwe2
+ exit 0 ;;
+ 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
+ echo hppa1.1-hp-bsd
+ exit 0 ;;
+ 9000/8??:4.3bsd:*:*)
+ echo hppa1.0-hp-bsd
+ exit 0 ;;
+ *9??*:MPE/iX:*:*)
+ echo hppa1.0-hp-mpeix
+ exit 0 ;;
+ hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
+ echo hppa1.1-hp-osf
+ exit 0 ;;
+ hp8??:OSF1:*:*)
+ echo hppa1.0-hp-osf
+ exit 0 ;;
+ i?86:OSF1:*:*)
+ if [ -x /usr/sbin/sysversion ] ; then
+ echo ${UNAME_MACHINE}-unknown-osf1mk
+ else
+ echo ${UNAME_MACHINE}-unknown-osf1
+ fi
+ exit 0 ;;
+ parisc*:Lites*:*:*)
+ echo hppa1.1-hp-lites
+ exit 0 ;;
+ hppa*:OpenBSD:*:*)
+ echo hppa-unknown-openbsd
+ exit 0 ;;
+ C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
+ echo c1-convex-bsd
+ exit 0 ;;
+ C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
+ if getsysinfo -f scalar_acc
+ then echo c32-convex-bsd
+ else echo c2-convex-bsd
+ fi
+ exit 0 ;;
+ C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
+ echo c34-convex-bsd
+ exit 0 ;;
+ C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
+ echo c38-convex-bsd
+ exit 0 ;;
+ C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
+ echo c4-convex-bsd
+ exit 0 ;;
+ CRAY*X-MP:*:*:*)
+ echo xmp-cray-unicos
+ exit 0 ;;
+ CRAY*Y-MP:*:*:*)
+ echo ymp-cray-unicos${UNAME_RELEASE}
+ exit 0 ;;
+ CRAY*[A-Z]90:*:*:*)
+ echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
+ | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
+ -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/
+ exit 0 ;;
+ CRAY*TS:*:*:*)
+ echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+ exit 0 ;;
+ CRAY*T3D:*:*:*)
+ echo alpha-cray-unicos
+ exit 0 ;;
+ CRAY*T3E:*:*:*)
+ echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+ exit 0 ;;
+ CRAY*SV1:*:*:*)
+ echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+ exit 0 ;;
+ CRAY-2:*:*:*)
+ echo cray2-cray-unicos
+ exit 0 ;;
+ F300:UNIX_System_V:*:*)
+ FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+ FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
+ echo "f300-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+ exit 0 ;;
+ F301:UNIX_System_V:*:*)
+ echo f301-fujitsu-uxpv`echo $UNAME_RELEASE | sed 's/ .*//'`
+ exit 0 ;;
+ hp3[0-9][05]:NetBSD:*:*)
+ echo m68k-hp-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ hp300:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ i?86:BSD/386:*:* | i?86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
+ echo ${X86CPU}-pc-bsdi${UNAME_RELEASE}
+ exit 0 ;;
+ sparc*:BSD/OS:*:*)
+ echo sparc-unknown-bsdi${UNAME_RELEASE}
+ exit 0 ;;
+ *:BSD/OS:*:*)
+ echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
+ exit 0 ;;
+ i386:FreeBSD:*:*)
+ echo ${X86CPU}-pc-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+ exit 0 ;;
+ *:FreeBSD:*:*)
+ echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+ exit 0 ;;
+ i386:NetBSD:*:*)
+ echo ${X86CPU}-pc-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+ exit 0 ;;
+ *:NetBSD:*:*)
+ echo ${UNAME_MACHINE}-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+ exit 0 ;;
+ i386:OpenBSD:*:*)
+ echo ${X86CPU}-pc-openbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+ exit 0 ;;
+ *:OpenBSD:*:*)
+ echo ${UNAME_MACHINE}-unknown-openbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+ exit 0 ;;
+ i*:CYGWIN*:*)
+ echo ${X86CPU}-pc-cygwin
+ exit 0 ;;
+ i*:MINGW*:*)
+ echo ${UNAME_MACHINE}-pc-mingw32
+ exit 0 ;;
+ i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
+ # How do we know it's Interix rather than the generic POSIX subsystem?
+ # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
+ # UNAME_MACHINE based on the output of uname instead of i386?
+ echo i386-pc-interix
+ exit 0 ;;
+ i*:UWIN*:*)
+ echo ${UNAME_MACHINE}-pc-uwin
+ exit 0 ;;
+ p*:CYGWIN*:*)
+ echo powerpcle-unknown-cygwin
+ exit 0 ;;
+ prep*:SunOS:5.*:*)
+ echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ *:GNU:*:*)
+ echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+ exit 0 ;;
+ *:Linux:*:*)
+
+ # The BFD linker knows what the default object file format is, so
+ # first see if it will tell us. cd to the root directory to prevent
+ # problems with other programs or directories called `ld' in the path.
+ ld_help_string=`cd /; ld --help 2>&1`
+ ld_supported_emulations=`echo $ld_help_string \
+ | sed -ne '/supported emulations:/!d
+ s/[ ][ ]*/ /g
+ s/.*supported emulations: *//
+ s/ .*//
+ p'`
+ case "$ld_supported_emulations" in
+ *ia64)
+ echo "${UNAME_MACHINE}-unknown-linux"
+ exit 0
+ ;;
+ i?86linux)
+ echo "${X86CPU}-pc-linux-gnuaout"
+ exit 0
+ ;;
+ i?86coff)
+ echo "${X86CPU}-pc-linux-gnucoff"
+ exit 0
+ ;;
+ sparclinux)
+ echo "${UNAME_MACHINE}-unknown-linux-gnuaout"
+ exit 0
+ ;;
+ armlinux)
+ echo "${UNAME_MACHINE}-unknown-linux-gnuaout"
+ exit 0
+ ;;
+ elf32arm*)
+ echo "${UNAME_MACHINE}-unknown-linux-gnuoldld"
+ exit 0
+ ;;
+ armelf_linux*)
+ echo "${UNAME_MACHINE}-unknown-linux-gnu"
+ exit 0
+ ;;
+ m68klinux)
+ echo "${UNAME_MACHINE}-unknown-linux-gnuaout"
+ exit 0
+ ;;
+ elf32ppc | elf32ppclinux)
+ # Determine Lib Version
+ cat >$dummy.c <<EOF
+#include <features.h>
+#if defined(__GLIBC__)
+extern char __libc_version[];
+extern char __libc_release[];
+#endif
+main(argc, argv)
+ int argc;
+ char *argv[];
+{
+#if defined(__GLIBC__)
+ printf("%s %s\n", __libc_version, __libc_release);
+#else
+ printf("unkown\n");
+#endif
+ return 0;
+}
+EOF
+ LIBC=""
+ $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null
+ if test "$?" = 0 ; then
+ ./$dummy | grep 1\.99 > /dev/null
+ if test "$?" = 0 ; then
+ LIBC="libc1"
+ fi
+ fi
+ rm -f $dummy.c $dummy
+ echo powerpc-unknown-linux-gnu${LIBC}
+ exit 0
+ ;;
+ esac
+
+ if test "${UNAME_MACHINE}" = "alpha" ; then
+ cat <<EOF >$dummy.s
+ .data
+ \$Lformat:
+ .byte 37,100,45,37,120,10,0 # "%d-%x\n"
+
+ .text
+ .globl main
+ .align 4
+ .ent main
+ main:
+ .frame \$30,16,\$26,0
+ ldgp \$29,0(\$27)
+ .prologue 1
+ .long 0x47e03d80 # implver \$0
+ lda \$2,-1
+ .long 0x47e20c21 # amask \$2,\$1
+ lda \$16,\$Lformat
+ mov \$0,\$17
+ not \$1,\$18
+ jsr \$26,printf
+ ldgp \$29,0(\$26)
+ mov 0,\$16
+ jsr \$26,exit
+ .end main
+EOF
+ LIBC=""
+ $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null
+ if test "$?" = 0 ; then
+ case `./$dummy` in
+ 0-0)
+ UNAME_MACHINE="alpha"
+ ;;
+ 1-0)
+ UNAME_MACHINE="alphaev5"
+ ;;
+ 1-1)
+ UNAME_MACHINE="alphaev56"
+ ;;
+ 1-101)
+ UNAME_MACHINE="alphapca56"
+ ;;
+ 2-303)
+ UNAME_MACHINE="alphaev6"
+ ;;
+ 2-307)
+ UNAME_MACHINE="alphaev67"
+ ;;
+ esac
+
+ objdump --private-headers $dummy | \
+ grep ld.so.1 > /dev/null
+ if test "$?" = 0 ; then
+ LIBC="libc1"
+ fi
+ fi
+ rm -f $dummy.s $dummy
+ echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} ; exit 0
+ elif test "${UNAME_MACHINE}" = "mips" ; then
+ cat >$dummy.c <<EOF
+#ifdef __cplusplus
+#include <stdio.h> /* for printf() prototype */
+ int main (int argc, char *argv[]) {
+#else
+ int main (argc, argv) int argc; char *argv[]; {
+#endif
+#ifdef __MIPSEB__
+ printf ("%s-unknown-linux-gnu\n", argv[1]);
+#endif
+#ifdef __MIPSEL__
+ printf ("%sel-unknown-linux-gnu\n", argv[1]);
+#endif
+ return 0;
+}
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
+ rm -f $dummy.c $dummy
+ elif test "${UNAME_MACHINE}" = "s390"; then
+ echo s390-ibm-linux && exit 0
+ else
+ # Either a pre-BFD a.out linker (linux-gnuoldld)
+ # or one that does not give us useful --help.
+ # GCC wants to distinguish between linux-gnuoldld and linux-gnuaout.
+ # If ld does not provide *any* "supported emulations:"
+ # that means it is gnuoldld.
+ echo "$ld_help_string" | grep >/dev/null 2>&1 "supported emulations:"
+ test $? != 0 && echo "${X86CPU}-pc-linux-gnuoldld" && exit 0
+
+ case "${UNAME_MACHINE}" in
+ i?86)
+ VENDOR=pc;
+ UNAME_MACHINE=${X86CPU}
+ ;;
+ *)
+ VENDOR=unknown;
+ ;;
+ esac
+ # Determine whether the default compiler is a.out or elf
+ cat >$dummy.c <<EOF
+#include <features.h>
+#ifdef __cplusplus
+#include <stdio.h> /* for printf() prototype */
+ int main (int argc, char *argv[]) {
+#else
+ int main (argc, argv) int argc; char *argv[]; {
+#endif
+#ifdef __ELF__
+# ifdef __GLIBC__
+# if __GLIBC__ >= 2
+ printf ("%s-${VENDOR}-linux-gnu\n", argv[1]);
+# else
+ printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]);
+# endif
+# else
+ printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]);
+# endif
+#else
+ printf ("%s-${VENDOR}-linux-gnuaout\n", argv[1]);
+#endif
+ return 0;
+}
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
+ rm -f $dummy.c $dummy
+ fi ;;
+# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. earlier versions
+# are messed up and put the nodename in both sysname and nodename.
+ i?86:DYNIX/ptx:4*:*)
+ echo i386-sequent-sysv4
+ exit 0 ;;
+ i?86:UNIX_SV:4.2MP:2.*)
+ # Unixware is an offshoot of SVR4, but it has its own version
+ # number series starting with 2...
+ # I am not positive that other SVR4 systems won't match this,
+ # I just have to hope. -- rms.
+ # Use sysv4.2uw... so that sysv4* matches it.
+ echo ${X86CPU}-pc-sysv4.2uw${UNAME_VERSION}
+ exit 0 ;;
+ i?86:*:4.*:* | i?86:SYSTEM_V:4.*:*)
+ UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
+ if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
+ echo ${X86CPU}-univel-sysv${UNAME_REL}
+ else
+ echo ${X86CPU}-pc-sysv${UNAME_REL}
+ fi
+ exit 0 ;;
+ i?86:*:5:7*)
+ # Fixed at (any) Pentium or better
+ UNAME_MACHINE=i586
+ if [ ${UNAME_SYSTEM} = "UnixWare" ] ; then
+ echo ${X86CPU}-sco-sysv${UNAME_RELEASE}uw${UNAME_VERSION}
+ else
+ echo ${X86CPU}-pc-sysv${UNAME_RELEASE}
+ fi
+ exit 0 ;;
+ i?86:*:3.2:*)
+ if test -f /usr/options/cb.name; then
+ UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
+ echo ${X86CPU}-pc-isc$UNAME_REL
+ elif /bin/uname -X 2>/dev/null >/dev/null ; then
+ UNAME_REL=`(/bin/uname -X|egrep Release|sed -e 's/.*= //')`
+ (/bin/uname -X|egrep i80486 >/dev/null) && UNAME_MACHINE=i486
+ (/bin/uname -X|egrep '^Machine.*Pentium' >/dev/null) \
+ && UNAME_MACHINE=i586
+ (/bin/uname -X|egrep '^Machine.*Pent ?II' >/dev/null) \
+ && UNAME_MACHINE=i686
+ (/bin/uname -X|egrep '^Machine.*Pentium Pro' >/dev/null) \
+ && UNAME_MACHINE=i686
+ echo ${X86CPU}-pc-sco$UNAME_REL
+ else
+ echo ${X86CPU}-pc-sysv32
+ fi
+ exit 0 ;;
+ i?86:*DOS:*:*)
+ echo ${X86CPU}-pc-msdosdjgpp
+ exit 0 ;;
+ pc:*:*:*)
+ # Left here for compatibility:
+ # uname -m prints for DJGPP always 'pc', but it prints nothing about
+ # the processor, so we play safe by assuming i386.
+ echo i386-pc-msdosdjgpp
+ exit 0 ;;
+ Intel:Mach:3*:*)
+ echo i386-pc-mach3
+ exit 0 ;;
+ paragon:*:*:*)
+ echo i860-intel-osf1
+ exit 0 ;;
+ i860:*:4.*:*) # i860-SVR4
+ if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
+ echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
+ else # Add other i860-SVR4 vendors below as they are discovered.
+ echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4
+ fi
+ exit 0 ;;
+ mini*:CTIX:SYS*5:*)
+ # "miniframe"
+ echo m68010-convergent-sysv
+ exit 0 ;;
+ M68*:*:R3V[567]*:*)
+ test -r /sysV68 && echo 'm68k-motorola-sysv' && exit 0 ;;
+ 3[34]??:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 4850:*:4.0:3.0)
+ OS_REL=''
+ test -r /etc/.relid \
+ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+ /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+ && echo i486-ncr-sysv4.3${OS_REL} && exit 0
+ /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+ && echo i586-ncr-sysv4.3${OS_REL} && exit 0 ;;
+ 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
+ /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+ && echo i486-ncr-sysv4 && exit 0 ;;
+ m68*:LynxOS:2.*:*)
+ echo m68k-unknown-lynxos${UNAME_RELEASE}
+ exit 0 ;;
+ mc68030:UNIX_System_V:4.*:*)
+ echo m68k-atari-sysv4
+ exit 0 ;;
+ i?86:LynxOS:2.*:* | i?86:LynxOS:3.[01]*:*)
+ echo i386-unknown-lynxos${UNAME_RELEASE}
+ exit 0 ;;
+ TSUNAMI:LynxOS:2.*:*)
+ echo sparc-unknown-lynxos${UNAME_RELEASE}
+ exit 0 ;;
+ rs6000:LynxOS:2.*:* | PowerPC:LynxOS:2.*:*)
+ echo rs6000-unknown-lynxos${UNAME_RELEASE}
+ exit 0 ;;
+ SM[BE]S:UNIX_SV:*:*)
+ echo mips-dde-sysv${UNAME_RELEASE}
+ exit 0 ;;
+ RM*:ReliantUNIX-*:*:*)
+ echo mips-sni-sysv4
+ exit 0 ;;
+ RM*:SINIX-*:*:*)
+ echo mips-sni-sysv4
+ exit 0 ;;
+ *:SINIX-*:*:*)
+ if uname -p 2>/dev/null >/dev/null ; then
+ UNAME_MACHINE=`(uname -p) 2>/dev/null`
+ echo ${UNAME_MACHINE}-sni-sysv4
+ else
+ echo ns32k-sni-sysv
+ fi
+ exit 0 ;;
+ PENTIUM:CPunix:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
+ # says <Richard.M.Bartel@ccMail.Census.GOV>
+ echo i586-unisys-sysv4
+ exit 0 ;;
+ *:UNIX_System_V:4*:FTX*)
+ # From Gerald Hewes <hewes@openmarket.com>.
+ # How about differentiating between stratus architectures? -djm
+ echo hppa1.1-stratus-sysv4
+ exit 0 ;;
+ *:*:*:FTX*)
+ # From seanf@swdc.stratus.com.
+ echo i860-stratus-sysv4
+ exit 0 ;;
+ mc68*:A/UX:*:*)
+ echo m68k-apple-aux${UNAME_RELEASE}
+ exit 0 ;;
+ news*:NEWS-OS:*:6*)
+ echo mips-sony-newsos6
+ exit 0 ;;
+ R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
+ if [ -d /usr/nec ]; then
+ echo mips-nec-sysv${UNAME_RELEASE}
+ else
+ echo mips-unknown-sysv${UNAME_RELEASE}
+ fi
+ exit 0 ;;
+ BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only.
+ echo powerpc-be-beos
+ exit 0 ;;
+ BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only.
+ echo powerpc-apple-beos
+ exit 0 ;;
+ BePC:BeOS:*:*) # BeOS running on Intel PC compatible.
+ echo i586-pc-beos
+ exit 0 ;;
+ SX-4:SUPER-UX:*:*)
+ echo sx4-nec-superux${UNAME_RELEASE}
+ exit 0 ;;
+ SX-5:SUPER-UX:*:*)
+ echo sx5-nec-superux${UNAME_RELEASE}
+ exit 0 ;;
+ Power*:Rhapsody:*:*)
+ echo powerpc-apple-rhapsody${UNAME_RELEASE}
+ exit 0 ;;
+ *:Rhapsody:*:*)
+ echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
+ exit 0 ;;
+ Power*:Mac*OS:*:*)
+ echo powerpc-apple-macos${UNAME_RELEASE}
+ exit 0 ;;
+ *:Mac*OS:*:*)
+ echo ${UNAME_MACHINE}-apple-macos${UNAME_RELEASE}
+ exit 0 ;;
+ *:Darwin:*:*)
+ echo `uname -p`-apple-darwin${UNAME_RELEASE}
+ exit 0 ;;
+ *:procnto*:*:* | *:QNX:[0123456789]*:*)
+ if test "${UNAME_MACHINE}" = "x86pc"; then
+ UNAME_MACHINE=pc
+ fi
+ echo `uname -p`-${UNAME_MACHINE}-nto-qnx
+ exit 0 ;;
+ *:QNX:*:4*)
+ echo i386-pc-qnx
+ exit 0 ;;
+ NSR-W:NONSTOP_KERNEL:*:*)
+ echo nsr-tandem-nsk${UNAME_RELEASE}
+ exit 0 ;;
+ BS2000:POSIX*:*:*)
+ echo bs2000-siemens-sysv
+ exit 0 ;;
+esac
+
+#echo '(No uname command or uname output not recognized.)' 1>&2
+#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
+
+cat >$dummy.c <<EOF
+#ifdef _SEQUENT_
+# include <sys/types.h>
+# include <sys/utsname.h>
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+ /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed,
+ I don't know.... */
+ printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+ printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+ "4"
+#else
+ ""
+#endif
+ ); exit (0);
+#endif
+#endif
+
+#if defined (__arm) && defined (__acorn) && defined (__unix)
+ printf ("arm-acorn-riscix"); exit (0);
+#endif
+
+#if defined (hp300) && !defined (hpux)
+ printf ("m68k-hp-bsd\n"); exit (0);
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+ int version;
+ version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+ if (version < 4)
+ printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+ else
+ printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+ exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+ printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+ printf ("ns32k-encore-mach\n"); exit (0);
+#else
+ printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+ printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+ printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+ printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+ struct utsname un;
+
+ uname(&un);
+
+ if (strncmp(un.version, "V2", 2) == 0) {
+ printf ("i386-sequent-ptx2\n"); exit (0);
+ }
+ if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+ printf ("i386-sequent-ptx1\n"); exit (0);
+ }
+ printf ("i386-sequent-ptx\n"); exit (0);
+
+#endif
+
+#if defined (vax)
+#if !defined (ultrix)
+ printf ("vax-dec-bsd\n"); exit (0);
+#else
+ printf ("vax-dec-ultrix\n"); exit (0);
+#endif
+#endif
+
+#if defined (alliant) && defined (i860)
+ printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+ exit (1);
+}
+EOF
+
+$CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null && ./$dummy && rm $dummy.c $dummy && exit 0
+rm -f $dummy.c $dummy
+
+# Apollos put the system type in the environment.
+
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit 0; }
+
+# Convex versions that predate uname can use getsysinfo(1)
+
+if [ -x /usr/convex/getsysinfo ]
+then
+ case `getsysinfo -f cpu_type` in
+ c1*)
+ echo c1-convex-bsd
+ exit 0 ;;
+ c2*)
+ if getsysinfo -f scalar_acc
+ then echo c32-convex-bsd
+ else echo c2-convex-bsd
+ fi
+ exit 0 ;;
+ c34*)
+ echo c34-convex-bsd
+ exit 0 ;;
+ c38*)
+ echo c38-convex-bsd
+ exit 0 ;;
+ c4*)
+ echo c4-convex-bsd
+ exit 0 ;;
+ esac
+fi
+
+#echo '(Unable to guess system type)' 1>&2
+
+exit 1
diff --git a/rts/gmp/config.in b/rts/gmp/config.in
new file mode 100644
index 0000000000..8b2546ef16
--- /dev/null
+++ b/rts/gmp/config.in
@@ -0,0 +1,162 @@
+/* config.in. Generated automatically from configure.in by autoheader. */
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+
+/* Define if a limb is long long. */
+#undef _LONG_LONG_LIMB
+
+/* Define if we have native implementation of function. */
+#undef HAVE_NATIVE_
+#undef HAVE_NATIVE_mpn_add
+#undef HAVE_NATIVE_mpn_add_1
+#undef HAVE_NATIVE_mpn_add_n
+#undef HAVE_NATIVE_mpn_add_nc
+#undef HAVE_NATIVE_mpn_addmul_1
+#undef HAVE_NATIVE_mpn_addmul_1c
+#undef HAVE_NATIVE_mpn_addsub_n
+#undef HAVE_NATIVE_mpn_addsub_nc
+#undef HAVE_NATIVE_mpn_and_n
+#undef HAVE_NATIVE_mpn_andn_n
+#undef HAVE_NATIVE_mpn_bdivmod
+#undef HAVE_NATIVE_mpn_cmp
+#undef HAVE_NATIVE_mpn_com_n
+#undef HAVE_NATIVE_mpn_copyd
+#undef HAVE_NATIVE_mpn_copyi
+#undef HAVE_NATIVE_mpn_divexact_by3c
+#undef HAVE_NATIVE_mpn_divrem
+#undef HAVE_NATIVE_mpn_divrem_1
+#undef HAVE_NATIVE_mpn_divrem_1c
+#undef HAVE_NATIVE_mpn_divrem_2
+#undef HAVE_NATIVE_mpn_divrem_newton
+#undef HAVE_NATIVE_mpn_divrem_classic
+#undef HAVE_NATIVE_mpn_dump
+#undef HAVE_NATIVE_mpn_gcd
+#undef HAVE_NATIVE_mpn_gcd_1
+#undef HAVE_NATIVE_mpn_gcdext
+#undef HAVE_NATIVE_mpn_get_str
+#undef HAVE_NATIVE_mpn_hamdist
+#undef HAVE_NATIVE_mpn_invert_limb
+#undef HAVE_NATIVE_mpn_ior_n
+#undef HAVE_NATIVE_mpn_iorn_n
+#undef HAVE_NATIVE_mpn_lshift
+#undef HAVE_NATIVE_mpn_mod_1
+#undef HAVE_NATIVE_mpn_mod_1c
+#undef HAVE_NATIVE_mpn_mul
+#undef HAVE_NATIVE_mpn_mul_1
+#undef HAVE_NATIVE_mpn_mul_1c
+#undef HAVE_NATIVE_mpn_mul_basecase
+#undef HAVE_NATIVE_mpn_mul_n
+#undef HAVE_NATIVE_mpn_nand_n
+#undef HAVE_NATIVE_mpn_nior_n
+#undef HAVE_NATIVE_mpn_perfect_square_p
+#undef HAVE_NATIVE_mpn_popcount
+#undef HAVE_NATIVE_mpn_preinv_mod_1
+#undef HAVE_NATIVE_mpn_random2
+#undef HAVE_NATIVE_mpn_random
+#undef HAVE_NATIVE_mpn_rawrandom
+#undef HAVE_NATIVE_mpn_rshift
+#undef HAVE_NATIVE_mpn_scan0
+#undef HAVE_NATIVE_mpn_scan1
+#undef HAVE_NATIVE_mpn_set_str
+#undef HAVE_NATIVE_mpn_sqrtrem
+#undef HAVE_NATIVE_mpn_sqr_basecase
+#undef HAVE_NATIVE_mpn_sub
+#undef HAVE_NATIVE_mpn_sub_1
+#undef HAVE_NATIVE_mpn_sub_n
+#undef HAVE_NATIVE_mpn_sub_nc
+#undef HAVE_NATIVE_mpn_submul_1
+#undef HAVE_NATIVE_mpn_submul_1c
+#undef HAVE_NATIVE_mpn_udiv_w_sdiv
+#undef HAVE_NATIVE_mpn_umul_ppmm
+#undef HAVE_NATIVE_mpn_udiv_qrnnd
+#undef HAVE_NATIVE_mpn_xor_n
+#undef HAVE_NATIVE_mpn_xnor_n
+
+/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't.
+ */
+#undef HAVE_DECL_OPTARG
+
+/* ./configure --enable-assert option, to enable some ASSERT()s */
+#undef WANT_ASSERT
+
+/* Define if you have the <sys/sysctl.h> header file. */
+#undef HAVE_SYS_SYSCTL_H
+
+/* Define if you have the `strtoul' function. */
+#undef HAVE_STRTOUL
+
+/* Name of package */
+#undef PACKAGE
+
+/* Define if you have the `sysctlbyname' function. */
+#undef HAVE_SYSCTLBYNAME
+
+/* Define if the system has the type `void'. */
+#undef HAVE_VOID
+
+/* Define if you have the `popen' function. */
+#undef HAVE_POPEN
+
+/* ./configure --disable-alloca option, to use stack-alloc.c, not alloca */
+#undef USE_STACK_ALLOC
+
+/* Define if cpp supports the ANSI # stringizing operator. */
+#undef HAVE_STRINGIZE
+
+/* Define if you have the <sys/time.h> header file. */
+#undef HAVE_SYS_TIME_H
+
+/* Define if you have the `sysconf' function. */
+#undef HAVE_SYSCONF
+
+/* Define if you have the `getpagesize' function. */
+#undef HAVE_GETPAGESIZE
+
+/* Define if you have the `processor_info' function. */
+#undef HAVE_PROCESSOR_INFO
+
+/* Version number of package */
+#undef VERSION
+
+/* Define if you have the `getopt_long' function. */
+#undef HAVE_GETOPT_LONG
+
+/* Define if you have the <getopt.h> header file. */
+#undef HAVE_GETOPT_H
+
+/* Define if you have the ANSI C header files. */
+#undef STDC_HEADERS
+
+/* Define if a speed_cyclecounter exists (for the tune programs) */
+#undef HAVE_SPEED_CYCLECOUNTER
+
+/* Define if mpn/tests has calling conventions checking for the CPU */
+#undef HAVE_CALLING_CONVENTIONS
+
+/* ./configure --enable-fft option, to enable FFTs for multiplication */
+#undef WANT_FFT
+
+/* Define if you have the <string.h> header file. */
+#undef HAVE_STRING_H
+
+/* Define if you have the <unistd.h> header file. */
+#undef HAVE_UNISTD_H
diff --git a/rts/gmp/config.sub b/rts/gmp/config.sub
new file mode 100644
index 0000000000..c4123f28ff
--- /dev/null
+++ b/rts/gmp/config.sub
@@ -0,0 +1,1273 @@
+#! /bin/sh
+# Configuration validation subroutine script, version 1.1.
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000
+# Free Software Foundation, Inc.
+#
+# This file is (in principle) common to ALL GNU software.
+# The presence of a machine in this file suggests that SOME GNU software
+# can handle that machine. It does not imply ALL GNU software can.
+#
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Written by Per Bothner <bothner@cygnus.com>.
+# Please send patches to <config-patches@gnu.org>.
+#
+# Configuration subroutine to validate and canonicalize a configuration type.
+# Supply the specified configuration type as an argument.
+# If it is invalid, we print an error message on stderr and exit with code 1.
+# Otherwise, we print the canonical config type on stdout and succeed.
+
+# This file is supposed to be the same for all GNU packages
+# and recognize all the CPU types, system types and aliases
+# that are meaningful with *any* GNU software.
+# Each package is responsible for reporting which valid configurations
+# it does not support. The user should be able to distinguish
+# a failure to support a valid configuration from a meaningless
+# configuration.
+
+# The goal of this file is to map all the various variations of a given
+# machine specification into a single specification in the form:
+# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
+# or in some cases, the newer four-part form:
+# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
+# It is wrong to echo any other type of specification.
+
+if [ x$1 = x ]
+then
+ echo Configuration name missing. 1>&2
+ echo "Usage: $0 CPU-MFR-OPSYS" 1>&2
+ echo "or $0 ALIAS" 1>&2
+ echo where ALIAS is a recognized configuration type. 1>&2
+ exit 1
+fi
+
+# First pass through any local machine types.
+case $1 in
+ *local*)
+ echo $1
+ exit 0
+ ;;
+ *)
+ ;;
+esac
+
+# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
+# Here we must recognize all the valid KERNEL-OS combinations.
+maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
+case $maybe_os in
+ nto-qnx* | linux-gnu*)
+ os=-$maybe_os
+ basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
+ ;;
+ *)
+ basic_machine=`echo $1 | sed 's/-[^-]*$//'`
+ if [ $basic_machine != $1 ]
+ then os=`echo $1 | sed 's/.*-/-/'`
+ else os=; fi
+ ;;
+esac
+
+### Let's recognize common machines as not being operating systems so
+### that things like config.sub decstation-3100 work. We also
+### recognize some manufacturers as not being operating systems, so we
+### can provide default operating systems below.
+case $os in
+ -sun*os*)
+ # Prevent following clause from handling this invalid input.
+ ;;
+ -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
+ -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
+ -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
+ -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
+ -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
+ -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
+ -apple)
+ os=
+ basic_machine=$1
+ ;;
+ -sim | -cisco | -oki | -wec | -winbond)
+ os=
+ basic_machine=$1
+ ;;
+ -scout)
+ ;;
+ -wrs)
+ os=-vxworks
+ basic_machine=$1
+ ;;
+ -hiux*)
+ os=-hiuxwe2
+ ;;
+ -sco5)
+ os=-sco3.2v5
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -sco4)
+ os=-sco3.2v4
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -sco3.2.[4-9]*)
+ os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -sco3.2v[4-9]*)
+ # Don't forget version if it is 3.2v4 or newer.
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -sco*)
+ os=-sco3.2v2
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -udk*)
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -isc)
+ os=-isc2.2
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -clix*)
+ basic_machine=clipper-intergraph
+ ;;
+ -isc*)
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -lynx*)
+ os=-lynxos
+ ;;
+ -ptx*)
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
+ ;;
+ -windowsnt*)
+ os=`echo $os | sed -e 's/windowsnt/winnt/'`
+ ;;
+ -psos*)
+ os=-psos
+ ;;
+ -mint | -mint[0-9]*)
+ basic_machine=m68k-atari
+ os=-mint
+ ;;
+esac
+
+# Decode aliases for certain CPU-COMPANY combinations.
+case $basic_machine in
+ # Recognize the basic CPU types without company name.
+ # Some are omitted here because they have special meanings below.
+ tahoe | i860 | ia64 | m32r | m68k | m68000 | m88k | ns32k | arc | arm \
+ | arme[lb] | pyramid | mn10200 | mn10300 | tron | a29k \
+ | 580 | i960 | h8300 \
+ | x86 | ppcbe | mipsbe | mipsle | shbe | shle | armbe | armle \
+ | hppa | hppa1.0 | hppa1.1 | hppa2.0 | hppa2.0w | hppa2.0n \
+ | alpha | alphaev[4-8] | alphaev56 | alphapca5[67] \
+ | alphaev6[78] \
+ | we32k | ns16k | clipper | i370 | sh | powerpc | powerpcle \
+ | 1750a | dsp16xx | pdp11 | mips16 | mips64 | mipsel | mips64el \
+ | mips64orion | mips64orionel | mipstx39 | mipstx39el \
+ | mips64vr4300 | mips64vr4300el | mips64vr4100 | mips64vr4100el \
+ | mips64vr5000 | miprs64vr5000el | mcore \
+ | sparc | sparclet | sparclite | sparc64 | sparcv9 | v850 | c4x \
+ | powerpc64 | sparcv8 | supersparc | microsparc | ultrasparc \
+ | thumb | d10v | fr30 | avr)
+ basic_machine=$basic_machine-unknown
+ ;;
+ m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | z8k | v70 | h8500 | w65 | pj | pjl)
+ ;;
+
+ # We use `pc' rather than `unknown'
+ # because (1) that's what they normally are, and
+ # (2) the word "unknown" tends to confuse beginning users.
+ i[34567]86 | pentium[23] | k[56] | k6[23] | athlon)
+ basic_machine=$basic_machine-pc
+ ;;
+ # Object if more than one company name word.
+ *-*-*)
+ echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+ exit 1
+ ;;
+ # Recognize the basic CPU types with company name.
+ vax-* | tahoe-* | i[34567]86-* | pentium[23]-* | i860-* | ia64-* | m32r-* | m68k-* | m68000-* \
+ | m88k-* | sparc-* | ns32k-* | fx80-* | arc-* | arm-* | c[123]* \
+ | mips-* | pyramid-* | tron-* | a29k-* | romp-* | rs6000-* \
+ | power-* | none-* | 580-* | cray2-* | h8300-* | h8500-* | i960-* \
+ | xmp-* | ymp-* \
+ | x86-* | ppcbe-* | mipsbe-* | mipsle-* | shbe-* | shle-* | armbe-* | armle-* \
+ | hppa-* | hppa1.0-* | hppa1.1-* | hppa2.0-* | hppa2.0w-* | hppa2.0n-* \
+ | alpha-* | alphaev[4-8]-* | alphaev56-* | alphapca5[67]-* \
+ | alphaev6[78]-* \
+ | we32k-* | cydra-* | ns16k-* | pn-* | np1-* | xps100-* \
+ | clipper-* | orion-* \
+ | sparclite-* | pdp11-* | sh-* | powerpc-* | powerpcle-* \
+ | sparc64-* | sparcv9-* | sparc86x-* | mips16-* | mips64-* | mipsel-* \
+ | mips64el-* | mips64orion-* | mips64orionel-* \
+ | mips64vr4100-* | mips64vr4100el-* | mips64vr4300-* | mips64vr4300el-* \
+ | mipstx39-* | mipstx39el-* | mcore-* \
+ | f301-* | armv*-* | s390-* | sv1-* | t3e-* \
+ | m88110-* | m680[01234]0-* | m683?2-* | m68360-* | z8k-* | d10v-* \
+ | k[56]-* | k6[23]-* | athlon-* | powerpc64-* \
+ | sparcv8-* | supersparc-* | microsparc-* | ultrasparc-* \
+ | thumb-* | v850-* | d30v-* | tic30-* | c30-* | fr30-* )
+ ;;
+ # Recognize the various machine names and aliases which stand
+ # for a CPU type and a company and sometimes even an OS.
+ 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
+ basic_machine=m68000-att
+ ;;
+ 3b*)
+ basic_machine=we32k-att
+ ;;
+ a29khif)
+ basic_machine=a29k-amd
+ os=-udi
+ ;;
+ adobe68k)
+ basic_machine=m68010-adobe
+ os=-scout
+ ;;
+ alliant | fx80)
+ basic_machine=fx80-alliant
+ ;;
+ altos | altos3068)
+ basic_machine=m68k-altos
+ ;;
+ am29k)
+ basic_machine=a29k-none
+ os=-bsd
+ ;;
+ amdahl)
+ basic_machine=580-amdahl
+ os=-sysv
+ ;;
+ amiga | amiga-*)
+ basic_machine=m68k-cbm
+ ;;
+ amigaos | amigados)
+ basic_machine=m68k-cbm
+ os=-amigaos
+ ;;
+ amigaunix | amix)
+ basic_machine=m68k-cbm
+ os=-sysv4
+ ;;
+ apollo68)
+ basic_machine=m68k-apollo
+ os=-sysv
+ ;;
+ apollo68bsd)
+ basic_machine=m68k-apollo
+ os=-bsd
+ ;;
+ aux)
+ basic_machine=m68k-apple
+ os=-aux
+ ;;
+ balance)
+ basic_machine=ns32k-sequent
+ os=-dynix
+ ;;
+ convex-c1)
+ basic_machine=c1-convex
+ os=-bsd
+ ;;
+ convex-c2)
+ basic_machine=c2-convex
+ os=-bsd
+ ;;
+ convex-c32)
+ basic_machine=c32-convex
+ os=-bsd
+ ;;
+ convex-c34)
+ basic_machine=c34-convex
+ os=-bsd
+ ;;
+ convex-c38)
+ basic_machine=c38-convex
+ os=-bsd
+ ;;
+ cray | ymp)
+ basic_machine=ymp-cray
+ os=-unicos
+ ;;
+ cray2)
+ basic_machine=cray2-cray
+ os=-unicos
+ ;;
+ [ctj]90-cray)
+ basic_machine=c90-cray
+ os=-unicos
+ ;;
+ crds | unos)
+ basic_machine=m68k-crds
+ ;;
+ da30 | da30-*)
+ basic_machine=m68k-da30
+ ;;
+ decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
+ basic_machine=mips-dec
+ ;;
+ delta | 3300 | motorola-3300 | motorola-delta \
+ | 3300-motorola | delta-motorola)
+ basic_machine=m68k-motorola
+ ;;
+ delta88)
+ basic_machine=m88k-motorola
+ os=-sysv3
+ ;;
+ dpx20 | dpx20-*)
+ basic_machine=rs6000-bull
+ os=-bosx
+ ;;
+ dpx2* | dpx2*-bull)
+ basic_machine=m68k-bull
+ os=-sysv3
+ ;;
+ ebmon29k)
+ basic_machine=a29k-amd
+ os=-ebmon
+ ;;
+ elxsi)
+ basic_machine=elxsi-elxsi
+ os=-bsd
+ ;;
+ encore | umax | mmax)
+ basic_machine=ns32k-encore
+ ;;
+ es1800 | OSE68k | ose68k | ose | OSE)
+ basic_machine=m68k-ericsson
+ os=-ose
+ ;;
+ fx2800)
+ basic_machine=i860-alliant
+ ;;
+ genix)
+ basic_machine=ns32k-ns
+ ;;
+ gmicro)
+ basic_machine=tron-gmicro
+ os=-sysv
+ ;;
+ h3050r* | hiux*)
+ basic_machine=hppa1.1-hitachi
+ os=-hiuxwe2
+ ;;
+ h8300hms)
+ basic_machine=h8300-hitachi
+ os=-hms
+ ;;
+ h8300xray)
+ basic_machine=h8300-hitachi
+ os=-xray
+ ;;
+ h8500hms)
+ basic_machine=h8500-hitachi
+ os=-hms
+ ;;
+ harris)
+ basic_machine=m88k-harris
+ os=-sysv3
+ ;;
+ hp300-*)
+ basic_machine=m68k-hp
+ ;;
+ hp300bsd)
+ basic_machine=m68k-hp
+ os=-bsd
+ ;;
+ hp300hpux)
+ basic_machine=m68k-hp
+ os=-hpux
+ ;;
+ hp3k9[0-9][0-9] | hp9[0-9][0-9])
+ basic_machine=hppa1.0-hp
+ ;;
+ hp9k2[0-9][0-9] | hp9k31[0-9])
+ basic_machine=m68000-hp
+ ;;
+ hp9k3[2-9][0-9])
+ basic_machine=m68k-hp
+ ;;
+ hp9k6[0-9][0-9] | hp6[0-9][0-9])
+ basic_machine=hppa1.0-hp
+ ;;
+ hp9k7[0-79][0-9] | hp7[0-79][0-9])
+ basic_machine=hppa1.1-hp
+ ;;
+ hp9k78[0-9] | hp78[0-9])
+ basic_machine=hppa2.0-hp
+ ;;
+ hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
+ basic_machine=hppa2.0-hp
+ ;;
+ hp9k8[0-9][13679] | hp8[0-9][13679])
+ basic_machine=hppa1.1-hp
+ ;;
+ hp9k8[0-9][0-9] | hp8[0-9][0-9])
+ basic_machine=hppa1.0-hp
+ ;;
+ hppa-next)
+ os=-nextstep3
+ ;;
+ hppaosf)
+ basic_machine=hppa1.1-hp
+ os=-osf
+ ;;
+ hppro)
+ basic_machine=hppa1.1-hp
+ os=-proelf
+ ;;
+ i370-ibm* | ibm*)
+ basic_machine=i370-ibm
+ ;;
+# I'm not sure what "Sysv32" means. Should this be sysv3.2?
+ i[34567]86v32)
+ basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+ os=-sysv32
+ ;;
+ i[34567]86v4*)
+ basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+ os=-sysv4
+ ;;
+ i[34567]86v)
+ basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+ os=-sysv
+ ;;
+ i[34567]86sol2)
+ basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+ os=-solaris2
+ ;;
+ i386mach)
+ basic_machine=i386-mach
+ os=-mach
+ ;;
+ i386-vsta | vsta)
+ basic_machine=i386-unknown
+ os=-vsta
+ ;;
+ i386-go32 | go32)
+ basic_machine=i386-unknown
+ os=-go32
+ ;;
+ i386-mingw32 | mingw32)
+ basic_machine=i386-unknown
+ os=-mingw32
+ ;;
+ iris | iris4d)
+ basic_machine=mips-sgi
+ case $os in
+ -irix*)
+ ;;
+ *)
+ os=-irix4
+ ;;
+ esac
+ ;;
+ isi68 | isi)
+ basic_machine=m68k-isi
+ os=-sysv
+ ;;
+ macppc*)
+ basic_machine=powerpc-apple
+ ;;
+ m88k-omron*)
+ basic_machine=m88k-omron
+ ;;
+ magnum | m3230)
+ basic_machine=mips-mips
+ os=-sysv
+ ;;
+ merlin)
+ basic_machine=ns32k-utek
+ os=-sysv
+ ;;
+ miniframe)
+ basic_machine=m68000-convergent
+ ;;
+ *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
+ basic_machine=m68k-atari
+ os=-mint
+ ;;
+ mipsel*-linux*)
+ basic_machine=mipsel-unknown
+ os=-linux-gnu
+ ;;
+ mips*-linux*)
+ basic_machine=mips-unknown
+ os=-linux-gnu
+ ;;
+ mips3*-*)
+ basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
+ ;;
+ mips3*)
+ basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
+ ;;
+ mmix*)
+ basic_machine=mmix-knuth
+ os=-mmixware
+ ;;
+ monitor)
+ basic_machine=m68k-rom68k
+ os=-coff
+ ;;
+ msdos)
+ basic_machine=i386-unknown
+ os=-msdos
+ ;;
+ mvs)
+ basic_machine=i370-ibm
+ os=-mvs
+ ;;
+ ncr3000)
+ basic_machine=i486-ncr
+ os=-sysv4
+ ;;
+ netbsd386)
+ basic_machine=i386-unknown
+ os=-netbsd
+ ;;
+ netwinder)
+ basic_machine=armv4l-rebel
+ os=-linux
+ ;;
+ news | news700 | news800 | news900)
+ basic_machine=m68k-sony
+ os=-newsos
+ ;;
+ news1000)
+ basic_machine=m68030-sony
+ os=-newsos
+ ;;
+ news-3600 | risc-news)
+ basic_machine=mips-sony
+ os=-newsos
+ ;;
+ necv70)
+ basic_machine=v70-nec
+ os=-sysv
+ ;;
+ next | m*-next )
+ basic_machine=m68k-next
+ case $os in
+ -nextstep* )
+ ;;
+ -ns2*)
+ os=-nextstep2
+ ;;
+ *)
+ os=-nextstep3
+ ;;
+ esac
+ ;;
+ nh3000)
+ basic_machine=m68k-harris
+ os=-cxux
+ ;;
+ nh[45]000)
+ basic_machine=m88k-harris
+ os=-cxux
+ ;;
+ nindy960)
+ basic_machine=i960-intel
+ os=-nindy
+ ;;
+ mon960)
+ basic_machine=i960-intel
+ os=-mon960
+ ;;
+ np1)
+ basic_machine=np1-gould
+ ;;
+ nsr-tandem)
+ basic_machine=nsr-tandem
+ ;;
+ op50n-* | op60c-*)
+ basic_machine=hppa1.1-oki
+ os=-proelf
+ ;;
+ OSE68000 | ose68000)
+ basic_machine=m68000-ericsson
+ os=-ose
+ ;;
+ os68k)
+ basic_machine=m68k-none
+ os=-os68k
+ ;;
+ pa-hitachi)
+ basic_machine=hppa1.1-hitachi
+ os=-hiuxwe2
+ ;;
+ paragon)
+ basic_machine=i860-intel
+ os=-osf
+ ;;
+ pbd)
+ basic_machine=sparc-tti
+ ;;
+ pbb)
+ basic_machine=m68k-tti
+ ;;
+ pc532 | pc532-*)
+ basic_machine=ns32k-pc532
+ ;;
+ pentiummmx | p55)
+ basic_machine=pentiummmx-pc
+ ;;
+ pentium | p5 | i586)
+ basic_machine=pentium-pc
+ ;;
+ pentiumpro | p6)
+ basic_machine=pentiumpro-pc
+ ;;
+ pentiummmx-* | p55-*)
+ basic_machine=pentiummmx-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ pentium-* | p5-* | i586-*)
+ basic_machine=pentium-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ pentiumpro-* | p6-*)
+ basic_machine=pentiumpro-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ nexen)
+ # We don't have specific support for Nexgen yet, so just call it a Pentium
+ basic_machine=i586-nexgen
+ ;;
+ pn)
+ basic_machine=pn-gould
+ ;;
+ power) basic_machine=rs6000-ibm
+ ;;
+ ppc) basic_machine=powerpc-unknown
+ ;;
+ ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ ppc64) basic_machine=powerpc64-unknown
+ ;;
+ ppc64-*)
+ basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ ppcle | powerpclittle | ppc-le | powerpc-little)
+ basic_machine=powerpcle-unknown
+ ;;
+ ppcle-* | powerpclittle-*)
+ basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ ps2)
+ basic_machine=i386-ibm
+ ;;
+ rom68k)
+ basic_machine=m68k-rom68k
+ os=-coff
+ ;;
+ rm[46]00)
+ basic_machine=mips-siemens
+ ;;
+ rtpc | rtpc-*)
+ basic_machine=romp-ibm
+ ;;
+ sa29200)
+ basic_machine=a29k-amd
+ os=-udi
+ ;;
+ sequent)
+ basic_machine=i386-sequent
+ ;;
+ sh)
+ basic_machine=sh-hitachi
+ os=-hms
+ ;;
+ sparclite-wrs)
+ basic_machine=sparclite-wrs
+ os=-vxworks
+ ;;
+ sps7)
+ basic_machine=m68k-bull
+ os=-sysv2
+ ;;
+ spur)
+ basic_machine=spur-unknown
+ ;;
+ st2000)
+ basic_machine=m68k-tandem
+ ;;
+ stratus)
+ basic_machine=i860-stratus
+ os=-sysv4
+ ;;
+ sun2)
+ basic_machine=m68000-sun
+ ;;
+ sun2os3)
+ basic_machine=m68000-sun
+ os=-sunos3
+ ;;
+ sun2os4)
+ basic_machine=m68000-sun
+ os=-sunos4
+ ;;
+ sun3os3)
+ basic_machine=m68k-sun
+ os=-sunos3
+ ;;
+ sun3os4)
+ basic_machine=m68k-sun
+ os=-sunos4
+ ;;
+ sun4os3)
+ basic_machine=sparc-sun
+ os=-sunos3
+ ;;
+ sun4os4)
+ basic_machine=sparc-sun
+ os=-sunos4
+ ;;
+ sun4sol2)
+ basic_machine=sparc-sun
+ os=-solaris2
+ ;;
+ sun3 | sun3-*)
+ basic_machine=m68k-sun
+ ;;
+ sun4)
+ basic_machine=sparc-sun
+ ;;
+ sun386 | sun386i | roadrunner)
+ basic_machine=i386-sun
+ ;;
+ sv1)
+ basic_machine=sv1-cray
+ os=-unicos
+ ;;
+ symmetry)
+ basic_machine=i386-sequent
+ os=-dynix
+ ;;
+ t3e)
+ basic_machine=t3e-cray
+ os=-unicos
+ ;;
+ tx39)
+ basic_machine=mipstx39-unknown
+ ;;
+ tx39el)
+ basic_machine=mipstx39el-unknown
+ ;;
+ tower | tower-32)
+ basic_machine=m68k-ncr
+ ;;
+ udi29k)
+ basic_machine=a29k-amd
+ os=-udi
+ ;;
+ ultra3)
+ basic_machine=a29k-nyu
+ os=-sym1
+ ;;
+ v810 | necv810)
+ basic_machine=v810-nec
+ os=-none
+ ;;
+ vaxv)
+ basic_machine=vax-dec
+ os=-sysv
+ ;;
+ vms)
+ basic_machine=vax-dec
+ os=-vms
+ ;;
+ vpp*|vx|vx-*)
+ basic_machine=f301-fujitsu
+ ;;
+ vxworks960)
+ basic_machine=i960-wrs
+ os=-vxworks
+ ;;
+ vxworks68)
+ basic_machine=m68k-wrs
+ os=-vxworks
+ ;;
+ vxworks29k)
+ basic_machine=a29k-wrs
+ os=-vxworks
+ ;;
+ w65*)
+ basic_machine=w65-wdc
+ os=-none
+ ;;
+ w89k-*)
+ basic_machine=hppa1.1-winbond
+ os=-proelf
+ ;;
+ xmp)
+ basic_machine=xmp-cray
+ os=-unicos
+ ;;
+ xps | xps100)
+ basic_machine=xps100-honeywell
+ ;;
+ z8k-*-coff)
+ basic_machine=z8k-unknown
+ os=-sim
+ ;;
+ none)
+ basic_machine=none-none
+ os=-none
+ ;;
+
+# Here we handle the default manufacturer of certain CPU types. It is in
+# some cases the only manufacturer, in others, it is the most popular.
+ w89k)
+ basic_machine=hppa1.1-winbond
+ ;;
+ op50n)
+ basic_machine=hppa1.1-oki
+ ;;
+ op60c)
+ basic_machine=hppa1.1-oki
+ ;;
+ mips)
+ if [ x$os = x-linux-gnu ]; then
+ basic_machine=mips-unknown
+ else
+ basic_machine=mips-mips
+ fi
+ ;;
+ romp)
+ basic_machine=romp-ibm
+ ;;
+ rs6000)
+ basic_machine=rs6000-ibm
+ ;;
+ vax)
+ basic_machine=vax-dec
+ ;;
+ pdp11)
+ basic_machine=pdp11-dec
+ ;;
+ we32k)
+ basic_machine=we32k-att
+ ;;
+ sparc | sparcv9)
+ basic_machine=sparc-sun
+ ;;
+ cydra)
+ basic_machine=cydra-cydrome
+ ;;
+ orion)
+ basic_machine=orion-highlevel
+ ;;
+ orion105)
+ basic_machine=clipper-highlevel
+ ;;
+ mac | mpw | mac-mpw)
+ basic_machine=m68k-apple
+ ;;
+ pmac | pmac-mpw)
+ basic_machine=powerpc-apple
+ ;;
+ c4x*)
+ basic_machine=c4x-none
+ os=-coff
+ ;;
+ *)
+ echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+ exit 1
+ ;;
+esac
+
+# Here we canonicalize certain aliases for manufacturers.
+case $basic_machine in
+ *-digital*)
+ basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
+ ;;
+ *-commodore*)
+ basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
+ ;;
+ *)
+ ;;
+esac
+
+# Decode manufacturer-specific aliases for certain operating systems.
+
+if [ x"$os" != x"" ]
+then
+case $os in
+ # First match some system type aliases
+ # that might get confused with valid system types.
+ # -solaris* is a basic system type, with this one exception.
+ -solaris1 | -solaris1.*)
+ os=`echo $os | sed -e 's|solaris1|sunos4|'`
+ ;;
+ -solaris)
+ os=-solaris2
+ ;;
+ -svr4*)
+ os=-sysv4
+ ;;
+ -unixware*)
+ os=-sysv4.2uw
+ ;;
+ -gnu/linux*)
+ os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
+ ;;
+ # First accept the basic system types.
+ # The portable systems comes first.
+ # Each alternative MUST END IN A *, to match a version number.
+ # -sysv* is not here because it comes later, after sysvr4.
+ -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
+ | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\
+ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \
+ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
+ | -aos* \
+ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
+ | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
+ | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \
+ | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
+ | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
+ | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
+ | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
+ | -mingw32* | -linux-gnu* | -uxpv* | -beos* | -mpeix* | -udk* \
+ | -interix* | -uwin* | -rhapsody* | -darwin* | -opened* \
+ | -openstep* | -oskit*)
+ # Remember, each alternative MUST END IN *, to match a version number.
+ ;;
+ -qnx*)
+ case $basic_machine in
+ x86-* | i[34567]86-*)
+ ;;
+ *)
+ os=-nto$os
+ ;;
+ esac
+ ;;
+ -nto*)
+ os=-nto-qnx
+ ;;
+ -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
+ | -windows* | -osx | -abug | -netware* | -os9* | -beos* \
+ | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
+ ;;
+ -mac*)
+ os=`echo $os | sed -e 's|mac|macos|'`
+ ;;
+ -linux*)
+ os=`echo $os | sed -e 's|linux|linux-gnu|'`
+ ;;
+ -sunos5*)
+ os=`echo $os | sed -e 's|sunos5|solaris2|'`
+ ;;
+ -sunos6*)
+ os=`echo $os | sed -e 's|sunos6|solaris3|'`
+ ;;
+ -opened*)
+ os=-openedition
+ ;;
+ -wince*)
+ os=-wince
+ ;;
+ -osfrose*)
+ os=-osfrose
+ ;;
+ -osf*)
+ os=-osf
+ ;;
+ -utek*)
+ os=-bsd
+ ;;
+ -dynix*)
+ os=-bsd
+ ;;
+ -acis*)
+ os=-aos
+ ;;
+ -386bsd)
+ os=-bsd
+ ;;
+ -ctix* | -uts*)
+ os=-sysv
+ ;;
+ -ns2 )
+ os=-nextstep2
+ ;;
+ -nsk)
+ os=-nsk
+ ;;
+ # Preserve the version number of sinix5.
+ -sinix5.*)
+ os=`echo $os | sed -e 's|sinix|sysv|'`
+ ;;
+ -sinix*)
+ os=-sysv4
+ ;;
+ -triton*)
+ os=-sysv3
+ ;;
+ -oss*)
+ os=-sysv3
+ ;;
+ -svr4)
+ os=-sysv4
+ ;;
+ -svr3)
+ os=-sysv3
+ ;;
+ -sysvr4)
+ os=-sysv4
+ ;;
+ # This must come after -sysvr4.
+ -sysv*)
+ ;;
+ -ose*)
+ os=-ose
+ ;;
+ -es1800*)
+ os=-ose
+ ;;
+ -xenix)
+ os=-xenix
+ ;;
+ -*mint | -*MiNT)
+ os=-mint
+ ;;
+ -none)
+ ;;
+ *)
+ # Get rid of the `-' at the beginning of $os.
+ os=`echo $os | sed 's/[^-]*-//'`
+ echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
+ exit 1
+ ;;
+esac
+else
+
+# Here we handle the default operating systems that come with various machines.
+# The value should be what the vendor currently ships out the door with their
+# machine or put another way, the most popular os provided with the machine.
+
+# Note that if you're going to try to match "-MANUFACTURER" here (say,
+# "-sun"), then you have to tell the case statement up towards the top
+# that MANUFACTURER isn't an operating system. Otherwise, code above
+# will signal an error saying that MANUFACTURER isn't an operating
+# system, and we'll never get to this point.
+
+case $basic_machine in
+ *-acorn)
+ os=-riscix1.2
+ ;;
+ arm*-rebel)
+ os=-linux
+ ;;
+ arm*-semi)
+ os=-aout
+ ;;
+ pdp11-*)
+ os=-none
+ ;;
+ *-dec | vax-*)
+ os=-ultrix4.2
+ ;;
+ m68*-apollo)
+ os=-domain
+ ;;
+ i386-sun)
+ os=-sunos4.0.2
+ ;;
+ m68000-sun)
+ os=-sunos3
+ # This also exists in the configure program, but was not the
+ # default.
+ # os=-sunos4
+ ;;
+ m68*-cisco)
+ os=-aout
+ ;;
+ mips*-cisco)
+ os=-elf
+ ;;
+ mips*-*)
+ os=-elf
+ ;;
+ *-tti) # must be before sparc entry or we get the wrong os.
+ os=-sysv3
+ ;;
+ sparc-* | *-sun)
+ os=-sunos4.1.1
+ ;;
+ *-be)
+ os=-beos
+ ;;
+ *-ibm)
+ os=-aix
+ ;;
+ *-wec)
+ os=-proelf
+ ;;
+ *-winbond)
+ os=-proelf
+ ;;
+ *-oki)
+ os=-proelf
+ ;;
+ *-hp)
+ os=-hpux
+ ;;
+ *-hitachi)
+ os=-hiux
+ ;;
+ i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
+ os=-sysv
+ ;;
+ *-cbm)
+ os=-amigaos
+ ;;
+ *-dg)
+ os=-dgux
+ ;;
+ *-dolphin)
+ os=-sysv3
+ ;;
+ m68k-ccur)
+ os=-rtu
+ ;;
+ m88k-omron*)
+ os=-luna
+ ;;
+ *-next )
+ os=-nextstep
+ ;;
+ *-sequent)
+ os=-ptx
+ ;;
+ *-crds)
+ os=-unos
+ ;;
+ *-ns)
+ os=-genix
+ ;;
+ i370-*)
+ os=-mvs
+ ;;
+ *-next)
+ os=-nextstep3
+ ;;
+ *-gould)
+ os=-sysv
+ ;;
+ *-highlevel)
+ os=-bsd
+ ;;
+ *-encore)
+ os=-bsd
+ ;;
+ *-sgi)
+ os=-irix
+ ;;
+ *-siemens)
+ os=-sysv4
+ ;;
+ *-masscomp)
+ os=-rtu
+ ;;
+ f301-fujitsu)
+ os=-uxpv
+ ;;
+ *-rom68k)
+ os=-coff
+ ;;
+ *-*bug)
+ os=-coff
+ ;;
+ *-apple)
+ os=-macos
+ ;;
+ *-atari*)
+ os=-mint
+ ;;
+ *)
+ os=-none
+ ;;
+esac
+fi
+
+# Here we handle the case where we know the os, and the CPU type, but not the
+# manufacturer. We pick the logical manufacturer.
+vendor=unknown
+case $basic_machine in
+ *-unknown)
+ case $os in
+ -riscix*)
+ vendor=acorn
+ ;;
+ -sunos*)
+ vendor=sun
+ ;;
+ -aix*)
+ vendor=ibm
+ ;;
+ -beos*)
+ vendor=be
+ ;;
+ -hpux*)
+ vendor=hp
+ ;;
+ -mpeix*)
+ vendor=hp
+ ;;
+ -hiux*)
+ vendor=hitachi
+ ;;
+ -unos*)
+ vendor=crds
+ ;;
+ -dgux*)
+ vendor=dg
+ ;;
+ -luna*)
+ vendor=omron
+ ;;
+ -genix*)
+ vendor=ns
+ ;;
+ -mvs* | -opened*)
+ vendor=ibm
+ ;;
+ -ptx*)
+ vendor=sequent
+ ;;
+ -vxsim* | -vxworks*)
+ vendor=wrs
+ ;;
+ -aux*)
+ vendor=apple
+ ;;
+ -hms*)
+ vendor=hitachi
+ ;;
+ -mpw* | -macos*)
+ vendor=apple
+ ;;
+ -*mint | -*MiNT)
+ vendor=atari
+ ;;
+ esac
+ basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
+ ;;
+esac
+
+echo $basic_machine$os
diff --git a/rts/gmp/configure b/rts/gmp/configure
new file mode 100644
index 0000000000..8294680486
--- /dev/null
+++ b/rts/gmp/configure
@@ -0,0 +1,5216 @@
+#! /bin/sh
+# From configure.in Revision: 1.129.2.2
+# Guess values for system-dependent variables and create Makefiles.
+# Generated automatically using Autoconf version 2.14a.
+# Copyright (C) 1992, 93, 94, 95, 96, 98, 99, 2000
+# Free Software Foundation, Inc.
+#
+# This configure script is free software; the Free Software Foundation
+# gives unlimited permission to copy, distribute and modify it.
+
+# Defaults:
+ac_default_prefix=/usr/local
+# Any additions from configure.in:
+
+# Initialize some variables set by options.
+ac_init_help=false
+ac_init_version=false
+# The variables have the same names as the options, with
+# dashes changed to underlines.
+build=NONE
+cache_file=./config.cache
+exec_prefix=NONE
+host=NONE
+no_create=
+nonopt=NONE
+no_recursion=
+prefix=NONE
+program_prefix=NONE
+program_suffix=NONE
+program_transform_name=s,x,x,
+silent=
+site=
+srcdir=
+target=NONE
+verbose=
+x_includes=NONE
+x_libraries=NONE
+bindir='${exec_prefix}/bin'
+sbindir='${exec_prefix}/sbin'
+libexecdir='${exec_prefix}/libexec'
+datadir='${prefix}/share'
+sysconfdir='${prefix}/etc'
+sharedstatedir='${prefix}/com'
+localstatedir='${prefix}/var'
+libdir='${exec_prefix}/lib'
+includedir='${prefix}/include'
+oldincludedir='/usr/include'
+infodir='${prefix}/info'
+mandir='${prefix}/man'
+
+# Initialize some other variables.
+subdirs=
+MFLAGS= MAKEFLAGS=
+SHELL=${CONFIG_SHELL-/bin/sh}
+# Maximum number of lines to put in a shell here document.
+: ${ac_max_here_lines=48}
+# Sed expression to map a string onto a valid sh and CPP variable names.
+ac_tr_sh='sed -e y%*+%pp%;s%[^a-zA-Z0-9_]%_%g'
+ac_tr_cpp='sed -e y%*abcdefghijklmnopqrstuvwxyz%PABCDEFGHIJKLMNOPQRSTUVWXYZ%;s%[^A-Z0-9_]%_%g'
+
+ac_prev=
+for ac_option
+do
+ # If the previous option needs an argument, assign it.
+ if test -n "$ac_prev"; then
+ eval "$ac_prev=\$ac_option"
+ ac_prev=
+ continue
+ fi
+
+ ac_optarg=`echo "$ac_option" | sed -n 's/^[^=]*=//p'`
+
+ # Accept the important Cygnus configure options, so we can diagnose typos.
+
+ case "$ac_option" in
+
+ -bindir | --bindir | --bindi | --bind | --bin | --bi)
+ ac_prev=bindir ;;
+ -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
+ bindir="$ac_optarg" ;;
+
+ -build | --build | --buil | --bui | --bu)
+ ac_prev=build ;;
+ -build=* | --build=* | --buil=* | --bui=* | --bu=*)
+ build="$ac_optarg" ;;
+
+ -cache-file | --cache-file | --cache-fil | --cache-fi \
+ | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
+ ac_prev=cache_file ;;
+ -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
+ | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
+ cache_file="$ac_optarg" ;;
+
+ -datadir | --datadir | --datadi | --datad | --data | --dat | --da)
+ ac_prev=datadir ;;
+ -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \
+ | --da=*)
+ datadir="$ac_optarg" ;;
+
+ -disable-* | --disable-*)
+ ac_feature=`echo "$ac_option"|sed -e 's/-*disable-//'`
+ # Reject names that are not valid shell variable names.
+ if echo "$ac_feature" | grep '[^-a-zA-Z0-9_]' >/dev/null 2>&1; then
+ { echo "configure: error: invalid feature: $ac_feature" 1>&2; exit 1; }
+ fi
+ ac_feature=`echo $ac_feature| sed 's/-/_/g'`
+ eval "enable_${ac_feature}=no" ;;
+
+ -enable-* | --enable-*)
+ ac_feature=`echo "$ac_option"|sed -e 's/-*enable-//' -e 's/=.*//'`
+ # Reject names that are not valid shell variable names.
+ if echo "$ac_feature" | grep '[^-a-zA-Z0-9_]' >/dev/null 2>&1; then
+ { echo "configure: error: invalid feature: $ac_feature" 1>&2; exit 1; }
+ fi
+ ac_feature=`echo $ac_feature| sed 's/-/_/g'`
+ case "$ac_option" in
+ *=*) ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`;;
+ *) ac_optarg=yes ;;
+ esac
+ eval "enable_${ac_feature}='$ac_optarg'" ;;
+
+ -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
+ | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
+ | --exec | --exe | --ex)
+ ac_prev=exec_prefix ;;
+ -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
+ | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
+ | --exec=* | --exe=* | --ex=*)
+ exec_prefix="$ac_optarg" ;;
+
+ -gas | --gas | --ga | --g)
+ # Obsolete; use --with-gas.
+ with_gas=yes ;;
+
+ -help | --help | --hel | --he | -h)
+ ac_init_help=: ;;
+ -host | --host | --hos | --ho)
+ ac_prev=host ;;
+ -host=* | --host=* | --hos=* | --ho=*)
+ host="$ac_optarg" ;;
+
+ -includedir | --includedir | --includedi | --included | --include \
+ | --includ | --inclu | --incl | --inc)
+ ac_prev=includedir ;;
+ -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
+ | --includ=* | --inclu=* | --incl=* | --inc=*)
+ includedir="$ac_optarg" ;;
+
+ -infodir | --infodir | --infodi | --infod | --info | --inf)
+ ac_prev=infodir ;;
+ -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
+ infodir="$ac_optarg" ;;
+
+ -libdir | --libdir | --libdi | --libd)
+ ac_prev=libdir ;;
+ -libdir=* | --libdir=* | --libdi=* | --libd=*)
+ libdir="$ac_optarg" ;;
+
+ -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
+ | --libexe | --libex | --libe)
+ ac_prev=libexecdir ;;
+ -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
+ | --libexe=* | --libex=* | --libe=*)
+ libexecdir="$ac_optarg" ;;
+
+ -localstatedir | --localstatedir | --localstatedi | --localstated \
+ | --localstate | --localstat | --localsta | --localst \
+ | --locals | --local | --loca | --loc | --lo)
+ ac_prev=localstatedir ;;
+ -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
+ | --localstate=* | --localstat=* | --localsta=* | --localst=* \
+ | --locals=* | --local=* | --loca=* | --loc=* | --lo=*)
+ localstatedir="$ac_optarg" ;;
+
+ -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
+ ac_prev=mandir ;;
+ -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
+ mandir="$ac_optarg" ;;
+
+ -nfp | --nfp | --nf)
+ # Obsolete; use --without-fp.
+ with_fp=no ;;
+
+ -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+ | --no-cr | --no-c)
+ no_create=yes ;;
+
+ -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
+ no_recursion=yes ;;
+
+ -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
+ | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
+ | --oldin | --oldi | --old | --ol | --o)
+ ac_prev=oldincludedir ;;
+ -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
+ | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
+ | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
+ oldincludedir="$ac_optarg" ;;
+
+ -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
+ ac_prev=prefix ;;
+ -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
+ prefix="$ac_optarg" ;;
+
+ -program-prefix | --program-prefix | --program-prefi | --program-pref \
+ | --program-pre | --program-pr | --program-p)
+ ac_prev=program_prefix ;;
+ -program-prefix=* | --program-prefix=* | --program-prefi=* \
+ | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
+ program_prefix="$ac_optarg" ;;
+
+ -program-suffix | --program-suffix | --program-suffi | --program-suff \
+ | --program-suf | --program-su | --program-s)
+ ac_prev=program_suffix ;;
+ -program-suffix=* | --program-suffix=* | --program-suffi=* \
+ | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
+ program_suffix="$ac_optarg" ;;
+
+ -program-transform-name | --program-transform-name \
+ | --program-transform-nam | --program-transform-na \
+ | --program-transform-n | --program-transform- \
+ | --program-transform | --program-transfor \
+ | --program-transfo | --program-transf \
+ | --program-trans | --program-tran \
+ | --progr-tra | --program-tr | --program-t)
+ ac_prev=program_transform_name ;;
+ -program-transform-name=* | --program-transform-name=* \
+ | --program-transform-nam=* | --program-transform-na=* \
+ | --program-transform-n=* | --program-transform-=* \
+ | --program-transform=* | --program-transfor=* \
+ | --program-transfo=* | --program-transf=* \
+ | --program-trans=* | --program-tran=* \
+ | --progr-tra=* | --program-tr=* | --program-t=*)
+ program_transform_name="$ac_optarg" ;;
+
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ silent=yes ;;
+
+ -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
+ ac_prev=sbindir ;;
+ -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
+ | --sbi=* | --sb=*)
+ sbindir="$ac_optarg" ;;
+
+ -sharedstatedir | --sharedstatedir | --sharedstatedi \
+ | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
+ | --sharedst | --shareds | --shared | --share | --shar \
+ | --sha | --sh)
+ ac_prev=sharedstatedir ;;
+ -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
+ | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
+ | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
+ | --sha=* | --sh=*)
+ sharedstatedir="$ac_optarg" ;;
+
+ -site | --site | --sit)
+ ac_prev=site ;;
+ -site=* | --site=* | --sit=*)
+ site="$ac_optarg" ;;
+
+ -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
+ ac_prev=srcdir ;;
+ -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
+ srcdir="$ac_optarg" ;;
+
+ -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
+ | --syscon | --sysco | --sysc | --sys | --sy)
+ ac_prev=sysconfdir ;;
+ -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
+ | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
+ sysconfdir="$ac_optarg" ;;
+
+ -target | --target | --targe | --targ | --tar | --ta | --t)
+ ac_prev=target ;;
+ -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
+ target="$ac_optarg" ;;
+
+ -v | -verbose | --verbose | --verbos | --verbo | --verb)
+ verbose=yes ;;
+
+ -version | --version | --versio | --versi | --vers | -V)
+ ac_init_version=: ;;
+
+ -with-* | --with-*)
+ ac_package=`echo "$ac_option"|sed -e 's/-*with-//' -e 's/=.*//'`
+ # Reject names that are not valid shell variable names.
+ if echo "$ac_package" | grep '[^-a-zA-Z0-9_]' >/dev/null 2>&1; then
+ { echo "configure: error: invalid package: $ac_package" 1>&2; exit 1; }
+ fi
+ ac_package=`echo $ac_package| sed 's/-/_/g'`
+ case "$ac_option" in
+ *=*) ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`;;
+ *) ac_optarg=yes ;;
+ esac
+ eval "with_${ac_package}='$ac_optarg'" ;;
+
+ -without-* | --without-*)
+ ac_package=`echo "$ac_option"|sed -e 's/-*without-//'`
+ # Reject names that are not valid shell variable names.
+ if echo "$ac_package" | grep '[^-a-zA-Z0-9_]' >/dev/null 2>&1; then
+ { echo "configure: error: invalid package: $ac_package" 1>&2; exit 1; }
+ fi
+ ac_package=`echo $ac_package| sed 's/-/_/g'`
+ eval "with_${ac_package}=no" ;;
+
+ --x)
+ # Obsolete; use --with-x.
+ with_x=yes ;;
+
+ -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
+ | --x-incl | --x-inc | --x-in | --x-i)
+ ac_prev=x_includes ;;
+ -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
+ | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
+ x_includes="$ac_optarg" ;;
+
+ -x-libraries | --x-libraries | --x-librarie | --x-librari \
+ | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
+ ac_prev=x_libraries ;;
+ -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
+ | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
+ x_libraries="$ac_optarg" ;;
+
+ -*) { echo "configure: error: unrecognized option: $ac_option
+Try \`configure --help' for more information." 1>&2; exit 1; }
+ ;;
+
+ *=*)
+ ac_envvar=`echo "$ac_option" | sed -e 's/=.*//'`
+ # Reject names that are not valid shell variable names.
+ if echo "$ac_envvar" | grep '[^a-zA-Z0-9_]' >/dev/null 2>&1; then
+ { echo "configure: error: invalid variable name: $ac_envvar" 1>&2; exit 1; }
+ fi
+ ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`
+ eval "$ac_envvar='$ac_optarg'"
+ export $ac_envvar ;;
+
+ *)
+ if echo "$ac_option" | grep '[^-a-zA-Z0-9.]' >/dev/null 2>&1; then
+ echo "configure: warning: invalid host type: $ac_option" 1>&2
+ fi
+ if test "x$nonopt" != xNONE; then
+ { echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; }
+ fi
+ nonopt="$ac_option"
+ ;;
+
+ esac
+done
+
+if test -n "$ac_prev"; then
+ { echo "configure: error: missing argument to --\`echo $ac_prev | sed 's/_/-/g'\`" 1>&2; exit 1; }
+fi
+if $ac_init_help; then
+ # Omit some internal or obsolete options to make the list less imposing.
+ # This message is too long to be a string in the A/UX 3.1 sh.
+ cat <<\EOF
+`configure' configures software source code packages to adapt to many kinds
+of systems.
+
+Usage: configure [OPTION]... [VAR=VALUE]... [HOST]
+
+To safely assign special values to environment variables (e.g., CC,
+CFLAGS...), give to `configure' the definition as VAR=VALUE.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+ -h, --help print this message
+ -V, --version print the version of autoconf that created configure
+ -q, --quiet, --silent do not print `checking...' messages
+ --cache-file=FILE cache test results in FILE
+ -n, --no-create do not create output files
+
+EOF
+
+ cat <<EOF
+Directories:
+ --prefix=PREFIX install architecture-independent files in PREFIX
+ [$ac_default_prefix]
+ --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX
+ [same as prefix]
+ --bindir=DIR user executables in DIR [EPREFIX/bin]
+ --sbindir=DIR system admin executables in DIR [EPREFIX/sbin]
+ --libexecdir=DIR program executables in DIR [EPREFIX/libexec]
+ --datadir=DIR read-only architecture-independent data in DIR
+ [PREFIX/share]
+ --sysconfdir=DIR read-only single-machine data in DIR [PREFIX/etc]
+ --sharedstatedir=DIR modifiable architecture-independent data in DIR
+ [PREFIX/com]
+ --localstatedir=DIR modifiable single-machine data in DIR [PREFIX/var]
+ --libdir=DIR object code libraries in DIR [EPREFIX/lib]
+ --includedir=DIR C header files in DIR [PREFIX/include]
+ --oldincludedir=DIR C header files for non-gcc in DIR [/usr/include]
+ --infodir=DIR info documentation in DIR [PREFIX/info]
+ --mandir=DIR man documentation in DIR [PREFIX/man]
+ --srcdir=DIR find the sources in DIR [configure dir or ..]
+EOF
+
+ cat <<\EOF
+
+Host type:
+ --build=BUILD configure for building on BUILD [BUILD=HOST]
+ --host=HOST configure for HOST [guessed]
+ --target=TARGET configure for TARGET [TARGET=HOST]
+EOF
+
+ cat <<\EOF
+
+Program names:
+ --program-prefix=PREFIX prepend PREFIX to installed program names
+ --program-suffix=SUFFIX append SUFFIX to installed program names
+ --program-transform-name=PROGRAM run sed PROGRAM on installed program names
+
+Optional Features:
+ --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no)
+ --enable-FEATURE=ARG include FEATURE ARG=yes
+ --disable-dependency-tracking Speeds up one-time builds
+ --enable-dependency-tracking Do not reject slow dependency extractors
+ --enable-maintainer-mode enable make rules and dependencies not useful
+ (and sometimes confusing) to the casual installer
+ --enable-assert enable ASSERT checking default=no
+ --enable-alloca use alloca for temp space default=yes
+ --enable-fft enable FFTs for multiplication default=no
+ --enable-mpbsd build Berkley MP compatibility library default=no
+ --enable-mpfr build MPFR default=no
+ --enable-shared=PKGS build shared libraries default=yes
+ --enable-static=PKGS build static libraries default=yes
+ --enable-fast-install=PKGS optimize for fast installation default=yes
+ --disable-libtool-lock avoid locking (might break parallel builds)
+
+Optional Packages:
+ --with-PACKAGE=ARG use PACKAGE ARG=yes
+ --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
+ --with-gnu-ld assume the C compiler uses GNU ld default=no
+ --with-pic try to use only PIC/non-PIC objects default=use both
+EOF
+ exit 0
+fi
+if $ac_init_version; then
+ cat <<\EOF
+Generated automatically using Autoconf version 2.14a.
+Copyright (C) 1992, 93, 94, 95, 96, 98, 99, 2000
+Free Software Foundation, Inc.
+
+This configure script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it.
+EOF
+ exit 0
+fi
+trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15
+
+# Keep a trace of the command line.
+# Strip out --no-create and --no-recursion so they do not pile up.
+# Also quote any args containing shell meta-characters.
+ac_configure_args=
+for ac_arg
+do
+ case "$ac_arg" in
+ -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+ | --no-cr | --no-c) ;;
+ -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) ;;
+ *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?\"\']*)
+ ac_arg=`echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"`
+ ac_configure_args="$ac_configure_args '$ac_arg'" ;;
+ *) ac_configure_args="$ac_configure_args $ac_arg" ;;
+ esac
+done
+
+# File descriptor usage:
+# 0 standard input
+# 1 file creation
+# 2 errors and warnings
+# 3 some systems may open it to /dev/tty
+# 4 used on the Kubota Titan
+# 6 checking for... messages and results
+# 5 compiler messages saved in config.log
+if test "$silent" = yes; then
+ exec 6>/dev/null
+else
+ exec 6>&1
+fi
+exec 5>./config.log
+
+echo "\
+This file contains any messages produced by compilers while
+running configure, to aid debugging if configure makes a mistake.
+
+It was created by configure version 2.14a, executed with
+ > $0 $ac_configure_args
+" 1>&5
+
+# NLS nuisances.
+# Only set these to C if already set. These must not be set unconditionally
+# because not all systems understand e.g. LANG=C (notably SCO).
+# Fixing LC_MESSAGES prevents Solaris sh from translating var values in `set'!
+# Non-C LC_CTYPE values break the ctype check.
+if test "${LANG+set}" = set; then LANG=C; export LANG; fi
+if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi
+if test "${LC_MESSAGES+set}" = set; then LC_MESSAGES=C; export LC_MESSAGES; fi
+if test "${LC_CTYPE+set}" = set; then LC_CTYPE=C; export LC_CTYPE; fi
+
+# confdefs.h avoids OS command line length limits that DEFS can exceed.
+rm -rf conftest* confdefs.h
+# AIX cpp loses on an empty file, so make sure it contains at least a newline.
+echo >confdefs.h
+
+# A filename unique to this package, relative to the directory that
+# configure is in, which we can look for to find out if srcdir is correct.
+ac_unique_file=
+
+# Find the source files, if location was not specified.
+if test -z "$srcdir"; then
+ ac_srcdir_defaulted=yes
+ # Try the directory containing this script, then its parent.
+ ac_prog=$0
+ ac_confdir=`echo "$ac_prog" | sed 's%/[^/][^/]*$%%'`
+ test "x$ac_confdir" = "x$ac_prog" && ac_confdir=.
+ srcdir=$ac_confdir
+ if test ! -r $srcdir/$ac_unique_file; then
+ srcdir=..
+ fi
+else
+ ac_srcdir_defaulted=no
+fi
+if test ! -r $srcdir/$ac_unique_file; then
+ if test "$ac_srcdir_defaulted" = yes; then
+ { echo "configure: error: cannot find sources in $ac_confdir or .." 1>&2; exit 1; }
+ else
+ { echo "configure: error: cannot find sources in $srcdir" 1>&2; exit 1; }
+ fi
+fi
+srcdir=`echo "$srcdir" | sed 's%\([^/]\)/*$%\1%'`
+
+# Prefer explicitly selected file to automatically selected ones.
+if test -z "$CONFIG_SITE"; then
+ if test "x$prefix" != xNONE; then
+ CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site"
+ else
+ CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site"
+ fi
+fi
+for ac_site_file in $CONFIG_SITE; do
+ if test -r "$ac_site_file"; then
+ echo "loading site script $ac_site_file"
+ . "$ac_site_file"
+ fi
+done
+
+if test -r "$cache_file"; then
+ echo "loading cache $cache_file"
+ test -f "$cache_file" && . $cache_file
+else
+ echo "creating cache $cache_file"
+ >$cache_file
+fi
+
+ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+ac_exeext=
+ac_objext=o
+# Factoring default headers for most tests.
+ac_includes_default="\
+#include <stdio.h>
+#include <sys/types.h>
+#if STDC_HEADERS
+# include <stdlib.h>
+# include <stddef.h>
+#else
+# if HAVE_STDLIB_H
+# include <stdlib.h>
+# endif
+#endif
+#if HAVE_STRING_H
+# if !STDC_HEADERS && HAVE_MEMORY_H
+# include <memory.h>
+# endif
+# include <string.h>
+#else
+# if HAVE_STRINGS_H
+# include <strings.h>
+# endif
+#endif
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#if HAVE_UNISTD_H
+# include <unistd.h>
+#endif"
+
+if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
+ # Stardent Vistra SVR4 grep lacks -e, says Kaveh R. Ghazi.
+ if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
+ ECHO_N= ECHO_C='
+' ECHO_T=' '
+ else
+ ECHO_N=-n ECHO_C= ECHO_T=
+ fi
+else
+ ECHO_N= ECHO_C='\c' ECHO_T=
+fi
+
+ac_aux_dir=
+for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do
+ if test -f $ac_dir/install-sh; then
+ ac_aux_dir=$ac_dir
+ ac_install_sh="$ac_aux_dir/install-sh -c"
+ break
+ elif test -f $ac_dir/install.sh; then
+ ac_aux_dir=$ac_dir
+ ac_install_sh="$ac_aux_dir/install.sh -c"
+ break
+ elif test -f $ac_dir/shtool; then
+ ac_aux_dir=$ac_dir
+ ac_install_sh="$ac_aux_dir/shtool install -c"
+ break
+ fi
+done
+if test -z "$ac_aux_dir"; then
+ { echo "configure: error: cannot find install-sh or install.sh in $srcdir $srcdir/.. $srcdir/../.." 1>&2; exit 1; }
+fi
+ac_config_guess="$SHELL $ac_aux_dir/config.guess"
+ac_config_sub="$SHELL $ac_aux_dir/config.sub"
+ac_configure="$SHELL $ac_aux_dir/configure" # This should be Cygnus configure.
+
+echo $ECHO_N "checking host system type... $ECHO_C" 1>&6
+echo "configure:636: checking host system type" 1>&5
+if test "x$ac_cv_host" = "x" || (test "x$host" != "xNONE" && test "x$host" != "x$ac_cv_host_alias"); then
+
+ # Make sure we can run config.sub.
+ if $ac_config_sub sun4 >/dev/null 2>&1; then :; else
+ { echo "configure: error: cannot run $ac_config_sub" 1>&2; exit 1; }
+ fi
+
+ ac_cv_host_alias=$host
+ case "$ac_cv_host_alias" in
+ NONE)
+ case $nonopt in
+ NONE)
+ if ac_cv_host_alias=`$ac_config_guess`; then :
+ else { echo "configure: error: cannot guess host type; you must specify one" 1>&2; exit 1; }
+ fi ;; *) ac_cv_host_alias=$nonopt ;;
+ esac ;;
+ esac
+
+ ac_cv_host=`$ac_config_sub $ac_cv_host_alias` || exit 1
+ ac_cv_host_cpu=`echo $ac_cv_host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+ ac_cv_host_vendor=`echo $ac_cv_host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+ ac_cv_host_os=`echo $ac_cv_host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+else
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+fi
+
+echo "$ECHO_T""$ac_cv_host" 1>&6
+
+host=$ac_cv_host
+host_alias=$ac_cv_host_alias
+host_cpu=$ac_cv_host_cpu
+host_vendor=$ac_cv_host_vendor
+host_os=$ac_cv_host_os
+
+echo $ECHO_N "checking target system type... $ECHO_C" 1>&6
+echo "configure:672: checking target system type" 1>&5
+if test "x$ac_cv_target" = "x" || (test "x$target" != "xNONE" && test "x$target" != "x$ac_cv_target_alias"); then
+
+ # Make sure we can run config.sub.
+ if $ac_config_sub sun4 >/dev/null 2>&1; then :; else
+ { echo "configure: error: cannot run $ac_config_sub" 1>&2; exit 1; }
+ fi
+
+ ac_cv_target_alias=$target
+ case "$ac_cv_target_alias" in
+ NONE)
+ case $nonopt in
+ NONE)
+ ac_cv_target_alias=$host_alias ;;
+ *) ac_cv_target_alias=$nonopt ;;
+ esac ;;
+ esac
+
+ ac_cv_target=`$ac_config_sub $ac_cv_target_alias` || exit 1
+ ac_cv_target_cpu=`echo $ac_cv_target | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+ ac_cv_target_vendor=`echo $ac_cv_target | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+ ac_cv_target_os=`echo $ac_cv_target | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+else
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+fi
+
+echo "$ECHO_T""$ac_cv_target" 1>&6
+
+target=$ac_cv_target
+target_alias=$ac_cv_target_alias
+target_cpu=$ac_cv_target_cpu
+target_vendor=$ac_cv_target_vendor
+target_os=$ac_cv_target_os
+
+echo $ECHO_N "checking build system type... $ECHO_C" 1>&6
+echo "configure:707: checking build system type" 1>&5
+if test "x$ac_cv_build" = "x" || (test "x$build" != "xNONE" && test "x$build" != "x$ac_cv_build_alias"); then
+
+ # Make sure we can run config.sub.
+ if $ac_config_sub sun4 >/dev/null 2>&1; then :; else
+ { echo "configure: error: cannot run $ac_config_sub" 1>&2; exit 1; }
+ fi
+
+ ac_cv_build_alias=$build
+ case "$ac_cv_build_alias" in
+ NONE)
+ case $nonopt in
+ NONE)
+ ac_cv_build_alias=$host_alias ;;
+ *) ac_cv_build_alias=$nonopt ;;
+ esac ;;
+ esac
+
+ ac_cv_build=`$ac_config_sub $ac_cv_build_alias` || exit 1
+ ac_cv_build_cpu=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+ ac_cv_build_vendor=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+ ac_cv_build_os=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+else
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+fi
+
+echo "$ECHO_T""$ac_cv_build" 1>&6
+
+build=$ac_cv_build
+build_alias=$ac_cv_build_alias
+build_cpu=$ac_cv_build_cpu
+build_vendor=$ac_cv_build_vendor
+build_os=$ac_cv_build_os
+
+# Do some error checking and defaulting for the host and target type.
+# The inputs are:
+# configure --host=HOST --target=TARGET --build=BUILD NONOPT
+#
+# The rules are:
+# 1. You are not allowed to specify --host, --target, and nonopt at the
+# same time.
+# 2. Host defaults to nonopt.
+# 3. If nonopt is not specified, then host defaults to the current host,
+# as determined by config.guess.
+# 4. Target and build default to nonopt.
+# 5. If nonopt is not specified, then target and build default to host.
+
+# The aliases save the names the user supplied, while $host etc.
+# will get canonicalized.
+case $host---$target---$nonopt in
+NONE---*---* | *---NONE---* | *---*---NONE) ;;
+*) { echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; } ;;
+esac
+
+test "$host_alias" != "$target_alias" &&
+ test "$program_prefix$program_suffix$program_transform_name" = \
+ NONENONEs,x,x, &&
+ program_prefix=${target_alias}-
+
+# Find a good install program. We prefer a C program (faster),
+# so one script is as good as another. But avoid the broken or
+# incompatible versions:
+# SysV /etc/install, /usr/sbin/install
+# SunOS /usr/etc/install
+# IRIX /sbin/install
+# AIX /bin/install
+# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
+# AFS /usr/afsws/bin/install, which mishandles nonexistent args
+# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
+# ./install, which can be erroneously created by make from ./install.sh.
+echo $ECHO_N "checking for a BSD compatible install... $ECHO_C" 1>&6
+echo "configure:778: checking for a BSD compatible install" 1>&5
+if test -z "$INSTALL"; then
+if test "${ac_cv_path_install+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ IFS="${IFS= }"; ac_save_IFS="$IFS"; IFS=":"
+ for ac_dir in $PATH; do
+ # Account for people who put trailing slashes in PATH elements.
+ case "$ac_dir/" in
+ /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;;
+ *)
+ # OSF1 and SCO ODT 3.0 have their own names for install.
+ # Don't use installbsd from OSF since it installs stuff as root
+ # by default.
+ for ac_prog in ginstall scoinst install; do
+ if test -f $ac_dir/$ac_prog; then
+ if test $ac_prog = install &&
+ grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then
+ # AIX install. It has an incompatible calling convention.
+ :
+ elif test $ac_prog = install &&
+ grep pwplus $ac_dir/$ac_prog >/dev/null 2>&1; then
+ # program-specific install script used by HP pwplus--don't use.
+ :
+ else
+ ac_cv_path_install="$ac_dir/$ac_prog -c"
+ break 2
+ fi
+ fi
+ done
+ ;;
+ esac
+ done
+ IFS="$ac_save_IFS"
+
+fi
+ if test "${ac_cv_path_install+set}" = set; then
+ INSTALL="$ac_cv_path_install"
+ else
+ # As a last resort, use the slow shell script. We don't cache a
+ # path for INSTALL within a source directory, because that will
+ # break other packages using the cache if that directory is
+ # removed, or if the path is relative.
+ INSTALL="$ac_install_sh"
+ fi
+fi
+echo "$ECHO_T""$INSTALL" 1>&6
+
+# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
+# It thinks the first close brace ends the variable substitution.
+test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
+
+test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}'
+
+test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
+
+echo $ECHO_N "checking whether build environment is sane... $ECHO_C" 1>&6
+echo "configure:835: checking whether build environment is sane" 1>&5
+# Just in case
+sleep 1
+echo timestamp > conftestfile
+# Do `set' in a subshell so we don't clobber the current shell's
+# arguments. Must try -L first in case configure is actually a
+# symlink; some systems play weird games with the mod time of symlinks
+# (eg FreeBSD returns the mod time of the symlink's containing
+# directory).
+if (
+ set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null`
+ if test "$*" = "X"; then
+ # -L didn't work.
+ set X `ls -t $srcdir/configure conftestfile`
+ fi
+ if test "$*" != "X $srcdir/configure conftestfile" \
+ && test "$*" != "X conftestfile $srcdir/configure"; then
+
+ # If neither matched, then we have a broken ls. This can happen
+ # if, for instance, CONFIG_SHELL is bash and it inherits a
+ # broken ls alias from the environment. This has actually
+ # happened. Such a system could not be considered "sane".
+ { echo "configure: error: ls -t appears to fail. Make sure there is not a broken
+alias in your environment" 1>&2; exit 1; }
+ fi
+
+ test "$2" = conftestfile
+ )
+then
+ # Ok.
+ :
+else
+ { echo "configure: error: newly created file is older than distributed files!
+Check your system clock" 1>&2; exit 1; }
+fi
+rm -f conftest*
+echo "$ECHO_T""yes" 1>&6
+if test "$program_transform_name" = s,x,x,; then
+ program_transform_name=
+else
+ # Double any \ or $. echo might interpret backslashes.
+ cat <<\EOF >conftestsed
+s,\\,\\\\,g; s,\$,$$,g
+EOF
+ program_transform_name=`echo $program_transform_name | sed -f conftestsed`
+ rm -f conftestsed
+fi
+test "$program_prefix" != NONE &&
+ program_transform_name="s,^,${program_prefix},;$program_transform_name"
+# Use a double $ so make ignores it.
+test "$program_suffix" != NONE &&
+ program_transform_name="s,\$\$,${program_suffix},;$program_transform_name"
+
+# sed with no file args requires a program.
+test "$program_transform_name" = "" && program_transform_name="s,x,x,"
+
+test x"${MISSING+set}" = xset || \
+ MISSING="\${SHELL} `CDPATH=: && cd $ac_aux_dir && pwd`/missing"
+if eval "$MISSING --run :"; then
+ am_missing_run="$MISSING --run "
+else
+ am_missing_run=
+ am_backtick='`'
+ echo "configure: warning: ${am_backtick}missing' script is too old or missing" 1>&2
+fi
+
+for ac_prog in mawk gawk nawk awk
+do
+# Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:906: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_AWK+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$AWK"; then
+ ac_cv_prog_AWK="$AWK" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_AWK="$ac_prog"
+ break
+ done
+fi
+fi
+AWK="$ac_cv_prog_AWK"
+if test -n "$AWK"; then
+ echo "$ECHO_T""$AWK" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+test -n "$AWK" && break
+done
+
+echo $ECHO_N "checking whether ${MAKE-make} sets \${MAKE}... $ECHO_C" 1>&6
+echo "configure:939: checking whether ${MAKE-make} sets \${MAKE}" 1>&5
+set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y%./+-%__p_%'`
+if eval "test \"\${ac_cv_prog_make_${ac_make}_set+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftestmake <<\EOF
+all:
+ @echo 'ac_maketemp="${MAKE}"'
+EOF
+# GNU make sometimes prints "make[1]: Entering...", which would confuse us.
+eval `${MAKE-make} -f conftestmake 2>/dev/null | grep temp=`
+if test -n "$ac_maketemp"; then
+ eval ac_cv_prog_make_${ac_make}_set=yes
+else
+ eval ac_cv_prog_make_${ac_make}_set=no
+fi
+rm -f conftestmake
+fi
+if eval "test \"`echo '$ac_cv_prog_make_'${ac_make}_set`\" = yes"; then
+ echo "$ECHO_T""yes" 1>&6
+ SET_MAKE=
+else
+ echo "$ECHO_T""no" 1>&6
+ SET_MAKE="MAKE=${MAKE-make}"
+fi
+
+# Check whether --enable-dependency-tracking or --disable-dependency-tracking was given.
+if test "${enable_dependency_tracking+set}" = set; then
+ enableval="$enable_dependency_tracking"
+
+fi
+if test "x$enable_dependency_tracking" = xno; then
+ AMDEP="#"
+else
+ am_depcomp="$ac_aux_dir/depcomp"
+ if test ! -f "$am_depcomp"; then
+ AMDEP="#"
+ else
+ AMDEP=
+ fi
+fi
+
+if test -z "$AMDEP"; then
+ AMDEPBACKSLASH='\'
+else
+ AMDEPBACKSLASH=
+fi
+
+if test -d .deps || mkdir .deps 2> /dev/null || test -d .deps; then
+ DEPDIR=.deps
+else
+ DEPDIR=_deps
+fi
+
+PACKAGE=gmp
+
+VERSION=3.1.1
+
+if test "`CDPATH=: && cd $srcdir && pwd`" != "`pwd`" &&
+ test -f $srcdir/config.status; then
+ { echo "configure: error: source directory already configured; run "make distclean" there first" 1>&2; exit 1; }
+fi
+cat >>confdefs.h <<EOF
+#define PACKAGE "$PACKAGE"
+EOF
+
+cat >>confdefs.h <<EOF
+#define VERSION "$VERSION"
+EOF
+
+ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal"}
+
+AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"}
+
+AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake"}
+
+AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"}
+
+MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
+
+AMTAR=${AMTAR-"${am_missing_run}tar"}
+
+if test -z "$install_sh"; then
+ install_sh="$ac_aux_dir/install-sh"
+ test -f "$install_sh" || install_sh="$ac_aux_dir/install.sh"
+ test -f "$install_sh" || install_sh="${am_missing_run}${ac_auxdir}/install-sh"
+ install_sh="`echo $install_sh | sed -e 's/\${SHELL}//'`"
+fi
+
+echo $ECHO_N "checking whether to enable maintainer-specific portions of Makefiles... $ECHO_C" 1>&6
+echo "configure:1029: checking whether to enable maintainer-specific portions of Makefiles" 1>&5
+ # Check whether --enable-maintainer-mode or --disable-maintainer-mode was given.
+if test "${enable_maintainer_mode+set}" = set; then
+ enableval="$enable_maintainer_mode"
+ USE_MAINTAINER_MODE=$enableval
+else
+ USE_MAINTAINER_MODE=no
+fi
+ echo "$ECHO_T""$USE_MAINTAINER_MODE" 1>&6
+
+if test $USE_MAINTAINER_MODE = yes; then
+ MAINTAINER_MODE_TRUE=
+ MAINTAINER_MODE_FALSE='#'
+else
+ MAINTAINER_MODE_TRUE='#'
+ MAINTAINER_MODE_FALSE=
+fi
+ MAINT=$MAINTAINER_MODE_TRUE
+
+gmp_configm4="config.m4"
+gmp_tmpconfigm4=cnfm4.tmp
+gmp_tmpconfigm4i=cnfm4i.tmp
+gmp_tmpconfigm4p=cnfm4p.tmp
+test -f $gmp_tmpconfigm4 && rm $gmp_tmpconfigm4
+test -f $gmp_tmpconfigm4i && rm $gmp_tmpconfigm4i
+test -f $gmp_tmpconfigm4p && rm $gmp_tmpconfigm4p
+
+# Check whether --enable-assert or --disable-assert was given.
+if test "${enable_assert+set}" = set; then
+ enableval="$enable_assert"
+ case "${enableval}" in
+yes|no) ;;
+*) { echo "configure: error: bad value ${enableval} for --enable-assert, need yes or no" 1>&2; exit 1; } ;;
+esac
+else
+ enable_assert=no
+fi
+
+if test "$enable_assert" = "yes"; then
+ cat >>confdefs.h <<\EOF
+#define WANT_ASSERT 1
+EOF
+
+fi
+
+# Check whether --enable-alloca or --disable-alloca was given.
+if test "${enable_alloca+set}" = set; then
+ enableval="$enable_alloca"
+ case "${enableval}" in
+yes|no) ;;
+*) { echo "configure: error: bad value ${enableval} for --enable-alloca, need yes or no" 1>&2; exit 1; } ;;
+esac
+else
+ enable_alloca=yes
+fi
+
+if test "$enable_alloca" = "no"; then
+ cat >>confdefs.h <<\EOF
+#define USE_STACK_ALLOC 1
+EOF
+
+fi
+
+# Check whether --enable-fft or --disable-fft was given.
+if test "${enable_fft+set}" = set; then
+ enableval="$enable_fft"
+ case "${enableval}" in
+yes|no) ;;
+*) { echo "configure: error: bad value ${enableval} for --enable-fft, need yes or no" 1>&2; exit 1; } ;;
+esac
+else
+ enable_fft=no
+fi
+
+if test "$enable_fft" = "yes"; then
+ cat >>confdefs.h <<\EOF
+#define WANT_FFT 1
+EOF
+
+fi
+
+# Check whether --enable-mpbsd or --disable-mpbsd was given.
+if test "${enable_mpbsd+set}" = set; then
+ enableval="$enable_mpbsd"
+ case "${enableval}" in
+yes|no) ;;
+*) { echo "configure: error: bad value ${enableval} for --enable-mpbsd, need yes or no" 1>&2; exit 1; } ;;
+esac
+else
+ enable_mpbsd=no
+fi
+
+if test "$enable_mpbsd" = "yes"; then
+ WANT_MPBSD_TRUE=
+ WANT_MPBSD_FALSE='#'
+else
+ WANT_MPBSD_TRUE='#'
+ WANT_MPBSD_FALSE=
+fi
+
+# Check whether --enable-mpfr or --disable-mpfr was given.
+if test "${enable_mpfr+set}" = set; then
+ enableval="$enable_mpfr"
+ case "${enableval}" in
+yes|no) ;;
+*) { echo "configure: error: bad value ${enableval} for --enable-mpfr, need yes or no" 1>&2; exit 1; } ;;
+esac
+else
+ enable_mpfr=no
+fi
+
+if test "$enable_mpfr" = "yes"; then
+ WANT_MPFR_TRUE=
+ WANT_MPFR_FALSE='#'
+else
+ WANT_MPFR_TRUE='#'
+ WANT_MPFR_FALSE=
+fi
+
+os_64bit="no"
+cclist="gcc cc" # FIXME: Prefer c89 to cc.
+gmp_cflags_gcc="-g -O2"
+gmp_cflags64_gcc="-g -O2"
+gmp_cflags_cc="-g"
+gmp_cflags64_cc="-g"
+
+case "$target" in
+ # Alpha
+ alpha*-cray-unicos*)
+ # Don't perform any assembly syntax tests on this beast.
+ gmp_no_asm_syntax_testing=yes
+ cclist=cc
+ gmp_cflags_cc="$gmp_cflags_cc -O"
+ ;;
+ alpha*-*-osf*)
+ flavour=`echo $target_cpu | sed 's/^alpha//g'`
+ if test -n "$flavour"; then
+ case $flavour in # compilers don't seem to understand `ev67' and such.
+ ev6? | ev7*) flavour=ev6;;
+ esac
+ gmp_optcflags_gcc="-mcpu=$flavour"
+ # FIXME: We shouldn't fail fatally if none of these work, but that's
+ # how xoptcflags work and we don't have any other mechanism right now.
+ # Why do we need this here and not for alpha*-*-* below?
+ gmp_xoptcflags_gcc="-Wa,-arch,${flavour} -Wa,-m${flavour}"
+ gmp_optcflags_cc="-arch $flavour -tune $flavour"
+ fi
+ ;;
+ alpha*-*-*)
+ cclist="gcc"
+ flavour=`echo $target_cpu | sed 's/^alpha//g'`
+ if test -n "$flavour"; then
+ case $flavour in
+ ev6? | ev7*) flavour=ev6;;
+ esac
+ gmp_optcflags_gcc="-mcpu=$flavour"
+ fi
+ ;;
+ # Cray vector machines. This must come after alpha* so that we can
+ # recognize present and future vector processors with a wildcard.
+ *-cray-unicos*)
+ # Don't perform any assembly syntax tests on this beast.
+ gmp_no_asm_syntax_testing=yes
+ cclist=cc
+ # Don't inherit default gmp_cflags_cc value; it comes with -g which
+ # disables all optimization on Cray vector systems
+ gmp_cflags_cc="-O"
+ ;;
+
+ # AMD and Intel x86 configurations
+ i?86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*)
+ # Rumour has it -O2 used to give worse register allocation than just -O.
+ gmp_cflags_gcc="-g -O -fomit-frame-pointer"
+
+ case "${target}" in
+ i386*-*-*) gmp_optcflags_gcc="-mcpu=i386 -march=i386";;
+ i486*-*-*) gmp_optcflags_gcc="-mcpu=i486 -march=i486";;
+ i586*-*-* | pentium-*-* | pentiummmx-*-*)
+ gmp_optcflags_gcc="-mcpu=pentium -march=pentium";;
+
+ # -march=pentiumpro not used because mpz/powm.c (swox cvs rev 1.4)
+ # tickles a bug in gcc 2.95.2 (believed fixed in 2.96).
+ i686*-*-* | pentiumpro-*-* | pentium[23]-*-*)
+ gmp_optcflags_gcc="-mcpu=pentiumpro";;
+
+ k6*-*-*) gmp_optcflags_gcc="-mcpu=k6 -march=k6";;
+
+ # Athlon instruction costs are close to p6: 3 cycle load latency, 4-6
+ # cycle mul, 40 cycle div, pairable adc, ...
+ # FIXME: Change this when gcc gets something specific for Athlon.
+ # -march=pentiumpro not used, per i686 above.
+ athlon-*-*) gmp_optcflags_gcc="-mcpu=pentiumpro";;
+ esac
+ ;;
+
+ # Sparc
+ ultrasparc*-*-solaris2.[7-9] | sparcv9-*-solaris2.[7-9])
+ os_64bit=yes
+ gmp_cflags_gcc="$gmp_cflags_gcc -Wa,-xarch=v8plus"
+ gmp_xoptcflags_gcc="-mcpu=v9 -mcpu=v8 -mv8"
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -m64 -mptr64 -Wa,-xarch=v9 -mcpu=v9"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ gmp_cflags64_cc="-xtarget=native -xarch=v9 -xO4"
+ ;;
+ sparc64-*-linux*)
+ # Need to think more about the options passed here. This isn't good for
+ # some sparc64 linux distros, since we end up not optimizing when all the
+ # options below fail.
+ os_64bit=yes
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -m64 -mptr64 -Wa,-xarch=v9 -mcpu=v9"
+ gmp_cflags_gcc="$gmp_cflags_gcc -m32"
+ gmp_xoptflags_gcc="-mcpu=ultrasparc -mvis"
+ ;;
+ ultrasparc*-*-* | sparcv9-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -Wa,-xarch=v8plus"
+ gmp_xoptcflags_gcc="-mcpu=v9 -mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ ;;
+ sparcv8*-*-solaris2.* | microsparc*-*-solaris2.*)
+ gmp_cflags_gcc="$gmp_cflags_gcc"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ ;;
+ sparcv8*-*-* | microsparc*-*-*) # SunOS, Linux, *BSD
+ cclist="gcc acc cc"
+ gmp_cflags_gcc="$gmp_cflags_gcc"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_acc="-g -O2 -cg92"
+ gmp_cflags_cc="-O2" # FIXME: Flag for v8?
+ ;;
+ supersparc*-*-solaris2.*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -DSUPERSPARC"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4 -DSUPERSPARC"
+ ;;
+ supersparc*-*-*) # SunOS, Linux, *BSD
+ cclist="gcc acc cc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -DSUPERSPARC"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_acc="-g -O2 -cg92 -DSUPERSPARC"
+ gmp_cflags_cc="-O2 -DSUPERSPARC" # FIXME: Flag for v8?
+ ;;
+ *sparc*-*-*)
+ cclist="gcc acc cc"
+ gmp_cflags_acc="-g -O2"
+ gmp_cflags_cc="-g -O2"
+ ;;
+
+ # POWER/PowerPC
+ powerpc64-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -maix64 -mpowerpc64"
+ gmp_cflags_xlc="-g -O2 -q64 -qtune=pwr3"
+ ;;
+ powerpc*-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc"
+ gmp_cflags_xlc="$gmp_cflags_cc -qarch=ppc -O2"
+ ;;
+ power-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpower"
+ gmp_cflags_xlc="$gmp_cflags_cc -qarch=pwr -O2"
+ ;;
+ powerpc64*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc64"
+ cat >>confdefs.h <<\EOF
+#define _LONG_LONG_LIMB 1
+EOF
+ ;;
+ powerpc-apple-darwin* | powerpc-apple-macosx*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc -traditional-cpp"
+ ;;
+ powerpc*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc"
+ ;;
+
+ # MIPS
+ mips-sgi-irix6.*)
+ os_64bit=yes
+ gmp_cflags64_gcc="-g -O2 -mabi=n32"
+ gmp_cflags64_cc="$gmp_cflags64_cc -O2 -n32"
+ ;;
+
+ # Motorola 68k family
+ m88110*-*-*)
+ gmp_cflags_gcc="-g -O -m88110" ;;
+ m68*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -fomit-frame-pointer"
+ ;;
+
+ # HP
+ hppa1.0*-*-*)
+ cclist="gcc c89 cc"
+ gmp_cflags_c89="$gmp_cflags_cc +O2"
+ gmp_cflags_cc="$gmp_cflags_cc +O2"
+ ;;
+ hppa2.0w*-*-*)
+ cclist="c89 cc"
+ gmp_cflags_c89="+DD64 +O3"
+ gmp_cflags_cc="+DD64 +O3"
+ ;;
+ hppa2.0*-*-*)
+ os_64bit=yes
+ cclist="gcc c89 cc"
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -mWHAT -D_LONG_LONG_LIMB"
+ # +O2 to cc triggers bug in mpz/powm.c (1.4)
+ gmp_cflags64_c89="+DA2.0 +e +O3 -D_LONG_LONG_LIMB"
+ gmp_cflags64_cc="+DA2.0 +e +O3 -D_LONG_LONG_LIMB"
+ gmp_cflags_c89="$gmp_cflags_cc +O2"
+ gmp_cflags_cc="$gmp_cflags_cc +O2"
+ ;;
+
+ # VAX
+ vax*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -fomit-frame-pointer"
+ ;;
+
+ # Fujitsu
+ f30[01]-fujitsu-sysv*)
+ cclist="gcc vcc"
+ gmp_cflags_vcc="-g" # FIXME: flags for vcc?
+ ;;
+esac
+
+case "${target}" in
+ *-*-mingw32) gmp_cflags_gcc="$gmp_cflags_gcc -mno-cygwin";;
+esac
+
+echo $ECHO_N "checking for BSD-compatible nm... $ECHO_C" 1>&6
+echo "configure:1352: checking for BSD-compatible nm" 1>&5
+if test "${ac_cv_path_NM+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$NM"; then
+ # Let the user override the test.
+ ac_cv_path_NM="$NM"
+else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/nm || test -f $ac_dir/nm$ac_exeext ; then
+ # Check to see if the nm accepts a BSD-compat flag.
+ # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+ # nm: unknown option "B" ignored
+ if ($ac_dir/nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+ ac_cv_path_NM="$ac_dir/nm -B"
+ break
+ elif ($ac_dir/nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+ ac_cv_path_NM="$ac_dir/nm -p"
+ break
+ else
+ ac_cv_path_NM=${ac_cv_path_NM="$ac_dir/nm"} # keep the first match, but
+ continue # so that we can try to find one that supports BSD flags
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+ test -z "$ac_cv_path_NM" && ac_cv_path_NM=nm
+fi
+fi
+
+NM="$ac_cv_path_NM"
+echo "$ECHO_T""$NM" 1>&6
+ # nm on 64-bit AIX needs to know the object file format
+case "$target" in
+ powerpc64*-*-aix*)
+ NM="$NM -X 64"
+ ;;
+esac
+
+# Save CFLAGS given on command line.
+gmp_user_CFLAGS="$CFLAGS"
+
+if test -z "$CC"; then
+ # Find compiler.
+
+if test $host != $build; then
+ ac_tool_prefix=${host_alias}-
+else
+ ac_tool_prefix=
+fi
+
+gmp_cc_list="$cclist"
+gmp_req_64bit_cc="$os_64bit"
+
+CC32=
+CC64=
+for c in $gmp_cc_list; do
+ # Avoid cache hits.
+ unset CC
+ unset ac_cv_prog_CC
+
+# Extract the first word of "${ac_tool_prefix}$c", so it can be a program name with args.
+set dummy ${ac_tool_prefix}$c; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:1418: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_CC+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_CC="${ac_tool_prefix}$c"
+ break
+ done
+fi
+fi
+CC="$ac_cv_prog_CC"
+if test -n "$CC"; then
+ echo "$ECHO_T""$CC" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+if test -z "$ac_cv_prog_CC"; then
+ if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "$c", so it can be a program name with args.
+set dummy $c; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:1452: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_CC+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_CC="$c"
+ break
+ done
+ test -z "$ac_cv_prog_CC" && ac_cv_prog_CC="$c"
+fi
+fi
+CC="$ac_cv_prog_CC"
+if test -n "$CC"; then
+ echo "$ECHO_T""$CC" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+ else
+ CC="$c"
+ fi
+fi
+
+ if test -n "$CC"; then
+ eval c_flags=\$gmp_cflags_$c
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+ CC="$CC"
+CFLAGS="$c_flags"
+echo $ECHO_N "checking if the C compiler ($CC) works with flags $CFLAGS... $ECHO_C" 1>&6
+echo "configure:1498: checking if the C compiler ($CC) works with flags $CFLAGS" 1>&5
+
+# Simple test for all targets.
+cat >conftest.$ac_ext <<EOF
+
+#line 1503 "configure"
+#include "confdefs.h"
+
+int main(){return(0);}
+EOF
+if { (eval echo configure:1508: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ tmp_works=yes
+ # If we can't run a trivial program, we are probably using a cross compiler.
+ if (./conftest; exit) 2>/dev/null; then
+ tmp_cross=no
+ else
+ tmp_cross=yes
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ tmp_works=no
+fi
+rm -fr conftest*
+
+# Target specific tests.
+if test "$tmp_works" = "yes"; then
+ case "$target" in
+ *-*-aix*) # Returning a funcptr.
+ cat >conftest.$ac_ext <<EOF
+#line 1528 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+} void *g(); void *f() { return g(); } int bar(){
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:1539: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ tmp_works=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ tmp_works=no
+fi
+rm -f conftest*
+ ;;
+ esac
+fi
+
+if test "$tmp_works" = "yes"; then
+ gmp_prog_cc_works=yes
+else
+ gmp_prog_cc_works=no
+fi
+
+echo "$ECHO_T""$tmp_works" 1>&6
+
+ if test "$gmp_prog_cc_works" != "yes"; then
+ continue
+ fi
+
+ # Save first working compiler, whether 32- or 64-bit capable.
+ if test -z "$CC32"; then
+ CC32="$CC"
+ fi
+ if test "$gmp_req_64bit_cc" = "yes"; then
+ eval c_flags=\$gmp_cflags64_$c
+
+ # Verify that the compiler works in 64-bit mode as well.
+ # /usr/ucb/cc on Solaris 7 can *compile* in 64-bit mode, but not link.
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+ CC="$c"
+CFLAGS="$c_flags"
+echo $ECHO_N "checking if the C compiler ($CC) works with flags $CFLAGS... $ECHO_C" 1>&6
+echo "configure:1583: checking if the C compiler ($CC) works with flags $CFLAGS" 1>&5
+
+# Simple test for all targets.
+cat >conftest.$ac_ext <<EOF
+
+#line 1588 "configure"
+#include "confdefs.h"
+
+int main(){return(0);}
+EOF
+if { (eval echo configure:1593: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ tmp_works=yes
+ # If we can't run a trivial program, we are probably using a cross compiler.
+ if (./conftest; exit) 2>/dev/null; then
+ tmp_cross=no
+ else
+ tmp_cross=yes
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ tmp_works=no
+fi
+rm -fr conftest*
+
+# Target specific tests.
+if test "$tmp_works" = "yes"; then
+ case "$target" in
+ *-*-aix*) # Returning a funcptr.
+ cat >conftest.$ac_ext <<EOF
+#line 1613 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+} void *g(); void *f() { return g(); } int bar(){
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:1624: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ tmp_works=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ tmp_works=no
+fi
+rm -f conftest*
+ ;;
+ esac
+fi
+
+if test "$tmp_works" = "yes"; then
+ gmp_prog_cc_works=yes
+else
+ gmp_prog_cc_works=no
+fi
+
+echo "$ECHO_T""$tmp_works" 1>&6
+
+ if test "$gmp_prog_cc_works" = "yes"; then
+
+ gmp_tmp_CC_save="$CC"
+ CC="$c"
+ echo $ECHO_N "checking whether the C compiler ($CC) is 64-bit capable... $ECHO_C" 1>&6
+echo "configure:1651: checking whether the C compiler ($CC) is 64-bit capable" 1>&5
+ if test -z "$NM"; then
+ echo; echo "configure: GMP_CHECK_CC_64BIT: fatal: need nm"
+ exit 1
+ fi
+ gmp_tmp_CFLAGS_save="$CFLAGS"
+ CFLAGS="$c_flags"
+
+ case "$target" in
+ hppa2.0*-*-*)
+ # FIXME: If gcc is installed under another name than "gcc", we will
+ # test the wrong thing.
+ if test "$CC" != "gcc"; then
+ echo >conftest.c
+ gmp_tmp_vs=`$CC $CFLAGS -V -c -o conftest.o conftest.c 2>&1 | grep "^ccom:"`
+ rm conftest*
+ gmp_tmp_v1=`echo $gmp_tmp_vs | sed 's/.* .\.\(.*\)\..*\..* HP C.*/\1/'`
+ gmp_tmp_v2=`echo $gmp_tmp_vs | sed 's/.* .\..*\.\(.*\)\..* HP C.*/\1/'`
+ gmp_tmp_v3=`echo $gmp_tmp_vs | sed 's/.* .\..*\..*\.\(.*\) HP C.*/\1/'`
+ gmp_cv_cc_64bit=no
+ test -n "$gmp_tmp_v1" && test "$gmp_tmp_v1" -ge "10" \
+ && test -n "$gmp_tmp_v2" && test "$gmp_tmp_v2" -ge "32" \
+ && test -n "$gmp_tmp_v3" && test "$gmp_tmp_v3" -ge "30" \
+ && gmp_cv_cc_64bit=yes
+ else # gcc
+ # FIXME: Compile a minimal file and determine if the resulting object
+ # file is an ELF file. If so, gcc can produce 64-bit code.
+ # Do we have file(1) for target?
+ gmp_cv_cc_64bit=no
+ fi
+ ;;
+ mips-sgi-irix6.*)
+ # We use `-n32' to cc and `-mabi=n32' to gcc, resulting in 64-bit
+ # arithmetic but not 64-bit pointers, so the general test for sizeof
+ # (void *) is not valid.
+ # Simply try to compile an empty main. If that succeeds return
+ # true.
+ cat >conftest.$ac_ext <<EOF
+#line 1689 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:1700: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ gmp_cv_cc_64bit=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ gmp_cv_cc_64bit=no
+fi
+rm -f conftest*
+ ;;
+ *-*-*)
+ # Allocate an array of size sizeof (void *) and use nm to determine its
+ # size. We depend on the first declared variable being put at address 0.
+ cat >conftest.c <<EOF
+char arr[sizeof (void *)]={0};
+char post=0;
+EOF
+ gmp_compile="$CC $CFLAGS -c conftest.c 1>&5"
+ if { (eval echo configure:1719: \"$gmp_compile\") 1>&5; (eval $gmp_compile) 2>&5; }; then
+ gmp_tmp_val=`$NM conftest.o | grep post | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ if test "$gmp_tmp_val" = "8"; then
+ gmp_cv_cc_64bit=yes
+ else
+ gmp_cv_cc_64bit=no
+ fi
+ else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ gmp_cv_cc_64bit=no
+ fi
+ rm -f conftest*
+ ;;
+ esac
+
+ CC="$gmp_tmp_CC_save"
+ CFLAGS="$gmp_tmp_CFLAGS_save"
+ echo "$ECHO_T""$gmp_cv_cc_64bit" 1>&6
+
+ if test "$gmp_cv_cc_64bit" = "yes"; then
+ test -z "$CC64" && CC64="$c"
+ test -z "$CFLAGS64" && CFLAGS64="$c_flags"
+ # We have CC64 so we're done.
+ break
+ fi
+ fi
+ else
+ # We have CC32, and we don't need a 64-bit compiler so we're done.
+ break
+ fi
+ fi
+done
+CC="$CC32"
+
+ # If 64-bit OS and we have a 64-bit compiler, use it.
+ if test -n "$os_64bit" && test -n "$CC64"; then
+ CC=$CC64
+ CFLAGS=$CFLAGS64
+ else
+ eval CFLAGS=\$gmp_cflags_$CC
+ fi
+
+ # Try compiler flags that may work with only some compiler versions.
+ # gmp_optcflags: All or nothing.
+ eval optcflags=\$gmp_optcflags_$CC
+ if test -n "$optcflags"; then
+ CFLAGS_save="$CFLAGS"
+ CFLAGS="$CFLAGS $optcflags"
+ echo $ECHO_N "checking whether $CC accepts $optcflags... $ECHO_C" 1>&6
+echo "configure:1770: checking whether $CC accepts $optcflags" 1>&5
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+ cat >conftest.$ac_ext <<EOF
+
+#line 1780 "configure"
+#include "confdefs.h"
+
+int main(){return(0);}
+EOF
+if { (eval echo configure:1785: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ optok=yes
+ # If we can't run a trivial program, we are probably using a cross compiler.
+ if (./conftest; exit) 2>/dev/null; then
+ cross=no
+ else
+ cross=yes
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ optok=no
+fi
+rm -fr conftest*
+ if test "$optok" = "yes"; then
+ echo "$ECHO_T""yes" 1>&6
+ else
+ echo "$ECHO_T""no" 1>&6
+ CFLAGS="$CFLAGS_save"
+ fi
+ fi
+ # gmp_xoptcflags: First is best, one has to work.
+ eval xoptcflags=\$gmp_xoptcflags_$CC
+ if test -n "$xoptcflags"; then
+ gmp_found="no"
+ for xopt in $xoptcflags; do
+ CFLAGS_save="$CFLAGS"
+ CFLAGS="$CFLAGS $xopt"
+ echo $ECHO_N "checking whether $CC accepts $xopt... $ECHO_C" 1>&6
+echo "configure:1814: checking whether $CC accepts $xopt" 1>&5
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+ cat >conftest.$ac_ext <<EOF
+
+#line 1824 "configure"
+#include "confdefs.h"
+
+int main(){return(0);}
+EOF
+if { (eval echo configure:1829: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ optok=yes
+ # If we can't run a trivial program, we are probably using a cross compiler.
+ if (./conftest; exit) 2>/dev/null; then
+ cross=no
+ else
+ cross=yes
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ optok=no
+fi
+rm -fr conftest*
+ if test "$optok" = "yes"; then
+ echo "$ECHO_T""yes" 1>&6
+ gmp_found="yes"
+ break
+ else
+ echo "$ECHO_T""no" 1>&6
+ CFLAGS="$CFLAGS_save"
+ fi
+ done
+ if test "$gmp_found" = "no"; then
+ echo "$0: fatal: need a compiler that understands one of $xoptcflags"
+ exit 1
+ fi
+ fi
+fi
+
+# Restore CFLAGS given on command line.
+# FIXME: We've run through quite some unnecessary code looking for a
+# nice compiler and working flags for it, just to spoil that with user
+# supplied flags.
+test -n "$gmp_user_CFLAGS" && CFLAGS="$gmp_user_CFLAGS"
+
+# Select chosen compiler.
+
+echo $ECHO_N "checking whether the C compiler ($CC $CFLAGS $CPPFLAGS $LDFLAGS) works... $ECHO_C" 1>&6
+echo "configure:1868: checking whether the C compiler ($CC $CFLAGS $CPPFLAGS $LDFLAGS) works" 1>&5
+
+ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+cat >conftest.$ac_ext <<EOF
+
+#line 1879 "configure"
+#include "confdefs.h"
+
+int main(){return(0);}
+EOF
+if { (eval echo configure:1884: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ ac_cv_prog_cc_works=yes
+ # If we can't run a trivial program, we are probably using a cross compiler.
+ if (./conftest; exit) 2>/dev/null; then
+ ac_cv_prog_cc_cross=no
+ else
+ ac_cv_prog_cc_cross=yes
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ ac_cv_prog_cc_works=no
+fi
+rm -fr conftest*
+ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+echo "$ECHO_T""$ac_cv_prog_cc_works" 1>&6
+if test $ac_cv_prog_cc_works = no; then
+ { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 77; }
+fi
+echo $ECHO_N "checking whether the C compiler ($CC $CFLAGS $CPPFLAGS $LDFLAGS) is a cross-compiler... $ECHO_C" 1>&6
+echo "configure:1910: checking whether the C compiler ($CC $CFLAGS $CPPFLAGS $LDFLAGS) is a cross-compiler" 1>&5
+echo "$ECHO_T""$ac_cv_prog_cc_cross" 1>&6
+cross_compiling=$ac_cv_prog_cc_cross
+
+echo $ECHO_N "checking whether we are using GNU C... $ECHO_C" 1>&6
+echo "configure:1915: checking whether we are using GNU C" 1>&5
+if test "${ac_cv_prog_gcc+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.c <<EOF
+#ifdef __GNUC__
+ yes;
+#endif
+EOF
+if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:1924: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
+ ac_cv_prog_gcc=yes
+else
+ ac_cv_prog_gcc=no
+fi
+fi
+echo "$ECHO_T""$ac_cv_prog_gcc" 1>&6
+
+if test "$ac_cv_prog_gcc" = "yes"; then
+ GCC=yes
+else
+ GCC=
+fi
+
+# Set CFLAGS if not already set.
+if test -z "$CFLAGS"; then
+ CFLAGS="-g"
+ if test "$GCC" = "yes"; then
+ CFLAGS="$CFLAGS -O2"
+ fi
+fi
+
+if test "${ac_cv_prog_CC+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ ac_cv_prog_CC="$CC"
+fi
+
+# How to assemble.
+CCAS="$CC -c"
+
+echo $ECHO_N "checking how to run the C preprocessor... $ECHO_C" 1>&6
+echo "configure:1956: checking how to run the C preprocessor" 1>&5
+# On Suns, sometimes $CPP names a directory.
+if test -n "$CPP" && test -d "$CPP"; then
+ CPP=
+fi
+if test -z "$CPP"; then
+if test "${ac_cv_prog_CPP+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ # This must be in double quotes, not single quotes, because CPP may get
+ # substituted into the Makefile and "${CC-cc}" will confuse make.
+ CPP="${CC-cc} -E"
+ # On the NeXT, cc -E runs the code through the compiler's parser,
+ # not just through cpp.
+
+cat >conftest.$ac_ext <<EOF
+#line 1972 "configure"
+#include "confdefs.h"
+#include <assert.h>
+Syntax Error
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:1978: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ :
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ CPP="${CC-cc} -E -traditional-cpp"
+
+cat >conftest.$ac_ext <<EOF
+#line 1990 "configure"
+#include "confdefs.h"
+#include <assert.h>
+Syntax Error
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:1996: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ :
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ CPP="${CC-cc} -nologo -E"
+
+cat >conftest.$ac_ext <<EOF
+#line 2008 "configure"
+#include "confdefs.h"
+#include <assert.h>
+Syntax Error
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:2014: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ :
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ CPP=/lib/cpp
+fi
+rm -f conftest*
+fi
+rm -f conftest*
+fi
+rm -f conftest*
+ ac_cv_prog_CPP="$CPP"
+fi
+ CPP="$ac_cv_prog_CPP"
+else
+ ac_cv_prog_CPP="$CPP"
+fi
+echo "$ECHO_T""$CPP" 1>&6
+
+# Find a good install program. We prefer a C program (faster),
+# so one script is as good as another. But avoid the broken or
+# incompatible versions:
+# SysV /etc/install, /usr/sbin/install
+# SunOS /usr/etc/install
+# IRIX /sbin/install
+# AIX /bin/install
+# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
+# AFS /usr/afsws/bin/install, which mishandles nonexistent args
+# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
+# ./install, which can be erroneously created by make from ./install.sh.
+echo $ECHO_N "checking for a BSD compatible install... $ECHO_C" 1>&6
+echo "configure:2050: checking for a BSD compatible install" 1>&5
+if test -z "$INSTALL"; then
+if test "${ac_cv_path_install+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ IFS="${IFS= }"; ac_save_IFS="$IFS"; IFS=":"
+ for ac_dir in $PATH; do
+ # Account for people who put trailing slashes in PATH elements.
+ case "$ac_dir/" in
+ /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;;
+ *)
+ # OSF1 and SCO ODT 3.0 have their own names for install.
+ # Don't use installbsd from OSF since it installs stuff as root
+ # by default.
+ for ac_prog in ginstall scoinst install; do
+ if test -f $ac_dir/$ac_prog; then
+ if test $ac_prog = install &&
+ grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then
+ # AIX install. It has an incompatible calling convention.
+ :
+ elif test $ac_prog = install &&
+ grep pwplus $ac_dir/$ac_prog >/dev/null 2>&1; then
+ # program-specific install script used by HP pwplus--don't use.
+ :
+ else
+ ac_cv_path_install="$ac_dir/$ac_prog -c"
+ break 2
+ fi
+ fi
+ done
+ ;;
+ esac
+ done
+ IFS="$ac_save_IFS"
+
+fi
+ if test "${ac_cv_path_install+set}" = set; then
+ INSTALL="$ac_cv_path_install"
+ else
+ # As a last resort, use the slow shell script. We don't cache a
+ # path for INSTALL within a source directory, because that will
+ # break other packages using the cache if that directory is
+ # removed, or if the path is relative.
+ INSTALL="$ac_install_sh"
+ fi
+fi
+echo "$ECHO_T""$INSTALL" 1>&6
+
+# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
+# It thinks the first close brace ends the variable substitution.
+test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
+
+test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}'
+
+test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
+
+echo $ECHO_N "checking whether ln -s works... $ECHO_C" 1>&6
+echo "configure:2107: checking whether ln -s works" 1>&5
+if test "${ac_cv_prog_LN_S+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ rm -f conftestdata
+if ln -s X conftestdata 2>/dev/null
+then
+ rm -f conftestdata
+ ac_cv_prog_LN_S="ln -s"
+else
+ ac_cv_prog_LN_S=ln
+fi
+fi
+LN_S="$ac_cv_prog_LN_S"
+if test "$ac_cv_prog_LN_S" = "ln -s"; then
+ echo "$ECHO_T""yes" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+echo $ECHO_N "checking for suitable m4... $ECHO_C" 1>&6
+echo "configure:2128: checking for suitable m4" 1>&5
+if test "${gmp_cv_prog_m4+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$M4"; then
+ gmp_cv_prog_m4="$M4"
+else
+ cat >conftest.m4 <<\EOF
+define(dollarhash,``$#'')dnl
+ifelse(dollarhash(x),1,`define(t1,Y)',
+``bad: $# not supported (SunOS /usr/bin/m4)
+'')dnl
+ifelse(eval(89),89,`define(t2,Y)',
+`bad: eval() doesnt support 8 or 9 in a constant (OpenBSD 2.6 m4)
+')dnl
+ifelse(t1`'t2,YY,`good
+')dnl
+EOF
+ echo "trying m4" 1>&5
+ gmp_tmp_val="`(m4 conftest.m4) 2>&5`"
+ echo "$gmp_tmp_val" 1>&5
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="m4"
+ else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="$PATH:/usr/5bin"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ echo "trying $ac_dir/m4" 1>&5
+ gmp_tmp_val="`($ac_dir/m4 conftest.m4) 2>&5`"
+ echo "$gmp_tmp_val" 1>&5
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="$ac_dir/m4"
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ if test -z "$gmp_cv_prog_m4"; then
+ { echo "configure: error: No usable m4 in \$PATH or /usr/5bin (see config.log for reasons)." 1>&2; exit 1; }
+ fi
+ fi
+ rm -f conftest.m4
+fi
+fi
+echo "$ECHO_T""$gmp_cv_prog_m4" 1>&6
+M4="$gmp_cv_prog_m4"
+
+# Extract the first word of "ar", so it can be a program name with args.
+set dummy ar; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:2178: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_AR+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$AR"; then
+ ac_cv_prog_AR="$AR" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_AR="ar"
+ break
+ done
+fi
+fi
+AR="$ac_cv_prog_AR"
+if test -n "$AR"; then
+ echo "$ECHO_T""$AR" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+# ar on AIX needs to know the object file format
+case "$target" in
+ powerpc64*-*-aix*)
+ AR="$AR -X 64"
+ ;;
+esac
+
+if test "$gmp_no_asm_syntax_testing" != "yes"; then
+ echo $ECHO_N "checking how to switch to text section... $ECHO_C" 1>&6
+echo "configure:2216: checking how to switch to text section" 1>&5
+if test "${gmp_cv_check_asm_text+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$target" in
+ *-*-aix*)
+
+ gmp_cv_check_asm_text=".csect .text[PR]"
+
+ ;;
+ *-*-hpux*) gmp_cv_check_asm_text=".code" ;;
+ *) gmp_cv_check_asm_text=".text" ;;
+esac
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_text" 1>&6
+echo "define(<TEXT>, <$gmp_cv_check_asm_text>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking how to switch to data section... $ECHO_C" 1>&6
+echo "configure:2235: checking how to switch to data section" 1>&5
+if test "${gmp_cv_check_asm_data+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$target" in
+ *-*-aix*)
+
+ gmp_cv_check_asm_data=".csect .data[RW]"
+
+ ;;
+ *) gmp_cv_check_asm_data=".data" ;;
+esac
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_data" 1>&6
+echo "define(<DATA>, <$gmp_cv_check_asm_data>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking how to export a symbol... $ECHO_C" 1>&6
+echo "configure:2253: checking how to export a symbol" 1>&5
+if test "${gmp_cv_check_asm_globl+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_globl=".export" ;;
+ *) gmp_cv_check_asm_globl=".globl" ;;
+esac
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_globl" 1>&6
+echo "define(<GLOBL>, <$gmp_cv_check_asm_globl>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking what assembly label suffix to use... $ECHO_C" 1>&6
+echo "configure:2267: checking what assembly label suffix to use" 1>&5
+if test "${gmp_cv_check_asm_label_suffix+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_label_suffix="" ;;
+ *) gmp_cv_check_asm_label_suffix=":" ;;
+esac
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_label_suffix" 1>&6
+echo "define(<LABEL_SUFFIX>, <\$1$gmp_cv_check_asm_label_suffix>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking how the .type assembly directive should be used... $ECHO_C" 1>&6
+echo "configure:2281: checking how the .type assembly directive should be used" 1>&5
+if test "${gmp_cv_check_asm_type+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+for gmp_tmp_prefix in @ \# %; do
+ echo " .type sym,${gmp_tmp_prefix}function" > conftest.s
+ if { (eval echo configure:2288: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+ gmp_cv_check_asm_type=".type \$1,${gmp_tmp_prefix}\$2"
+ break
+ fi
+done
+if test -z "$gmp_cv_check_asm_type"; then
+ gmp_cv_check_asm_type="dnl"
+fi
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_type" 1>&6
+echo "define(<TYPE>, <$gmp_cv_check_asm_type>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking if the .size assembly directive works... $ECHO_C" 1>&6
+echo "configure:2302: checking if the .size assembly directive works" 1>&5
+if test "${gmp_cv_check_asm_size+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+echo ' .size sym,1' > conftest.s
+if { (eval echo configure:2308: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+ gmp_cv_check_asm_size=".size \$1,\$2"
+else
+ gmp_cv_check_asm_size="dnl"
+fi
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_size" 1>&6
+echo "define(<SIZE>, <$gmp_cv_check_asm_size>)" >> $gmp_tmpconfigm4
+
+echo $ECHO_N "checking what prefix to use for a local label... $ECHO_C" 1>&6
+echo "configure:2319: checking what prefix to use for a local label" 1>&5
+if test "${gmp_cv_check_asm_lsym_prefix+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -z "$NM"; then
+ echo; echo "GMP_CHECK_ASM_LSYM_PREFIX: fatal: need nm"
+ exit 1
+fi
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+gmp_cv_check_asm_lsym_prefix="L"
+for gmp_tmp_pre in L .L $ L$; do
+ cat > conftest.s <<EOF
+dummy${gmp_cv_check_asm_label_suffix}
+${gmp_tmp_pre}gurkmacka${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if { (eval echo configure:2335: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+ $NM conftest.o >/dev/null 2>&1
+ gmp_rc=$?
+ if test "$gmp_rc" != "0"; then
+ echo "configure: $NM failure, using default"
+ break
+ fi
+ if $NM conftest.o | grep gurkmacka >/dev/null; then true; else
+ gmp_cv_check_asm_lsym_prefix="$gmp_tmp_pre"
+ break
+ fi
+ else
+ echo "configure: failed program was:" >&5
+ cat conftest.s >&5
+ # Use default.
+ fi
+done
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_lsym_prefix" 1>&6
+echo "define(<LSYM_PREFIX>, <${gmp_cv_check_asm_lsym_prefix}>)" >> $gmp_tmpconfigm4
+
+echo $ECHO_N "checking how to define a 32-bit word... $ECHO_C" 1>&6
+echo "configure:2359: checking how to [define] a 32-bit word" 1>&5
+if test "${gmp_cv_check_asm_w32+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -z "$NM"; then
+ echo; echo "configure: GMP_CHECK_ASM_W32: fatal: need nm"
+ exit 1
+fi
+
+# FIXME: HPUX puts first symbol at 0x40000000, breaking our assumption
+# that it's at 0x0. We'll have to declare another symbol before the
+# .long/.word and look at the distance between the two symbols. The
+# only problem is that the sed expression(s) barfs (on Solaris, for
+# example) for the symbol with value 0. For now, HPUX uses .word.
+
+case "$target" in
+ *-*-hpux*)
+ gmp_cv_check_asm_w32=".word"
+ ;;
+ *-*-*)
+ ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+ for gmp_tmp_op in .long .word; do
+ cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ $gmp_cv_check_asm_globl foo
+ $gmp_tmp_op 0
+foo${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if { (eval echo configure:2388: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ if test "$gmp_tmp_val" = "4"; then
+ gmp_cv_check_asm_w32="$gmp_tmp_op"
+ break
+ fi
+ fi
+ done
+ ;;
+esac
+
+if test -z "$gmp_cv_check_asm_w32"; then
+ echo; echo "configure: GMP_CHECK_ASM_W32: fatal: do not know how to define a 32-bit word"
+ exit 1
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_w32" 1>&6
+echo "define(<W32>, <$gmp_cv_check_asm_w32>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking if symbols are prefixed by underscore... $ECHO_C" 1>&6
+echo "configure:2412: checking if symbols are prefixed by underscore" 1>&5
+if test "${gmp_cv_check_asm_underscore+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 2417 "configure"
+#include "confdefs.h"
+int underscore_test() {
+return; }
+EOF
+if { (eval echo configure:2422: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ if grep _underscore_test conftest* >/dev/null; then
+ gmp_cv_check_asm_underscore=yes
+ else
+ gmp_cv_check_asm_underscore=no
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_underscore" 1>&6
+if test "$gmp_cv_check_asm_underscore" = "yes"; then
+
+echo 'define(<GSYM_PREFIX>, <_>)' >> $gmp_tmpconfigm4
+
+ underscore=yes
+else
+
+echo 'define(<GSYM_PREFIX>, <>)' >> $gmp_tmpconfigm4
+
+ underscore=no
+fi
+
+echo $ECHO_N "checking if .align assembly directive is logarithmic... $ECHO_C" 1>&6
+echo "configure:2449: checking if .align assembly directive is logarithmic" 1>&5
+if test "${gmp_cv_check_asm_align_log+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -z "$NM"; then
+ echo; echo "configure: GMP_CHECK_ASM_ALIGN_LOG: fatal: need nm"
+ exit 1
+fi
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ .align 4
+ $gmp_cv_check_asm_globl foo
+ .byte 1
+ .align 4
+foo$gmp_cv_check_asm_label_suffix
+ .byte 2
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+if { (eval echo configure:2467: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ if test "$gmp_tmp_val" = "10" || test "$gmp_tmp_val" = "16"; then
+ gmp_cv_check_asm_align_log=yes
+ else
+ gmp_cv_check_asm_align_log=no
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.s >&5
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_align_log" 1>&6
+
+echo "define(<ALIGN_LOGARITHMIC>,<$gmp_cv_check_asm_align_log>)" >> $gmp_tmpconfigm4
+
+if test "$gmp_cv_check_asm_align_log" = "yes"; then
+ asm_align=log
+else
+ asm_align=nolog
+fi
+
+fi
+
+family=generic
+
+case ${target} in
+ arm*-*-*)
+ path="arm"
+ ;;
+ sparcv9*-*-solaris2.[789]* | sparc64*-*-solaris2.[789]* | ultrasparc*-*-solaris2.[789]*)
+ if test -n "$CC64"
+ then path="sparc64"
+ else path="sparc32/v9 sparc32/v8 sparc32"
+ fi
+ ;;
+ sparc64-*-linux*)
+ if test -n "$CC64"
+ then path="sparc64"
+ else path="sparc32/v9 sparc32/v8 sparc32"
+ fi
+ ;;
+ sparcv8*-*-* | microsparc*-*-*)
+ path="sparc32/v8 sparc32"
+ if test x${floating_point} = xno
+ then extra_functions="udiv_nfp"
+ else extra_functions="udiv_fp"
+ fi
+ ;;
+ sparcv9*-*-* | ultrasparc*-*-*)
+ path="sparc32/v9 sparc32/v8 sparc32"
+ extra_functions="udiv_fp"
+ ;;
+ supersparc*-*-*)
+ path="sparc32/v8/supersparc sparc32/v8 sparc32"
+ extra_functions="udiv"
+ ;;
+ sparc*-*-*) path="sparc32"
+ if test x${floating_point} = xno
+ then extra_functions="udiv_nfp"
+ else extra_functions="udiv_fp"
+ fi
+ ;;
+ hppa7000*-*-*)
+ path="hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd"
+ ;;
+ hppa1.0*-*-*)
+ path="hppa"
+ extra_functions="udiv_qrnnd"
+ ;;
+ hppa2.0w-*-*)
+ path="pa64w"
+ extra_functions="umul_ppmm udiv_qrnnd"
+ ;;
+ hppa2.0*-*-*)
+ if test -n "$CC64"; then
+ path="pa64"
+ extra_functions="umul_ppmm udiv_qrnnd"
+ # We need to use the system compiler, or actually the system assembler,
+ # since GAS has not been ported to understand the 2.0 instructions.
+ CCAS="$CC64 -c"
+ else
+ # FIXME: path should be "hppa/hppa2_0 hppa/hppa1_1 hppa"
+ path="hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd"
+ fi
+ ;;
+ hppa*-*-*) #assume pa7100
+ path="hppa/hppa1_1/pa7100 hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd";;
+ f30[01]-fujitsu-sysv*)
+ path=fujitsu;;
+ alphaev6*-*-*) path="alpha/ev6 alpha"; extra_functions="invert_limb cntlz";;
+ alphaev5*-*-*) path="alpha/ev5 alpha"; extra_functions="invert_limb cntlz";;
+ alpha*-*-*) path="alpha"; extra_functions="invert_limb cntlz";;
+ # Cray vector machines. This must come after alpha* so that we can
+ # recognize present and future vector processors with a wildcard.
+ *-cray-unicos*)
+ path="cray"
+ extra_functions="mulww";;
+ am29000*-*-*) path="a29k";;
+ a29k*-*-*) path="a29k";;
+
+ # AMD and Intel x86 configurations
+
+ i?86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*)
+ gmp_m4postinc="x86/x86-defs.m4"
+ extra_functions="udiv umul"
+ CALLING_CONVENTIONS_OBJS="x86call.o x86check.o"
+
+echo $ECHO_N "checking if the assembler takes cl with shldl... $ECHO_C" 1>&6
+echo "configure:2583: checking if the assembler takes cl with shldl" 1>&5
+if test "${gmp_cv_check_asm_shldl_cl+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ shldl %cl, %eax, %ebx
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+if { (eval echo configure:2592: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+ gmp_cv_check_asm_shldl_cl=yes
+else
+ gmp_cv_check_asm_shldl_cl=no
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_shldl_cl" 1>&6
+if test "$gmp_cv_check_asm_shldl_cl" = "yes"; then
+
+echo 'define(<WANT_SHLDL_CL>, <1>)' >> $gmp_tmpconfigm4
+
+else
+
+echo 'define(<WANT_SHLDL_CL>, <0>)' >> $gmp_tmpconfigm4
+
+fi
+
+ echo $ECHO_N "checking if the .align directive accepts an 0x90 fill in .text... $ECHO_C" 1>&6
+echo "configure:2612: checking if the .align directive accepts an 0x90 fill in .text" 1>&5
+if test "${gmp_cv_check_asm_align_fill_0x90+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ .align 4, 0x90
+ .byte 0
+ .align 4, 0x90
+EOF
+gmp_tmp_val="`$CCAS $CFLAGS conftest.s 2>&1`"
+if test $? = 0; then
+ echo "$gmp_tmp_val" 1>&5
+ if echo "$gmp_tmp_val" | grep "Warning: Fill parameter ignored for executable section"; then
+ echo "Supressing this warning by omitting 0x90" 1>&5
+ gmp_cv_check_asm_align_fill_0x90=no
+ else
+ gmp_cv_check_asm_align_fill_0x90=yes
+ fi
+else
+ echo "Non-zero exit code" 1>&5
+ echo "$gmp_tmp_val" 1>&5
+ gmp_cv_check_asm_align_fill_0x90=no
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_align_fill_0x90" 1>&6
+
+echo "define(<ALIGN_FILL_0x90>,<$gmp_cv_check_asm_align_fill_0x90>)" >> $gmp_tmpconfigm4
+
+ # the CPUs below wanting to know about mmx
+ case ${target} in
+ pentiummmx-*-* | pentium[23]-*-* | k6*-*-* | athlon-*-*)
+
+echo $ECHO_N "checking if the assembler knows about MMX instructions... $ECHO_C" 1>&6
+echo "configure:2649: checking if the assembler knows about MMX instructions" 1>&5
+if test "${gmp_cv_check_asm_mmx+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ por %mm0, %mm0
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+if { (eval echo configure:2658: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+ gmp_cv_check_asm_mmx=yes
+else
+ gmp_cv_check_asm_mmx=no
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_mmx" 1>&6
+if test "$gmp_cv_check_asm_mmx" = "yes"; then
+ tmp_mmx=yes
+else
+ echo "configure: warning: +----------------------------------------------------------" 1>&2
+ echo "configure: warning: | WARNING WARNING WARNING" 1>&2
+ echo "configure: warning: | Target CPU has MMX code, but it can't be assembled by" 1>&2
+ echo "configure: warning: | $CCAS $CFLAGS" 1>&2
+ echo "configure: warning: | Non-MMX replacements will be used." 1>&2
+ echo "configure: warning: | This will be an inferior build." 1>&2
+ echo "configure: warning: +----------------------------------------------------------" 1>&2
+ tmp_mmx=no
+fi
+
+ ;;
+ esac
+
+ # default for anything not otherwise mentioned
+ path="x86"
+
+ case ${target} in
+ i[34]86*-*-*)
+ path="x86"
+ ;;
+ k5*-*-*)
+ # don't know what best suits k5
+ path="x86"
+ ;;
+ i586*-*-* | pentium-*-*)
+ path="x86/pentium x86"
+ ;;
+ pentiummmx-*-*)
+ path="x86/pentium x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/pentium/mmx $path"
+ fi
+ ;;
+ i686*-*-* | pentiumpro-*-*)
+ path="x86/p6 x86"
+ ;;
+ pentium2-*-*)
+ path="x86/p6 x86"
+ # The pentium/mmx lshift and rshift are good on p6 and can be used
+ # until there's something specific for p6.
+ if test "$tmp_mmx" = yes; then
+ path="x86/p6/mmx x86/pentium/mmx $path"
+ fi
+ ;;
+ pentium3-*-*)
+ path="x86/p6 x86"
+ # The pentium/mmx lshift and rshift are good on p6 and can be used
+ # until there's something specific for p6.
+ if test "$tmp_mmx" = yes; then
+ path="x86/p6/p3mmx x86/p6/mmx x86/pentium/mmx $path"
+ fi
+ ;;
+ k6[23]*-*-*)
+ path="x86/k6 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k6/k62mmx x86/k6/mmx $path"
+ fi
+ ;;
+ k6*-*-*)
+ path="x86/k6 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k6/mmx $path"
+ fi
+ ;;
+ athlon-*-*)
+ path="x86/k7 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k7/mmx $path"
+ fi
+ ;;
+ esac
+ ;;
+
+ i960*-*-*) path="i960";;
+
+ ia64*-*-*) path="ia64";;
+
+# Motorola 68k configurations. Let m68k mean 68020-68040.
+ m680[234]0*-*-* | m68k*-*-* | \
+ m68*-next-nextstep*) # Nexts are at least '020
+ path="m68k/mc68020 m68k"
+ family=m68k
+ ;;
+ m68000*-*-*)
+ path="m68k"
+ family=m68k
+ ;;
+
+ m88k*-*-* | m88k*-*-*) path="m88k";;
+ m88110*-*-*) path="m88k/mc88110 m88k";;
+ ns32k*-*-*) path="ns32k";;
+
+ pyramid-*-*) path="pyr";;
+
+ ppc601-*-*) path="power powerpc32";;
+ powerpc64*-*-*) path="powerpc64";;
+ powerpc*-*-*) path="powerpc32";;
+ rs6000-*-* | power-*-* | power2-*-*)
+ path="power"
+ extra_functions="udiv_w_sdiv"
+ ;;
+
+ sh-*-*) path="sh";;
+ sh2-*-*) path="sh/sh2 sh";;
+
+ mips[34]*-*-*) path="mips3";;
+ mips*-*-irix6*) path="mips3";;
+ mips*-*-*) path="mips2";;
+
+ vax*-*-*) path="vax"; extra_functions="udiv_w_sdiv";;
+
+ z8000x*-*-*) path="z8000x"; extra_functions="udiv_w_sdiv";;
+ z8000*-*-*) path="z8000"; extra_functions="udiv_w_sdiv";;
+
+ clipper*-*-*) path="clipper";;
+esac
+
+if test -n "$CALLING_CONVENTIONS_OBJS"; then
+ cat >>confdefs.h <<\EOF
+#define HAVE_CALLING_CONVENTIONS 1
+EOF
+
+fi
+
+case ${target} in
+ i[5-8]86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*)
+ # rdtsc is in pentium and up, not in i386 and i486
+ SPEED_CYCLECOUNTER_OBJS=pentium.lo
+ ;;
+ alpha*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=alpha.lo
+ ;;
+ sparcv9*-*-* | ultrasparc*-*-* | sparc64*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=sparcv9.lo
+ ;;
+ hppa2*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=hppa2.lo
+ ;;
+ hppa*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=hppa.lo
+ ;;
+esac
+
+if test -n "$SPEED_CYCLECOUNTER_OBJS"
+then
+ cat >>confdefs.h <<\EOF
+#define HAVE_SPEED_CYCLECOUNTER 1
+EOF
+
+fi
+
+echo $ECHO_N "checking for Cygwin environment... $ECHO_C" 1>&6
+echo "configure:2822: checking for Cygwin environment" 1>&5
+if test "${ac_cv_cygwin+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 2827 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+#ifndef __CYGWIN__
+# define __CYGWIN__ __CYGWIN32__
+#endif
+return __CYGWIN__;
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:2841: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_cygwin=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_cygwin=no
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""$ac_cv_cygwin" 1>&6
+CYGWIN=
+test "$ac_cv_cygwin" = yes && CYGWIN=yes
+echo $ECHO_N "checking for mingw32 environment... $ECHO_C" 1>&6
+echo "configure:2856: checking for mingw32 environment" 1>&5
+if test "${ac_cv_mingw32+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 2861 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+return __MINGW32__;
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:2872: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_mingw32=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_mingw32=no
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""$ac_cv_mingw32" 1>&6
+MINGW32=
+test "$ac_cv_mingw32" = yes && MINGW32=yes
+echo $ECHO_N "checking for EMX OS/2 environment... $ECHO_C" 1>&6
+echo "configure:2887: checking for EMX OS/2 environment" 1>&5
+if test "${ac_cv_emxos2+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 2892 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+return __EMX__;
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:2903: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_emxos2=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_emxos2=no
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""$ac_cv_emxos2" 1>&6
+EMXOS2=
+test "$ac_cv_emxos2" = yes && EMXOS2=yes
+
+echo $ECHO_N "checking for executable suffix... $ECHO_C" 1>&6
+echo "configure:2919: checking for executable suffix" 1>&5
+if test "${ac_cv_exeext+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test "$CYGWIN" = yes || test "$MINGW32" = yes || test "$EMXOS2" = yes; then
+ ac_cv_exeext=.exe
+else
+ rm -f conftest*
+ echo 'int main () { return 0; }' >conftest.$ac_ext
+ ac_cv_exeext=
+ if { (eval echo configure:2929: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; }; then
+ for ac_file in conftest.*; do
+ case $ac_file in
+ *.c | *.C | *.o | *.obj | *.xcoff) ;;
+ *) ac_cv_exeext=`echo $ac_file | sed -e s/conftest//` ;;
+ esac
+ done
+ else
+ { echo "configure: error: installation or configuration problem: compiler cannot create executables." 1>&2; exit 1; }
+ fi
+ rm -f conftest*
+ test x"${ac_cv_exeext}" = x && ac_cv_exeext=no
+fi
+fi
+
+EXEEXT=""
+test x"${ac_cv_exeext}" != xno && EXEEXT=${ac_cv_exeext}
+echo "$ECHO_T""${ac_cv_exeext}" 1>&6
+ac_exeext=$EXEEXT
+
+echo $ECHO_N "checking for object suffix... $ECHO_C" 1>&6
+echo "configure:2950: checking for object suffix" 1>&5
+if test "${ac_cv_objext+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ rm -f conftest*
+echo 'int i = 1;' >conftest.$ac_ext
+if { (eval echo configure:2956: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ for ac_file in conftest.*; do
+ case $ac_file in
+ *.c) ;;
+ *) ac_cv_objext=`echo $ac_file | sed -e s/conftest.//` ;;
+ esac
+ done
+else
+ { echo "configure: error: installation or configuration problem; compiler does not work" 1>&2; exit 1; }
+fi
+rm -f conftest*
+fi
+
+echo "$ECHO_T""$ac_cv_objext" 1>&6
+OBJEXT=$ac_cv_objext
+ac_objext=$ac_cv_objext
+
+case "$target" in
+ *-*-aix4.[3-9]*) enable_shared=no ;;
+esac
+# Check whether --enable-shared or --disable-shared was given.
+if test "${enable_shared+set}" = set; then
+ enableval="$enable_shared"
+ p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)
+ enable_shared=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_shared=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac
+else
+ enable_shared=yes
+fi
+# Check whether --enable-static or --disable-static was given.
+if test "${enable_static+set}" = set; then
+ enableval="$enable_static"
+ p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)
+ enable_static=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_static=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac
+else
+ enable_static=yes
+fi
+# Check whether --enable-fast-install or --disable-fast-install was given.
+if test "${enable_fast_install+set}" = set; then
+ enableval="$enable_fast_install"
+ p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_fast_install=yes ;;
+no) enable_fast_install=no ;;
+*)
+ enable_fast_install=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_fast_install=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac
+else
+ enable_fast_install=yes
+fi
+
+echo $ECHO_N "checking build system type... $ECHO_C" 1>&6
+echo "configure:3044: checking build system type" 1>&5
+if test "x$ac_cv_build" = "x" || (test "x$build" != "xNONE" && test "x$build" != "x$ac_cv_build_alias"); then
+
+ # Make sure we can run config.sub.
+ if $ac_config_sub sun4 >/dev/null 2>&1; then :; else
+ { echo "configure: error: cannot run $ac_config_sub" 1>&2; exit 1; }
+ fi
+
+ ac_cv_build_alias=$build
+ case "$ac_cv_build_alias" in
+ NONE)
+ case $nonopt in
+ NONE)
+ ac_cv_build_alias=$host_alias ;;
+ *) ac_cv_build_alias=$nonopt ;;
+ esac ;;
+ esac
+
+ ac_cv_build=`$ac_config_sub $ac_cv_build_alias` || exit 1
+ ac_cv_build_cpu=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+ ac_cv_build_vendor=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+ ac_cv_build_os=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+else
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+fi
+
+echo "$ECHO_T""$ac_cv_build" 1>&6
+
+build=$ac_cv_build
+build_alias=$ac_cv_build_alias
+build_cpu=$ac_cv_build_cpu
+build_vendor=$ac_cv_build_vendor
+build_os=$ac_cv_build_os
+
+# Check whether --with-gnu-ld or --without-gnu-ld was given.
+if test "${with_gnu_ld+set}" = set; then
+ withval="$with_gnu_ld"
+ test "$withval" = no || with_gnu_ld=yes
+else
+ with_gnu_ld=no
+fi
+
+ac_prog=ld
+if test "$ac_cv_prog_gcc" = yes; then
+ # Check if gcc -print-prog-name=ld gives a path.
+ echo $ECHO_N "checking for ld used by GCC... $ECHO_C" 1>&6
+echo "configure:3090: checking for ld used by GCC" 1>&5
+ case $target in
+ *-*-mingw*)
+ # gcc leaves a trailing carriage return which upsets mingw
+ ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+ *)
+ ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+ esac
+ case "$ac_prog" in
+ # Accept absolute paths.
+ [\\/]* | [A-Za-z]:[\\/]*)
+ re_direlt='/[^/][^/]*/\.\./'
+ # Canonicalize the path of ld
+ ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+ while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+ ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+ done
+ test -z "$LD" && LD="$ac_prog"
+ ;;
+ "")
+ # If it fails, then pretend we aren't using GCC.
+ ac_prog=ld
+ ;;
+ *)
+ # If it is relative, then search for the first ld in PATH.
+ with_gnu_ld=unknown
+ ;;
+ esac
+elif test "$with_gnu_ld" = yes; then
+ echo $ECHO_N "checking for GNU ld... $ECHO_C" 1>&6
+echo "configure:3120: checking for GNU ld" 1>&5
+else
+ echo $ECHO_N "checking for non-GNU ld... $ECHO_C" 1>&6
+echo "configure:3123: checking for non-GNU ld" 1>&5
+fi
+if test "${ac_cv_path_LD+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -z "$LD"; then
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+ ac_cv_path_LD="$ac_dir/$ac_prog"
+ # Check to see if the program is GNU ld. I'd rather use --version,
+ # but apparently some GNU ld's only accept -v.
+ # Break only if it was the GNU/non-GNU ld that we prefer.
+ if "$ac_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+ test "$with_gnu_ld" != no && break
+ else
+ test "$with_gnu_ld" != yes && break
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+else
+ ac_cv_path_LD="$LD" # Let the user override the test with a path.
+fi
+fi
+
+LD="$ac_cv_path_LD"
+if test -n "$LD"; then
+ echo "$ECHO_T""$LD" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+test -z "$LD" && { echo "configure: error: no acceptable ld found in \$PATH" 1>&2; exit 1; }
+echo $ECHO_N "checking if the linker ($LD) is GNU ld... $ECHO_C" 1>&6
+echo "configure:3158: checking if the linker ($LD) is GNU ld" 1>&5
+if test "${ac_cv_prog_gnu_ld+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ # I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+ ac_cv_prog_gnu_ld=yes
+else
+ ac_cv_prog_gnu_ld=no
+fi
+fi
+echo "$ECHO_T""$ac_cv_prog_gnu_ld" 1>&6
+with_gnu_ld=$ac_cv_prog_gnu_ld
+
+echo $ECHO_N "checking for $LD option to reload object files... $ECHO_C" 1>&6
+echo "configure:3173: checking for $LD option to reload object files" 1>&5
+if test "${lt_cv_ld_reload_flag+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ lt_cv_ld_reload_flag='-r'
+fi
+echo "$ECHO_T""$lt_cv_ld_reload_flag" 1>&6
+reload_flag=$lt_cv_ld_reload_flag
+test -n "$reload_flag" && reload_flag=" $reload_flag"
+
+echo $ECHO_N "checking how to recognise dependant libraries... $ECHO_C" 1>&6
+echo "configure:3184: checking how to recognise dependant libraries" 1>&5
+if test "${lt_cv_deplibs_check_method+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ lt_cv_file_magic_cmd='${MAGIC}'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [regex]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case "$host_os" in
+aix4* | beos*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+bsdi4*)
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ ;;
+
+cygwin* | mingw*)
+ lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+ lt_cv_file_magic_cmd='${OBJDUMP} -f'
+ ;;
+
+freebsd*)
+ case "$version_type" in
+ freebsd-elf*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+ esac
+ ;;
+
+gnu*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+irix5* | irix6*)
+ case "$host_os" in
+ irix5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+ ;;
+ *)
+ case "$LD" in
+ *-32|*"-32 ") libmagic=32-bit;;
+ *-n32|*"-n32 ") libmagic=N32;;
+ *-64|*"-64 ") libmagic=64-bit;;
+ *) libmagic=never-match;;
+ esac
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method="file_magic ELF ${libmagic} MSB mips-[1234] dynamic lib MIPS - version 1"
+ ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+ case "$host_cpu" in
+ alpha* | i*86 | powerpc* | sparc* )
+ lt_cv_deplibs_check_method=pass_all ;;
+ *)
+ # glibc up to 2.1.1 does not perform some relocations on ARM
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )' ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+ ;;
+
+osf3* | osf4* | osf5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+sco3.2v5*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+solaris*)
+ lt_cv_deplibs_check_method=pass_all
+ lt_cv_file_magic_test_file=/lib/libc.so
+ ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ case "$host_vendor" in
+ ncr)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+ motorola)
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]'
+ lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+ ;;
+ esac
+ ;;
+esac
+
+fi
+echo "$ECHO_T""$lt_cv_deplibs_check_method" 1>&6
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+
+if test $host != $build; then
+ ac_tool_prefix=${host_alias}-
+else
+ ac_tool_prefix=
+fi
+
+# Only perform the check for file, if the check method requires it
+case "$deplibs_check_method" in
+file_magic*)
+ if test "$file_magic_cmd" = '${MAGIC}'; then
+
+echo $ECHO_N "checking for ${ac_tool_prefix}file... $ECHO_C" 1>&6
+echo "configure:3308: checking for ${ac_tool_prefix}file" 1>&5
+if test "${lt_cv_path_MAGIC+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$MAGIC" in
+ /*)
+ lt_cv_path_MAGIC="$MAGIC" # Let the user override the test with a path.
+ ;;
+ ?:/*)
+ ac_cv_path_MAGIC="$MAGIC" # Let the user override the test with a dos path.
+ ;;
+ *)
+ ac_save_MAGIC="$MAGIC"
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="/usr/bin:$PATH"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/${ac_tool_prefix}file; then
+ lt_cv_path_MAGIC="$ac_dir/${ac_tool_prefix}file"
+ if test -n "$file_magic_test_file"; then
+ case "$deplibs_check_method" in
+ "file_magic "*)
+ file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+ MAGIC="$lt_cv_path_MAGIC"
+ if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+ egrep "$file_magic_regex" > /dev/null; then
+ :
+ else
+ cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such. This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem. Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+ fi ;;
+ esac
+ fi
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ MAGIC="$ac_save_MAGIC"
+ ;;
+esac
+fi
+
+MAGIC="$lt_cv_path_MAGIC"
+if test -n "$MAGIC"; then
+ echo "$ECHO_T""$MAGIC" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+if test -z "$lt_cv_path_MAGIC"; then
+ if test -n "$ac_tool_prefix"; then
+ echo $ECHO_N "checking for file... $ECHO_C" 1>&6
+echo "configure:3370: checking for file" 1>&5
+if test "${lt_cv_path_MAGIC+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$MAGIC" in
+ /*)
+ lt_cv_path_MAGIC="$MAGIC" # Let the user override the test with a path.
+ ;;
+ ?:/*)
+ ac_cv_path_MAGIC="$MAGIC" # Let the user override the test with a dos path.
+ ;;
+ *)
+ ac_save_MAGIC="$MAGIC"
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="/usr/bin:$PATH"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/file; then
+ lt_cv_path_MAGIC="$ac_dir/file"
+ if test -n "$file_magic_test_file"; then
+ case "$deplibs_check_method" in
+ "file_magic "*)
+ file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+ MAGIC="$lt_cv_path_MAGIC"
+ if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+ egrep "$file_magic_regex" > /dev/null; then
+ :
+ else
+ cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such. This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem. Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+ fi ;;
+ esac
+ fi
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ MAGIC="$ac_save_MAGIC"
+ ;;
+esac
+fi
+
+MAGIC="$lt_cv_path_MAGIC"
+if test -n "$MAGIC"; then
+ echo "$ECHO_T""$MAGIC" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+ else
+ MAGIC=:
+ fi
+fi
+
+ fi
+ ;;
+esac
+
+case "$target" in
+NONE) lt_target="$host" ;;
+*) lt_target="$target" ;;
+esac
+
+# Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args.
+set dummy ${ac_tool_prefix}ranlib; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:3446: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_RANLIB+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$RANLIB"; then
+ ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib"
+ break
+ done
+fi
+fi
+RANLIB="$ac_cv_prog_RANLIB"
+if test -n "$RANLIB"; then
+ echo "$ECHO_T""$RANLIB" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+if test -z "$ac_cv_prog_RANLIB"; then
+ if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "ranlib", so it can be a program name with args.
+set dummy ranlib; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:3480: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_RANLIB+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$RANLIB"; then
+ ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_RANLIB="ranlib"
+ break
+ done
+ test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":"
+fi
+fi
+RANLIB="$ac_cv_prog_RANLIB"
+if test -n "$RANLIB"; then
+ echo "$ECHO_T""$RANLIB" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+ else
+ RANLIB=":"
+ fi
+fi
+
+# Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args.
+set dummy ${ac_tool_prefix}strip; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:3518: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_STRIP+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$STRIP"; then
+ ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_STRIP="${ac_tool_prefix}strip"
+ break
+ done
+fi
+fi
+STRIP="$ac_cv_prog_STRIP"
+if test -n "$STRIP"; then
+ echo "$ECHO_T""$STRIP" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+if test -z "$ac_cv_prog_STRIP"; then
+ if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "strip", so it can be a program name with args.
+set dummy strip; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:3552: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_STRIP+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$STRIP"; then
+ ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_STRIP="strip"
+ break
+ done
+ test -z "$ac_cv_prog_STRIP" && ac_cv_prog_STRIP=":"
+fi
+fi
+STRIP="$ac_cv_prog_STRIP"
+if test -n "$STRIP"; then
+ echo "$ECHO_T""$STRIP" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+ else
+ STRIP=":"
+ fi
+fi
+
+# Check for any special flags to pass to ltconfig.
+libtool_flags="--cache-file=$cache_file"
+test "$enable_shared" = no && libtool_flags="$libtool_flags --disable-shared"
+test "$enable_static" = no && libtool_flags="$libtool_flags --disable-static"
+test "$enable_fast_install" = no && libtool_flags="$libtool_flags --disable-fast-install"
+test "$ac_cv_prog_gcc" = yes && libtool_flags="$libtool_flags --with-gcc"
+test "$ac_cv_prog_gnu_ld" = yes && libtool_flags="$libtool_flags --with-gnu-ld"
+
+# Check whether --enable-libtool-lock or --disable-libtool-lock was given.
+if test "${enable_libtool_lock+set}" = set; then
+ enableval="$enable_libtool_lock"
+
+fi
+test "x$enable_libtool_lock" = xno && libtool_flags="$libtool_flags --disable-lock"
+test x"$silent" = xyes && libtool_flags="$libtool_flags --silent"
+
+# Check whether --with-pic or --without-pic was given.
+if test "${with_pic+set}" = set; then
+ withval="$with_pic"
+ pic_mode="$withval"
+else
+ pic_mode=default
+fi
+test x"$pic_mode" = xyes && libtool_flags="$libtool_flags --prefer-pic"
+test x"$pic_mode" = xno && libtool_flags="$libtool_flags --prefer-non-pic"
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case "$lt_target" in
+*-*-irix6*)
+ # Find out which ABI we are using.
+ echo '#line 3618 "configure"' > conftest.$ac_ext
+ if { (eval echo configure:3619: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ case "`/usr/bin/file conftest.o`" in
+ *32-bit*)
+ LD="${LD-ld} -32"
+ ;;
+ *N32*)
+ LD="${LD-ld} -n32"
+ ;;
+ *64-bit*)
+ LD="${LD-ld} -64"
+ ;;
+ esac
+ fi
+ rm -rf conftest*
+ ;;
+
+*-*-sco3.2v5*)
+ # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+ SAVE_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -belf"
+ echo $ECHO_N "checking whether the C compiler needs -belf... $ECHO_C" 1>&6
+echo "configure:3640: checking whether the C compiler needs -belf" 1>&5
+if test "${lt_cv_cc_needs_belf+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+ cat >conftest.$ac_ext <<EOF
+#line 3653 "configure"
+#include "confdefs.h"
+
+int
+main()
+{
+
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:3664: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ lt_cv_cc_needs_belf=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ lt_cv_cc_needs_belf=no
+fi
+rm -f conftest*
+
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+fi
+echo "$ECHO_T""$lt_cv_cc_needs_belf" 1>&6
+ if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+ # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+ CFLAGS="$SAVE_CFLAGS"
+ fi
+ ;;
+
+esac
+
+# Save cache, so that ltconfig can load it
+cat >confcache <<\EOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs. It is not useful on other systems.
+# If it contains results you don't want to keep, you may remove or edit it.
+#
+# By default, configure uses ./config.cache as the cache file,
+# creating it if it does not exist already. You can give configure
+# the --cache-file=FILE option to use a different cache file; that is
+# what configure does when it calls configure scripts in
+# subdirectories, so they share the cache.
+# Giving --cache-file=/dev/null disables caching, for debugging configure.
+# config.status only pays attention to the cache file if you give it the
+# --recheck option to rerun configure.
+#
+EOF
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(set) 2>&1 |
+ case `(ac_space=' '; set | grep ac_space) 2>&1` in
+ *ac_space=\ *)
+ # `set' does not quote correctly, so add quotes (double-quote substitution
+ # turns \\\\ into \\, and sed turns \\ into \).
+ sed -n \
+ -e "s/'/'\\\\''/g" \
+ -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
+ ;;
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
+ ;;
+ esac >>confcache
+if cmp -s $cache_file confcache; then :; else
+ if test -w $cache_file; then
+ echo "updating cache $cache_file"
+ cat confcache >$cache_file
+ else
+ echo "not updating unwritable cache $cache_file"
+ fi
+fi
+rm -f confcache
+
+# Actually configure libtool. ac_aux_dir is where install-sh is found.
+AR="$AR" CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC="$MAGIC" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig --no-reexec \
+$libtool_flags --no-verify --build="$build" $ac_aux_dir/ltmain.sh $lt_target \
+|| { echo "configure: error: libtool configure failed" 1>&2; exit 1; }
+
+# Reload cache, that may have been modified by ltconfig
+if test -r "$cache_file"; then
+ echo "loading cache $cache_file"
+ test -f "$cache_file" && . $cache_file
+else
+ echo "creating cache $cache_file"
+ >$cache_file
+fi
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltconfig $ac_aux_dir/ltmain.sh"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+
+echo $ECHO_N "checking whether optarg is declared... $ECHO_C" 1>&6
+echo "configure:3769: checking whether optarg is declared" 1>&5
+if test "${ac_cv_have_decl_optarg+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 3774 "configure"
+#include "confdefs.h"
+$ac_includes_default
+int
+main ()
+{
+#ifndef optarg
+ char *p = (char *) optarg;
+#endif
+
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:3788: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_have_decl_optarg=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_have_decl_optarg=no
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""$ac_cv_have_decl_optarg" 1>&6
+if test $ac_cv_have_decl_optarg = yes; then
+ cat >>confdefs.h <<EOF
+#define HAVE_DECL_OPTARG 1
+EOF
+
+else
+ cat >>confdefs.h <<EOF
+#define HAVE_DECL_OPTARG 0
+EOF
+
+fi
+
+echo $ECHO_N "checking for ANSI C header files... $ECHO_C" 1>&6
+echo "configure:3813: checking for ANSI C header files" 1>&5
+if test "${ac_cv_header_stdc+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat >conftest.$ac_ext <<EOF
+#line 3819 "configure"
+#include "confdefs.h"
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <float.h>
+
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:3828: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ ac_cv_header_stdc=yes
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+if test $ac_cv_header_stdc = yes; then
+ # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+
+cat >conftest.$ac_ext <<EOF
+#line 3846 "configure"
+#include "confdefs.h"
+#include <string.h>
+
+EOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ egrep "memchr" >/dev/null 2>&1; then
+ :
+else
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+
+cat >conftest.$ac_ext <<EOF
+#line 3866 "configure"
+#include "confdefs.h"
+#include <stdlib.h>
+
+EOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ egrep "free" >/dev/null 2>&1; then
+ :
+else
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+if test "$cross_compiling" = yes; then
+ :
+else
+ cat >conftest.$ac_ext <<EOF
+#line 3888 "configure"
+#include "confdefs.h"
+#include <ctype.h>
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) (('a' <= (c) && (c) <= 'i') \
+ || ('j' <= (c) && (c) <= 'r') \
+ || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+ int i;
+ for (i = 0; i < 256; i++)
+ if (XOR (islower (i), ISLOWER (i))
+ || toupper (i) != TOUPPER (i))
+ exit(2);
+ exit (0);
+}
+EOF
+if { (eval echo configure:3913: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
+then
+ :
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -fr conftest*
+ ac_cv_header_stdc=no
+fi
+rm -fr conftest*
+
+fi
+
+fi
+fi
+echo "$ECHO_T""$ac_cv_header_stdc" 1>&6
+if test $ac_cv_header_stdc = yes; then
+ cat >>confdefs.h <<\EOF
+#define STDC_HEADERS 1
+EOF
+
+fi
+
+for ac_header in getopt.h unistd.h sys/sysctl.h sys/time.h
+do
+ac_ac_Header=`echo "ac_cv_header_$ac_header" | $ac_tr_sh`
+echo $ECHO_N "checking for $ac_header... $ECHO_C" 1>&6
+echo "configure:3940: checking for $ac_header" 1>&5
+if eval "test \"\${$ac_ac_Header+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat >conftest.$ac_ext <<EOF
+#line 3946 "configure"
+#include "confdefs.h"
+#include <$ac_header>
+
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:3952: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ eval "$ac_ac_Header=yes"
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ eval "$ac_ac_Header=no"
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""`eval echo '${'$ac_ac_Header'}'`" 1>&6
+if test `eval echo '${'$ac_ac_Header'}'` = yes; then
+ cat >>confdefs.h <<EOF
+#define `echo "HAVE_$ac_header" | $ac_tr_cpp` 1
+EOF
+
+fi
+done
+
+echo $ECHO_N "checking for void... $ECHO_C" 1>&6
+echo "configure:3976: checking for void" 1>&5
+if test "${ac_cv_type_void+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 3981 "configure"
+#include "confdefs.h"
+$ac_includes_default
+int
+main ()
+{
+if ((void *) 0)
+ return 0;
+if (sizeof (void))
+ return 0;
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:3995: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_type_void=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_type_void=no
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""$ac_cv_type_void" 1>&6
+if test $ac_cv_type_void = yes; then
+ cat >>confdefs.h <<EOF
+#define HAVE_VOID 1
+EOF
+
+fi
+
+echo $ECHO_N "checking for preprocessor stringizing operator... $ECHO_C" 1>&6
+echo "configure:4015: checking for preprocessor stringizing operator" 1>&5
+if test "${ac_cv_c_stringize+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat >conftest.$ac_ext <<EOF
+#line 4021 "configure"
+#include "confdefs.h"
+
+#define x(y) #y
+
+char *s = x(teststring);
+
+EOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ egrep "#teststring" >/dev/null 2>&1; then
+ rm -rf conftest*
+ ac_cv_c_stringize=no
+else
+ rm -rf conftest*
+ ac_cv_c_stringize=yes
+fi
+rm -f conftest*
+
+fi
+
+if test "${ac_cv_c_stringize}" = yes; then
+ cat >>confdefs.h <<\EOF
+#define HAVE_STRINGIZE 1
+EOF
+
+fi
+echo "$ECHO_T""${ac_cv_c_stringize}" 1>&6
+
+for ac_func in getopt_long getpagesize popen processor_info strtoul sysconf sysctlbyname
+do
+ac_ac_var=`echo "ac_cv_func_$ac_func" | $ac_tr_sh`
+echo $ECHO_N "checking for $ac_func... $ECHO_C" 1>&6
+echo "configure:4053: checking for $ac_func" 1>&5
+if eval "test \"\${$ac_ac_var+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 4058 "configure"
+#include "confdefs.h"
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char $ac_func(); below. */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+char $ac_func();
+char (*f)();
+
+int
+main()
+{
+
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+#if defined (__stub_$ac_func) || defined (__stub___$ac_func)
+choke me
+#else
+f = $ac_func;
+#endif
+
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:4086: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ eval "$ac_ac_var=yes"
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ eval "$ac_ac_var=no"
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""`eval echo '${'$ac_ac_var'}'`" 1>&6
+if test `eval echo '${'$ac_ac_var'}'` = yes; then
+ cat >>confdefs.h <<EOF
+#define `echo "HAVE_$ac_func" | $ac_tr_cpp` 1
+EOF
+
+fi
+done
+
+echo $ECHO_N "checking if ansi2knr should be used... $ECHO_C" 1>&6
+echo "configure:4108: checking if ansi2knr should be used" 1>&5
+if test "${gmp_cv_c_ansi2knr+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.c <<EOF
+int main (int argc, char *argv) { return 0; }
+EOF
+if { (eval echo configure:4115: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ gmp_cv_c_ansi2knr=no
+else
+ gmp_cv_c_ansi2knr=yes
+fi
+rm -f conftest.*
+
+fi
+echo "$ECHO_T""$gmp_cv_c_ansi2knr" 1>&6
+if test $gmp_cv_c_ansi2knr = no; then
+ U= ANSI2KNR=
+else
+ U=_ ANSI2KNR=./ansi2knr
+ # Ensure some checks needed by ansi2knr itself.
+
+echo $ECHO_N "checking for ANSI C header files... $ECHO_C" 1>&6
+echo "configure:4131: checking for ANSI C header files" 1>&5
+if test "${ac_cv_header_stdc+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat >conftest.$ac_ext <<EOF
+#line 4137 "configure"
+#include "confdefs.h"
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <float.h>
+
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:4146: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ ac_cv_header_stdc=yes
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+if test $ac_cv_header_stdc = yes; then
+ # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+
+cat >conftest.$ac_ext <<EOF
+#line 4164 "configure"
+#include "confdefs.h"
+#include <string.h>
+
+EOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ egrep "memchr" >/dev/null 2>&1; then
+ :
+else
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+
+cat >conftest.$ac_ext <<EOF
+#line 4184 "configure"
+#include "confdefs.h"
+#include <stdlib.h>
+
+EOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ egrep "free" >/dev/null 2>&1; then
+ :
+else
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+if test "$cross_compiling" = yes; then
+ :
+else
+ cat >conftest.$ac_ext <<EOF
+#line 4206 "configure"
+#include "confdefs.h"
+#include <ctype.h>
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) (('a' <= (c) && (c) <= 'i') \
+ || ('j' <= (c) && (c) <= 'r') \
+ || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+ int i;
+ for (i = 0; i < 256; i++)
+ if (XOR (islower (i), ISLOWER (i))
+ || toupper (i) != TOUPPER (i))
+ exit(2);
+ exit (0);
+}
+EOF
+if { (eval echo configure:4231: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
+then
+ :
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -fr conftest*
+ ac_cv_header_stdc=no
+fi
+rm -fr conftest*
+
+fi
+
+fi
+fi
+echo "$ECHO_T""$ac_cv_header_stdc" 1>&6
+if test $ac_cv_header_stdc = yes; then
+ cat >>confdefs.h <<\EOF
+#define STDC_HEADERS 1
+EOF
+
+fi
+
+ for ac_header in string.h
+do
+ac_ac_Header=`echo "ac_cv_header_$ac_header" | $ac_tr_sh`
+echo $ECHO_N "checking for $ac_header... $ECHO_C" 1>&6
+echo "configure:4258: checking for $ac_header" 1>&5
+if eval "test \"\${$ac_ac_Header+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat >conftest.$ac_ext <<EOF
+#line 4264 "configure"
+#include "confdefs.h"
+#include <$ac_header>
+
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:4270: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ eval "$ac_ac_Header=yes"
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ eval "$ac_ac_Header=no"
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""`eval echo '${'$ac_ac_Header'}'`" 1>&6
+if test `eval echo '${'$ac_ac_Header'}'` = yes; then
+ cat >>confdefs.h <<EOF
+#define `echo "HAVE_$ac_header" | $ac_tr_cpp` 1
+EOF
+
+fi
+done
+
+fi
+
+syntax=
+# For now, we use the old switch for setting syntax.
+# FIXME: Remove when conversion to .asm is completed.
+case "${target}" in
+ m680[234]0*-*-linuxaout* | m68k*-*-linuxaout* | \
+ m68k-next-nextstep* | \
+ m68000*-*-*)
+ syntax=mit
+ ;;
+ m680[234]0*-*-linux* | m68k*-*-linux*)
+ syntax=elf
+ ;;
+ m680[234]0*-*-* | m68k*-*-*)
+ syntax=mit
+ ;;
+esac
+
+# Now build an asm-syntax.h file for targets that include that from the
+# assembly files.
+# FIXME: Remove when conversion to .asm is completed.
+case "${family}-${underscore}-${asm_align}-${syntax}" in
+ m68k-yes-log-mit)
+ echo '#define MIT_SYNTAX' >asm-syntax.h
+ cat $srcdir/mpn/underscore.h >>asm-syntax.h
+ echo '#include "'$srcdir'/mpn/m68k/syntax.h"' >>asm-syntax.h;;
+ m68k-no-nolog-elf)
+ echo '#define ELF_SYNTAX' >asm-syntax.h
+ echo '#define C_SYMBOL_NAME(name) name' >>asm-syntax.h
+ echo '#include "'$srcdir'/mpn/m68k/syntax.h"' >>asm-syntax.h;;
+esac
+
+# The pattern here tests for an absolute path the same way as
+# _AC_OUTPUT_FILES in autoconf acgeneral.m4.
+
+echo "dnl CONFIG_TOP_SRCDIR is a path from the mpn builddir to the top srcdir" >> $gmp_tmpconfigm4
+
+case "$srcdir" in
+[\\/]* | ?:[\\/]* )
+
+echo "define(<CONFIG_TOP_SRCDIR>,<\`$srcdir'>)" >> $gmp_tmpconfigm4
+ ;;
+*)
+
+echo "define(<CONFIG_TOP_SRCDIR>,<\`../$srcdir'>)" >> $gmp_tmpconfigm4
+ ;;
+esac
+
+echo "include(CONFIG_TOP_SRCDIR\`/mpn/asm-defs.m4')" >> $gmp_tmpconfigm4p
+
+# Must be after asm-defs.m4
+
+echo "define_not_for_expansion(\`HAVE_TARGET_CPU_$target_cpu')" >> $gmp_tmpconfigm4p
+
+case "$target" in
+ alpha*-cray-unicos*)
+ gmp_m4postinc="alpha/unicos.m4"
+ ;;
+ alpha*-*-*)
+ gmp_m4postinc="alpha/default.m4"
+ ;;
+ power*-*-*)
+ case "$target" in
+ *-*-mach* | *-*-rhapsody* | *-*-nextstep* | *-*-darwin* | *-*-macosx*)
+ ;; # these use non-conventional assembly syntax.
+ powerpc64-*-aix*)
+ gmp_m4postinc="powerpc32/regmap.m4 powerpc64/aix.m4"
+ ;;
+ *-*-aix*)
+ gmp_m4postinc="powerpc32/regmap.m4 powerpc32/aix.m4"
+ ;;
+ *)
+ gmp_m4postinc="powerpc32/regmap.m4"
+ ;;
+ esac
+ ;;
+esac
+
+for tmp_f in $gmp_m4postinc; do
+
+echo "include_mpn(\`$tmp_f')" >> $gmp_tmpconfigm4p
+
+done
+
+# Set up `gmp_links'. It's a list of link:file pairs that configure will
+# process to create link -> file.
+gmp_links=
+
+# If the user specified `MPN_PATH', use that instead of the path we've
+# come up with.
+if test -z "$MPN_PATH"; then
+ path="$path generic"
+else
+ path="$MPN_PATH"
+fi
+
+# Pick the correct source files in $path and link them to mpn/.
+# $gmp_mpn_functions lists all functions we need.
+#
+# The rule is to find a file with the function name and a .asm, .S,
+# .s, or .c extension. Certain multi-function files with special names
+# can provide some functions too. (mpn/Makefile.am passes
+# -DOPERATION_<func> to get them to generate the right code.)
+
+# FIXME: udiv and umul aren't in $gmp_mpn_functions_optional yet since
+# there's some versions of those files which should be checked for bit
+# rot first. Put them in $extra_functions for each target for now,
+# change to standard optionals when all are ready.
+
+# Note: The following lines defining $gmp_mpn_functions_optional
+# and $gmp_mpn_functions are parsed by the "macos/configure"
+# Perl script. So if you change the lines in a major way
+# make sure to run and examine the output from
+#
+# % (cd macos; perl configure)
+
+gmp_mpn_functions_optional="copyi copyd com_n \
+ and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n"
+
+gmp_mpn_functions="${extra_functions} inlines add_n sub_n mul_1 addmul_1 \
+ submul_1 lshift rshift diveby3 divrem divrem_1 divrem_2 \
+ mod_1 mod_1_rs pre_mod_1 dump \
+ mul mul_fft mul_n mul_basecase sqr_basecase random \
+ random2 sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
+ bdivmod gcd_1 gcd gcdext tdiv_qr bz_divrem_n sb_divrem_mn jacbase \
+ $gmp_mpn_functions_optional"
+
+# the list of all object files used by mpn/Makefile.in and the
+# top-level Makefile.in, respectively
+mpn_objects=
+mpn_objs_in_libgmp="mpn/mp_bases.lo"
+
+# SLPJ trace
+echo "Peering at file structure (takes a while)..." 1>&6
+
+for tmp_fn in ${gmp_mpn_functions} ; do
+# SLPJ trace
+ echo "...$tmp_fn..." 1>&6
+
+# This line was
+# rm -f mpn/${tmp_fn}.[Ssc] mpn/${tmp_fn}.asm
+# but I found that on my NT workstation the command
+# would unpredictably hang. rm wasn't an active process,
+# but absolutlely nothing was happening.
+# I *think* that expanding the [Ssc] cures the problem
+# SLPJ May 01
+ rm -f mpn/${tmp_fn}.S mpn/${tmp_fn}.s mpn/${tmp_fn}.c mpn/${tmp_fn}.asm
+
+ echo "...$tmp_fn (done rm)..." 1>&6
+
+ # functions that can be provided by multi-function files
+ tmp_mulfunc=
+ case $tmp_fn in
+ add_n|sub_n) tmp_mulfunc="aors_n" ;;
+ addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
+ popcount|hamdist) tmp_mulfunc="popham" ;;
+ and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
+ tmp_mulfunc="logops_n" ;;
+ esac
+
+ found=no
+ for tmp_dir in $path; do
+
+# SLPJ trace
+# We get stuck sometimes
+ echo " ...dir $tmp_dir..." 1>&6
+ for tmp_base in $tmp_fn $tmp_mulfunc; do
+
+# SLPJ trace
+# We get stuck sometimes
+ echo " ...base $tmp_base..." 1>&6
+ for tmp_ext in asm S s c; do
+ tmp_file=$srcdir/mpn/$tmp_dir/$tmp_base.$tmp_ext
+
+# SLPJ trace
+# We get stuck sometimes
+ echo " ...$tmp_file..." 1>&6
+
+ if test -f $tmp_file; then
+ found=yes
+
+ mpn_objects="$mpn_objects ${tmp_fn}.lo"
+ mpn_objs_in_libgmp="$mpn_objs_in_libgmp mpn/${tmp_fn}.lo"
+ gmp_links="$gmp_links mpn/$tmp_fn.$tmp_ext:mpn/$tmp_dir/$tmp_base.$tmp_ext"
+
+ # duplicate AC_DEFINEs are harmless, so it doesn't matter
+ # that multi-function files get grepped here repeatedly
+ gmp_ep="`
+ sed -n 's/^[ ]*MULFUNC_PROLOGUE(\(.*\))/\1/p' $tmp_file ;
+ sed -n 's/^[ ]*PROLOGUE.*(\(.*\))/\1/p' $tmp_file
+ `"
+ for gmp_tmp in $gmp_ep; do
+ cat >>confdefs.h <<EOF
+#define HAVE_NATIVE_${gmp_tmp} 1
+EOF
+
+ done
+
+ break
+ fi
+ done
+ if test $found = yes; then break ; fi
+ done
+ if test $found = yes; then break ; fi
+ done
+
+ if test $found = no; then
+ for tmp_optional in $gmp_mpn_functions_optional; do
+ if test $tmp_optional = $tmp_fn; then
+ found=yes
+ fi
+ done
+ if test $found = no; then
+ { echo "configure: error: no version of $tmp_fn found in path: $path" 1>&2; exit 1; }
+ fi
+ fi
+done
+
+
+# Create link for gmp-mparam.h.
+
+# SLPJ trace
+echo "Creating link for gmp-mparam.h..." 1>&6
+
+for tmp_dir in $path ; do
+ rm -f gmp-mparam.h
+ if test -f $srcdir/mpn/${tmp_dir}/gmp-mparam.h ; then
+ gmp_links="$gmp_links gmp-mparam.h:mpn/${tmp_dir}/gmp-mparam.h"
+
+ # Copy any KARATSUBA_SQR_THRESHOLD in gmp-mparam.h to config.m4.
+ # Some versions of sqr_basecase.asm use this.
+ tmp_gmp_karatsuba_sqr_threshold="`sed -n 's/^#define KARATSUBA_SQR_THRESHOLD[ ]*\([0-9][0-9]*\).*$/\1/p' $srcdir/mpn/${tmp_dir}/gmp-mparam.h`"
+ if test -n "$tmp_gmp_karatsuba_sqr_threshold"; then
+
+echo "define(<KARATSUBA_SQR_THRESHOLD>,<$tmp_gmp_karatsuba_sqr_threshold>)" >> $gmp_tmpconfigm4
+
+ fi
+
+ break
+ fi
+done
+
+# SLPJ trace
+echo "Digging out links to include in DISTCLEANFILES..." 1>&6
+
+# Dig out the links from `gmp_links' for inclusion in DISTCLEANFILES.
+gmp_srclinks=
+for f in $gmp_links; do
+ gmp_srclinks="$gmp_srclinks `echo $f | sed 's/\(.*\):.*/\1/'`"
+done
+
+echo "creating $gmp_configm4"
+echo "dnl $gmp_configm4. Generated automatically by configure." > $gmp_configm4
+if test -f $gmp_tmpconfigm4; then
+ echo "changequote(<,>)dnl" >> $gmp_configm4
+ echo "ifdef(<__CONFIG_M4_INCLUDED__>,,<" >> $gmp_configm4
+ cat $gmp_tmpconfigm4 >> $gmp_configm4
+ echo ">)" >> $gmp_configm4
+ echo "changequote(\`,')dnl" >> $gmp_configm4
+ rm $gmp_tmpconfigm4
+fi
+echo "ifdef(\`__CONFIG_M4_INCLUDED__',,\`" >> $gmp_configm4
+if test -f $gmp_tmpconfigm4i; then
+ cat $gmp_tmpconfigm4i >> $gmp_configm4
+ rm $gmp_tmpconfigm4i
+fi
+if test -f $gmp_tmpconfigm4p; then
+ cat $gmp_tmpconfigm4p >> $gmp_configm4
+ rm $gmp_tmpconfigm4p
+fi
+echo "')" >> $gmp_configm4
+echo "define(\`__CONFIG_M4_INCLUDED__')" >> $gmp_configm4
+
+trap '' 1 2 15
+cat >confcache <<\EOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs. It is not useful on other systems.
+# If it contains results you don't want to keep, you may remove or edit it.
+#
+# By default, configure uses ./config.cache as the cache file,
+# creating it if it does not exist already. You can give configure
+# the --cache-file=FILE option to use a different cache file; that is
+# what configure does when it calls configure scripts in
+# subdirectories, so they share the cache.
+# Giving --cache-file=/dev/null disables caching, for debugging configure.
+# config.status only pays attention to the cache file if you give it the
+# --recheck option to rerun configure.
+#
+EOF
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(set) 2>&1 |
+ case `(ac_space=' '; set | grep ac_space) 2>&1` in
+ *ac_space=\ *)
+ # `set' does not quote correctly, so add quotes (double-quote substitution
+ # turns \\\\ into \\, and sed turns \\ into \).
+ sed -n \
+ -e "s/'/'\\\\''/g" \
+ -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
+ ;;
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
+ ;;
+ esac >>confcache
+if cmp -s $cache_file confcache; then :; else
+ if test -w $cache_file; then
+ echo "updating cache $cache_file"
+ cat confcache >$cache_file
+ else
+ echo "not updating unwritable cache $cache_file"
+ fi
+fi
+rm -f confcache
+
+trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15
+
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+# Any assignment to VPATH causes Sun make to only execute
+# the first set of double-colon rules, so remove it if not needed.
+# If there is a colon in the path, we need to keep it.
+if test "x$srcdir" = x.; then
+ ac_vpsub='/^[ ]*VPATH[ ]*=[^:]*$/d'
+fi
+
+DEFS=-DHAVE_CONFIG_H
+
+: ${CONFIG_STATUS=./config.status}
+trap 'rm -f $CONFIG_STATUS conftest*; exit 1' 1 2 15
+echo creating $CONFIG_STATUS
+cat >$CONFIG_STATUS <<EOF
+#! /bin/sh
+# Generated automatically by configure.
+# Run this file to recreate the current configuration.
+# This directory was configured as follows,
+# on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
+#
+# $0 $ac_configure_args
+#
+# Compiler output produced by configure, useful for debugging
+# configure, is in ./config.log if it exists.
+
+# Files that config.status was made for.
+config_files="\\
+ Makefile mpn/Makefile mpz/Makefile"
+config_headers="\\
+ config.h:config.in"
+config_links="\\
+ $gmp_links"
+config_commands="\\
+ default-1"
+
+ac_cs_usage="\\
+\\\`$CONFIG_STATUS' instantiates files from templates according to the
+current configuration.
+
+Usage: $CONFIG_STATUS [OPTIONS] FILE...
+
+ --recheck Update $CONFIG_STATUS by reconfiguring in the same conditions
+ --version Print the version of Autoconf and exit
+ --help Display this help and exit
+ --file=FILE[:TEMPLATE]
+ Instantiate the configuration file FILE
+ --header=FILE[:TEMPLATE]
+ Instantiate the configuration header FILE
+
+Configuration files:
+\$config_files
+
+Configuration headers:
+\$config_headers
+
+Configuration links:
+\$config_links
+
+Configuration commands:
+\$config_commands
+
+Report bugs to <bug-autoconf@gnu.org>."
+
+ac_cs_version="\\
+$CONFIG_STATUS generated by autoconf version 2.14a.
+Configured on host `(hostname || uname -n) 2>/dev/null | sed 1q` by
+ `echo "$0 $ac_configure_args" | sed 's/[\\"\`\$]/\\\\&/g'`"
+
+# Root of the tmp file names. Use pid to allow concurrent executions.
+ac_cs_root=cs\$\$
+ac_given_srcdir=$srcdir
+ac_given_INSTALL="$INSTALL"
+
+# If no file are specified by the user, then we need to provide default
+# value. By we need to know if files were specified by the user.
+ac_need_defaults=:
+while test \$# != 0
+do
+ case "\$1" in
+ --*=*)
+ ac_option=\`echo "\$1" | sed -e 's/=.*//'\`
+ ac_optarg=\`echo "\$1" | sed -e 's/[^=]*=//'\`
+ shift
+ set dummy "\$ac_option" "\$ac_optarg" \${1+"\$@"}
+ shift
+ ;;
+ -*);;
+ *) # This is not an option, so the user has probably given explicit
+ # arguments.
+ ac_need_defaults=false;;
+ esac
+
+ case "\$1" in
+
+ # Handling of the options.
+ -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
+ echo "running \${CONFIG_SHELL-/bin/sh} $0 `echo "$ac_configure_args" | sed 's/[\\"\`\$]/\\\\&/g'` --no-create --no-recursion"
+ exec \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion ;;
+ -version | --version | --versio | --versi | --vers | --ver | --ve | --v)
+ echo "\$ac_cs_version"; exit 0 ;;
+ --he | --h)
+ # Conflict between --help and --header
+ echo "$CONFIG_STATUS: ambiguous option: \$ac_option
+Try \\\`$CONFIG_STATUS --help' for more information."; exit 1 ;;
+ -help | --help | --hel )
+ echo "\$ac_cs_usage"; exit 0 ;;
+ --file | --fil | --fi | --f )
+ shift
+ CONFIG_FILES="\$CONFIG_FILES \$1"
+ ac_need_defaults=false;;
+ --header | --heade | --head | --hea )
+ shift
+ CONFIG_HEADERS="\$CONFIG_FILES \$1"
+ ac_need_defaults=false;;
+
+ # Handling of arguments.
+ 'Makefile' ) CONFIG_FILES="\$CONFIG_FILES Makefile" ;;
+ 'mpz/Makefile' ) CONFIG_FILES="\$CONFIG_FILES mpz/Makefile" ;;
+ 'mpn/Makefile' ) CONFIG_FILES="\$CONFIG_FILES mpn/Makefile" ;;
+ '$gmp_links' ) CONFIG_LINKS="\$CONFIG_LINKS $gmp_links" ;;
+ 'default-1' ) CONFIG_COMMANDS="\$CONFIG_COMMANDS default-1" ;;
+ 'config.h' ) CONFIG_HEADERS="\$CONFIG_HEADERS config.h:config.in" ;;
+
+ # This is an error.
+ -*) echo "$CONFIG_STATUS: unrecognized option: \$1
+Try \\\`$CONFIG_STATUS --help' for more information."; exit 1 ;;
+ *) echo "$CONFIG_STATUS: invalid argument: \$1"; exit 1 ;;
+ esac
+ shift
+done
+
+EOF
+
+cat >>$CONFIG_STATUS <<\EOF
+# If the user did not use the arguments to specify the items to instantiate,
+# then the envvar interface is used. Set only those that are not.
+if $ac_need_defaults; then
+ : ${CONFIG_FILES=$config_files}
+ : ${CONFIG_HEADERS=$config_headers}
+ : ${CONFIG_LINKS=$config_links}
+ : ${CONFIG_COMMANDS=$config_commands}
+fi
+
+# Trap to remove the temp files.
+trap 'rm -fr $ac_cs_root*; exit 1' 1 2 15
+
+EOF
+
+cat >>$CONFIG_STATUS <<EOF
+#
+# INIT-COMMANDS section.
+#
+
+EOF
+
+cat >>$CONFIG_STATUS <<EOF
+
+#
+# CONFIG_FILES section.
+#
+
+# No need to generate the scripts if there are no CONFIG_FILES.
+# This happens for instance when ./config.status config.h
+if test -n "\$CONFIG_FILES"; then
+ # Protect against being on the right side of a sed subst in config.status.
+ sed 's/%@/@@/; s/@%/@@/; s/%;t t\$/@;t t/; /@;t t\$/s/[\\\\&%]/\\\\&/g;
+ s/@@/%@/; s/@@/@%/; s/@;t t\$/%;t t/' >\$ac_cs_root.subs <<\\CEOF
+s%@exec_prefix@%$exec_prefix%;t t
+s%@prefix@%$prefix%;t t
+s%@program_transform_name@%$program_transform_name%;t t
+s%@bindir@%$bindir%;t t
+s%@sbindir@%$sbindir%;t t
+s%@libexecdir@%$libexecdir%;t t
+s%@datadir@%$datadir%;t t
+s%@sysconfdir@%$sysconfdir%;t t
+s%@sharedstatedir@%$sharedstatedir%;t t
+s%@localstatedir@%$localstatedir%;t t
+s%@libdir@%$libdir%;t t
+s%@includedir@%$includedir%;t t
+s%@oldincludedir@%$oldincludedir%;t t
+s%@infodir@%$infodir%;t t
+s%@mandir@%$mandir%;t t
+s%@SHELL@%$SHELL%;t t
+s%@ECHO_C@%$ECHO_C%;t t
+s%@ECHO_N@%$ECHO_N%;t t
+s%@ECHO_T@%$ECHO_T%;t t
+s%@CFLAGS@%$CFLAGS%;t t
+s%@CPPFLAGS@%$CPPFLAGS%;t t
+s%@CXXFLAGS@%$CXXFLAGS%;t t
+s%@FFLAGS@%$FFLAGS%;t t
+s%@DEFS@%$DEFS%;t t
+s%@LDFLAGS@%$LDFLAGS%;t t
+s%@LIBS@%$LIBS%;t t
+s%@host@%$host%;t t
+s%@host_alias@%$host_alias%;t t
+s%@host_cpu@%$host_cpu%;t t
+s%@host_vendor@%$host_vendor%;t t
+s%@host_os@%$host_os%;t t
+s%@target@%$target%;t t
+s%@target_alias@%$target_alias%;t t
+s%@target_cpu@%$target_cpu%;t t
+s%@target_vendor@%$target_vendor%;t t
+s%@target_os@%$target_os%;t t
+s%@build@%$build%;t t
+s%@build_alias@%$build_alias%;t t
+s%@build_cpu@%$build_cpu%;t t
+s%@build_vendor@%$build_vendor%;t t
+s%@build_os@%$build_os%;t t
+s%@INSTALL_PROGRAM@%$INSTALL_PROGRAM%;t t
+s%@INSTALL_SCRIPT@%$INSTALL_SCRIPT%;t t
+s%@INSTALL_DATA@%$INSTALL_DATA%;t t
+s%@PACKAGE@%$PACKAGE%;t t
+s%@VERSION@%$VERSION%;t t
+s%@ACLOCAL@%$ACLOCAL%;t t
+s%@AUTOCONF@%$AUTOCONF%;t t
+s%@AUTOMAKE@%$AUTOMAKE%;t t
+s%@AUTOHEADER@%$AUTOHEADER%;t t
+s%@MAKEINFO@%$MAKEINFO%;t t
+s%@AMTAR@%$AMTAR%;t t
+s%@install_sh@%$install_sh%;t t
+s%@AWK@%$AWK%;t t
+s%@SET_MAKE@%$SET_MAKE%;t t
+s%@AMDEP@%$AMDEP%;t t
+s%@AMDEPBACKSLASH@%$AMDEPBACKSLASH%;t t
+s%@DEPDIR@%$DEPDIR%;t t
+s%@MAINTAINER_MODE_TRUE@%$MAINTAINER_MODE_TRUE%;t t
+s%@MAINTAINER_MODE_FALSE@%$MAINTAINER_MODE_FALSE%;t t
+s%@MAINT@%$MAINT%;t t
+s%@WANT_MPBSD_TRUE@%$WANT_MPBSD_TRUE%;t t
+s%@WANT_MPBSD_FALSE@%$WANT_MPBSD_FALSE%;t t
+s%@WANT_MPFR_TRUE@%$WANT_MPFR_TRUE%;t t
+s%@WANT_MPFR_FALSE@%$WANT_MPFR_FALSE%;t t
+s%@CC@%$CC%;t t
+s%@CCAS@%$CCAS%;t t
+s%@CPP@%$CPP%;t t
+s%@LN_S@%$LN_S%;t t
+s%@M4@%$M4%;t t
+s%@AR@%$AR%;t t
+s%@CALLING_CONVENTIONS_OBJS@%$CALLING_CONVENTIONS_OBJS%;t t
+s%@SPEED_CYCLECOUNTER_OBJS@%$SPEED_CYCLECOUNTER_OBJS%;t t
+s%@EXEEXT@%$EXEEXT%;t t
+s%@OBJEXT@%$OBJEXT%;t t
+s%@RANLIB@%$RANLIB%;t t
+s%@STRIP@%$STRIP%;t t
+s%@LIBTOOL@%$LIBTOOL%;t t
+s%@U@%$U%;t t
+s%@ANSI2KNR@%$ANSI2KNR%;t t
+s%@mpn_objects@%$mpn_objects%;t t
+s%@mpn_objs_in_libgmp@%$mpn_objs_in_libgmp%;t t
+s%@gmp_srclinks@%$gmp_srclinks%;t t
+CEOF
+
+EOF
+
+ cat >>$CONFIG_STATUS <<\EOF
+ # Split the substitutions into bite-sized pieces for seds with
+ # small command number limits, like on Digital OSF/1 and HP-UX.
+ ac_max_sed_lines=48
+ ac_sed_frag=1 # Number of current file.
+ ac_beg=1 # First line for current file.
+ ac_end=$ac_max_sed_lines # Line after last line for current file.
+ ac_more_lines=:
+ ac_sed_cmds=""
+ while $ac_more_lines; do
+ if test $ac_beg -gt 1; then
+ sed "1,${ac_beg}d; ${ac_end}q" $ac_cs_root.subs >$ac_cs_root.sfrag
+ else
+ sed "${ac_end}q" $ac_cs_root.subs >$ac_cs_root.sfrag
+ fi
+ if test ! -s $ac_cs_root.sfrag; then
+ ac_more_lines=false
+ rm -f $ac_cs_root.sfrag
+ else
+ # The purpose of the label and of the branching condition is to
+ # speed up the sed processing (if there are no `@' at all, there
+ # is no need to browse any of the substitutions).
+ # These are the two extra sed commands mentioned above.
+ (echo ':t
+ /@[a-zA-Z_][a-zA-Z_0-9]*@/!b' && cat $ac_cs_root.sfrag) >$ac_cs_root.s$ac_sed_frag
+ if test -z "$ac_sed_cmds"; then
+ ac_sed_cmds="sed -f $ac_cs_root.s$ac_sed_frag"
+ else
+ ac_sed_cmds="$ac_sed_cmds | sed -f $ac_cs_root.s$ac_sed_frag"
+ fi
+ ac_sed_frag=`expr $ac_sed_frag + 1`
+ ac_beg=$ac_end
+ ac_end=`expr $ac_end + $ac_max_sed_lines`
+ fi
+ done
+ if test -z "$ac_sed_cmds"; then
+ ac_sed_cmds=cat
+ fi
+fi # test -n "$CONFIG_FILES"
+
+EOF
+cat >>$CONFIG_STATUS <<\EOF
+for ac_file in .. $CONFIG_FILES; do if test "x$ac_file" != x..; then
+ # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in".
+ case "$ac_file" in
+ *:*) ac_file_in=`echo "$ac_file" | sed 's%[^:]*:%%'`
+ ac_file=`echo "$ac_file" | sed 's%:.*%%'` ;;
+ *) ac_file_in="${ac_file}.in" ;;
+ esac
+
+ # Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories.
+
+ # Remove last slash and all that follows it. Not all systems have dirname.
+ ac_dir=`echo "$ac_file" | sed 's%/[^/][^/]*$%%'`
+ if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then
+ # The file is in a subdirectory.
+ test ! -d "$ac_dir" && mkdir "$ac_dir"
+ ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`"
+ # A "../" for each directory in $ac_dir_suffix.
+ ac_dots=`echo "$ac_dir_suffix" | sed 's%/[^/]*%../%g'`
+ else
+ ac_dir_suffix= ac_dots=
+ fi
+
+ case "$ac_given_srcdir" in
+ .) srcdir=.
+ if test -z "$ac_dots"; then top_srcdir=.
+ else top_srcdir=`echo $ac_dots | sed 's%/$%%'`; fi ;;
+ [\\/]* | ?:[\\/]* )
+ srcdir="$ac_given_srcdir$ac_dir_suffix";
+ top_srcdir=$ac_given_srcdir ;;
+ *) # Relative path.
+ srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix"
+ top_srcdir="$ac_dots$ac_given_srcdir" ;;
+ esac
+
+ case "$ac_given_INSTALL" in
+ [\\/$]* | ?:[\\/]* ) INSTALL="$ac_given_INSTALL" ;;
+ *) INSTALL="$ac_dots$ac_given_INSTALL" ;;
+ esac
+
+ echo creating "$ac_file"
+ rm -f "$ac_file"
+ configure_input="Generated automatically from `echo $ac_file_in |
+ sed 's%.*/%%'` by configure."
+ case "$ac_file" in
+ *[Mm]akefile*) ac_comsub="1i\\
+# $configure_input" ;;
+ *) ac_comsub= ;;
+ esac
+
+ # Don't redirect the output to AC_FILE directly: use `mv' so that updating
+ # is atomic, and doesn't need trapping.
+ ac_file_inputs=`echo "$ac_file_in" |
+ sed -e "s%:% $ac_given_srcdir/%g;s%^%$ac_given_srcdir/%"`
+ for ac_file_input in $ac_file_inputs;
+ do
+ test -f "$ac_file_input" ||
+ { echo "configure: error: cannot find input file \`$ac_file_input'" 1>&2; exit 1; }
+ done
+EOF
+cat >>$CONFIG_STATUS <<EOF
+ sed -e "$ac_comsub
+$ac_vpsub
+$extrasub
+EOF
+cat >>$CONFIG_STATUS <<\EOF
+:t
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+s%@configure_input@%$configure_input%;t t
+s%@srcdir@%$srcdir%;t t
+s%@top_srcdir@%$top_srcdir%;t t
+s%@INSTALL@%$INSTALL%;t t
+" $ac_file_inputs | (eval "$ac_sed_cmds") >$ac_cs_root.out
+ mv $ac_cs_root.out $ac_file
+
+fi; done
+rm -f $ac_cs_root.s*
+EOF
+cat >>$CONFIG_STATUS <<\EOF
+
+#
+# CONFIG_HEADER section.
+#
+
+# These sed commands are passed to sed as "A NAME B NAME C VALUE D", where
+# NAME is the cpp macro being defined and VALUE is the value it is being given.
+#
+# ac_d sets the value in "#define NAME VALUE" lines.
+ac_dA='s%^\([ ]*\)#\([ ]*define[ ][ ]*\)'
+ac_dB='[ ].*$%\1#\2'
+ac_dC=' '
+ac_dD='%;t'
+# ac_u turns "#undef NAME" without trailing blanks into "#define NAME VALUE".
+ac_uA='s%^\([ ]*\)#\([ ]*\)undef\([ ][ ]*\)'
+ac_uB='$%\1#\2define\3'
+ac_uC=' '
+ac_uD='%;t'
+
+for ac_file in .. $CONFIG_HEADERS; do if test "x$ac_file" != x..; then
+ # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in".
+ case "$ac_file" in
+ *:*) ac_file_in=`echo "$ac_file" | sed 's%[^:]*:%%'`
+ ac_file=`echo "$ac_file" | sed 's%:.*%%'` ;;
+ *) ac_file_in="${ac_file}.in" ;;
+ esac
+
+ echo creating $ac_file
+
+ rm -f $ac_cs_root.frag $ac_cs_root.in $ac_cs_root.out
+ ac_file_inputs=`echo "$ac_file_in" |
+ sed -e "s%:% $ac_given_srcdir/%g;s%^%$ac_given_srcdir/%"`
+ for ac_file_input in $ac_file_inputs;
+ do
+ test -f "$ac_file_input" ||
+ { echo "configure: error: cannot find input file \`$ac_file_input'" 1>&2; exit 1; }
+ done
+ # Remove the trailing spaces.
+ sed -e 's/[ ]*$//' $ac_file_inputs >$ac_cs_root.in
+
+EOF
+
+# Transform confdefs.h into two sed scripts, `conftest.defines' and
+# `conftest.undefs', that substitutes the proper values into
+# config.h.in to produce config.h. The first handles `#define'
+# templates, and the second `#undef' templates.
+# And first: Protect against being on the right side of a sed subst in
+# config.status. Protect against being in an unquoted here document
+# in config.status.
+rm -f conftest.defines conftest.undefs
+ac_cs_root=conftest
+cat >$ac_cs_root.hdr <<\EOF
+s/[\\&%]/\\&/g
+s%[\\$`]%\\&%g
+t clear
+: clear
+s%^[ ]*#[ ]*define[ ][ ]*\(\([^ (][^ (]*\)([^)]*)\)[ ]*\(.*\)$%${ac_dA}\2${ac_dB}\1${ac_dC}\3${ac_dD}%gp
+t cleanup
+s%^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)$%${ac_dA}\1${ac_dB}\1${ac_dC}\2${ac_dD}%gp
+: cleanup
+EOF
+# If some macros were called several times there might be several times
+# the same #defines, which is useless. Nevertheless, we may not want to
+# sort them, since we want the *last* AC_DEFINE to be honored.
+uniq confdefs.h | sed -n -f $ac_cs_root.hdr >conftest.defines
+sed -e 's/ac_d/ac_u/g' conftest.defines >conftest.undefs
+rm -f $ac_cs_root.hdr
+
+# This sed command replaces #undef with comments. This is necessary, for
+# example, in the case of _POSIX_SOURCE, which is predefined and required
+# on some systems where configure will not decide to define it.
+cat >>conftest.undefs <<\EOF
+s%^[ ]*#[ ]*undef[ ][ ]*[a-zA-Z_][a-zA-Z_0-9]*%/* & */%
+EOF
+
+# Break up conftest.defines because some shells have a limit on the size
+# of here documents, and old seds have small limits too (100 cmds).
+echo ' # Handle all the #define templates only if necessary.' >>$CONFIG_STATUS
+echo ' if egrep "^[ ]*#[ ]*define" $ac_cs_root.in >/dev/null; then' >>$CONFIG_STATUS
+echo ' # If there are no defines, we may have an empty if/fi' >>$CONFIG_STATUS
+echo ' :' >>$CONFIG_STATUS
+rm -f conftest.tail
+while grep . conftest.defines >/dev/null
+do
+ # Write a limited-size here document to $ac_cs_root.frag.
+ echo ' cat >$ac_cs_root.frag <<CEOF' >>$CONFIG_STATUS
+ echo '/^[ ]*#[ ]*define/!b' >>$CONFIG_STATUS
+ sed ${ac_max_here_lines}q conftest.defines >>$CONFIG_STATUS
+ echo 'CEOF
+ sed -f $ac_cs_root.frag $ac_cs_root.in >$ac_cs_root.out
+ rm -f $ac_cs_root.in
+ mv $ac_cs_root.out $ac_cs_root.in
+' >>$CONFIG_STATUS
+ sed 1,${ac_max_here_lines}d conftest.defines >conftest.tail
+ rm -f conftest.defines
+ mv conftest.tail conftest.defines
+done
+rm -f conftest.defines
+echo ' fi # egrep' >>$CONFIG_STATUS
+echo >>$CONFIG_STATUS
+
+# Break up conftest.undefs because some shells have a limit on the size
+# of here documents, and old seds have small limits too (100 cmds).
+echo ' # Handle all the #undef templates' >>$CONFIG_STATUS
+rm -f conftest.tail
+while grep . conftest.undefs >/dev/null
+do
+ # Write a limited-size here document to $ac_cs_root.frag.
+ echo ' cat >$ac_cs_root.frag <<CEOF' >>$CONFIG_STATUS
+ echo '/^[ ]*#[ ]*undef/!b' >>$CONFIG_STATUS
+ sed ${ac_max_here_lines}q conftest.undefs >>$CONFIG_STATUS
+ echo 'CEOF
+ sed -f $ac_cs_root.frag $ac_cs_root.in >$ac_cs_root.out
+ rm -f $ac_cs_root.in
+ mv $ac_cs_root.out $ac_cs_root.in
+' >>$CONFIG_STATUS
+ sed 1,${ac_max_here_lines}d conftest.undefs >conftest.tail
+ rm -f conftest.undefs
+ mv conftest.tail conftest.undefs
+done
+rm -f conftest.undefs
+
+cat >>$CONFIG_STATUS <<\EOF
+ rm -f $ac_cs_root.frag $ac_cs_root.h
+ echo "/* $ac_file. Generated automatically by configure. */" >$ac_cs_root.h
+ cat $ac_cs_root.in >>$ac_cs_root.h
+ rm -f $ac_cs_root.in
+ if cmp -s $ac_file $ac_cs_root.h 2>/dev/null; then
+ echo "$ac_file is unchanged"
+ rm -f $ac_cs_root.h
+ else
+ # Remove last slash and all that follows it. Not all systems have dirname.
+ ac_dir=`echo "$ac_file" | sed 's%/[^/][^/]*$%%'`
+ if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then
+ # The file is in a subdirectory.
+ test ! -d "$ac_dir" && mkdir "$ac_dir"
+ fi
+ rm -f $ac_file
+ mv $ac_cs_root.h $ac_file
+ fi
+fi; done
+EOF
+cat >>$CONFIG_STATUS <<\EOF
+
+#
+# CONFIG_LINKS section.
+#
+srcdir=$ac_given_srcdir
+
+for ac_file in : $CONFIG_LINKS; do if test "x$ac_file" != x:; then
+ ac_dest=`echo "$ac_file" | sed 's%:.*%%'`
+ ac_source=`echo "$ac_file" | sed 's%[^:]*:%%'`
+
+ echo "copying $srcdir/$ac_source to $ac_dest"
+
+ if test ! -r $srcdir/$ac_source; then
+ { echo "configure: error: $srcdir/$ac_source: File not found" 1>&2; exit 1; }
+ fi
+ rm -f $ac_dest
+
+ # Make relative symlinks.
+ # Remove last slash and all that follows it. Not all systems have dirname.
+ ac_dest_dir=`echo $ac_dest | sed 's%/[^/][^/]*$%%'`
+ if test "$ac_dest_dir" != "$ac_dest" && test "$ac_dest_dir" != .; then
+ # The dest file is in a subdirectory.
+ test ! -d "$ac_dest_dir" && mkdir "$ac_dest_dir"
+ ac_dest_dir_suffix="/`echo $ac_dest_dir|sed 's%^\./%%'`"
+ # A "../" for each directory in $ac_dest_dir_suffix.
+ ac_dots=`echo $ac_dest_dir_suffix|sed 's%/[^/]*%../%g'`
+ else
+ ac_dest_dir_suffix= ac_dots=
+ fi
+
+ case "$srcdir" in
+ [\\/$]* | ?:[\\/]* ) ac_rel_source="$srcdir/$ac_source" ;;
+ *) ac_rel_source="$ac_dots$srcdir/$ac_source" ;;
+ esac
+
+ # Note: Dodgy local mods to 'make things work' in an environment (cygwin)
+ # that supports symlinks (through silly hack) using tools that don't
+ # understand them (mingw). The end sometimes justifies the means, son.
+ #
+ # Make a symlink if possible; otherwise try a hard link.
+ #if ln -s $ac_rel_source $ac_dest 2>/dev/null ||
+ # ln $srcdir/$ac_source $ac_dest; then :
+ #
+ # Note: If the -p offends your 'cp', just drop it; no harm done, you'll just
+ # get more recompilations.
+ #
+ if cp -p $srcdir/$ac_source $ac_dest; then :
+ else
+ { echo "configure: error: cannot copy $ac_dest to $srcdir/$ac_source" 1>&2; exit 1; }
+ fi
+fi; done
+EOF
+cat >>$CONFIG_STATUS <<\EOF
+
+#
+# CONFIG_COMMANDS section.
+#
+for ac_file in .. $CONFIG_COMMANDS; do if test "x$ac_file" != x..; then
+ ac_dest=`echo "$ac_file" | sed 's%:.*%%'`
+ ac_source=`echo "$ac_file" | sed 's%[^:]*:%%'`
+
+ case "$ac_dest" in
+ default-1 ) test -z "$CONFIG_HEADERS" || echo timestamp > stamp-h ;;
+ esac
+fi;done
+EOF
+
+cat >>$CONFIG_STATUS <<\EOF
+
+exit 0
+EOF
+chmod +x $CONFIG_STATUS
+rm -fr confdefs* $ac_clean_files
+trap 'exit 1' 1 2 15
+
+test "$no_create" = yes || $SHELL $CONFIG_STATUS || exit 1
diff --git a/rts/gmp/configure.in b/rts/gmp/configure.in
new file mode 100644
index 0000000000..18f610fe29
--- /dev/null
+++ b/rts/gmp/configure.in
@@ -0,0 +1,950 @@
+dnl Process this file with autoconf to produce a configure script.
+
+
+dnl Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+AC_REVISION($Revision: 1.8 $)dnl
+AC_PREREQ(2.14)dnl
+AC_INIT(gmp-impl.h)
+
+dnl Check system.
+AC_CANONICAL_SYSTEM
+
+dnl Automake
+AM_INIT_AUTOMAKE(gmp, GMP_VERSION)
+AM_CONFIG_HEADER(config.h:config.in)
+AM_MAINTAINER_MODE
+
+dnl GMP specific
+GMP_INIT(config.m4)
+
+
+AC_ARG_ENABLE(assert,
+AC_HELP_STRING([--enable-assert],[enable ASSERT checking [default=no]]),
+[case "${enableval}" in
+yes|no) ;;
+*) AC_MSG_ERROR([bad value ${enableval} for --enable-assert, need yes or no]) ;;
+esac],
+[enable_assert=no])
+
+if test "$enable_assert" = "yes"; then
+ AC_DEFINE(WANT_ASSERT,1,
+ [./configure --enable-assert option, to enable some ASSERT()s])
+fi
+
+
+AC_ARG_ENABLE(alloca,
+AC_HELP_STRING([--enable-alloca],[use alloca for temp space [default=yes]]),
+[case "${enableval}" in
+yes|no) ;;
+*) AC_MSG_ERROR([bad value ${enableval} for --enable-alloca, need yes or no]) ;;
+esac],
+[enable_alloca=yes])
+
+if test "$enable_alloca" = "no"; then
+ AC_DEFINE(USE_STACK_ALLOC,1,
+ [./configure --disable-alloca option, to use stack-alloc.c, not alloca])
+fi
+
+
+AC_ARG_ENABLE(fft,
+AC_HELP_STRING([--enable-fft],[enable FFTs for multiplication [default=no]]),
+[case "${enableval}" in
+yes|no) ;;
+*) AC_MSG_ERROR([bad value ${enableval} for --enable-fft, need yes or no]) ;;
+esac],
+[enable_fft=no])
+
+if test "$enable_fft" = "yes"; then
+ AC_DEFINE(WANT_FFT,1,
+ [./configure --enable-fft option, to enable FFTs for multiplication])
+fi
+
+
+AC_ARG_ENABLE(mpbsd,
+AC_HELP_STRING([--enable-mpbsd],[build Berkley MP compatibility library [default=no]]),
+[case "${enableval}" in
+yes|no) ;;
+*) AC_MSG_ERROR([bad value ${enableval} for --enable-mpbsd, need yes or no]) ;;
+esac],
+[enable_mpbsd=no])
+AM_CONDITIONAL(WANT_MPBSD, test "$enable_mpbsd" = "yes")
+
+
+AC_ARG_ENABLE(mpfr,
+AC_HELP_STRING([--enable-mpfr],[build MPFR [default=no]]),
+[case "${enableval}" in
+yes|no) ;;
+*) AC_MSG_ERROR([bad value ${enableval} for --enable-mpfr, need yes or no]) ;;
+esac],
+[enable_mpfr=no])
+AM_CONDITIONAL(WANT_MPFR, test "$enable_mpfr" = "yes")
+
+
+dnl Switch on OS and determine what compiler to use.
+dnl
+dnl os_64bit Set to "yes" if OS is 64-bit capable.
+dnl FIXME: Rename to `check_64bit_compiler'!
+dnl cclist List of compilers, best first.
+dnl gmp_cflags_{cc} Flags for compiler named {cc}.
+dnl gmp_cflags64_{cc} Flags for compiler named {cc} for 64-bit code.
+dnl gmp_optcflags_{cc} Optional compiler flags.
+dnl gmp_xoptcflags_{cc} Exclusive optional compiler flags.
+dnl
+os_64bit="no"
+cclist="gcc cc" # FIXME: Prefer c89 to cc.
+gmp_cflags_gcc="-g -O2"
+gmp_cflags64_gcc="-g -O2"
+gmp_cflags_cc="-g"
+gmp_cflags64_cc="-g"
+
+case "$target" in
+ # Alpha
+ alpha*-cray-unicos*)
+ # Don't perform any assembly syntax tests on this beast.
+ gmp_no_asm_syntax_testing=yes
+ cclist=cc
+ gmp_cflags_cc="$gmp_cflags_cc -O"
+ ;;
+ alpha*-*-osf*)
+ flavour=`echo $target_cpu | sed 's/^alpha//g'`
+ if test -n "$flavour"; then
+ case $flavour in # compilers don't seem to understand `ev67' and such.
+ ev6? | ev7*) flavour=ev6;;
+ esac
+ gmp_optcflags_gcc="-mcpu=$flavour"
+ # FIXME: We shouldn't fail fatally if none of these work, but that's
+ # how xoptcflags work and we don't have any other mechanism right now.
+ # Why do we need this here and not for alpha*-*-* below?
+ gmp_xoptcflags_gcc="-Wa,-arch,${flavour} -Wa,-m${flavour}"
+ gmp_optcflags_cc="-arch $flavour -tune $flavour"
+ fi
+ ;;
+ alpha*-*-*)
+ cclist="gcc"
+ flavour=`echo $target_cpu | sed 's/^alpha//g'`
+ if test -n "$flavour"; then
+ case $flavour in
+ ev6? | ev7*) flavour=ev6;;
+ esac
+ gmp_optcflags_gcc="-mcpu=$flavour"
+ fi
+ ;;
+ # Cray vector machines. This must come after alpha* so that we can
+ # recognize present and future vector processors with a wildcard.
+ *-cray-unicos*)
+ # Don't perform any assembly syntax tests on this beast.
+ gmp_no_asm_syntax_testing=yes
+ cclist=cc
+ # Don't inherit default gmp_cflags_cc value; it comes with -g which
+ # disables all optimization on Cray vector systems
+ gmp_cflags_cc="-O"
+ ;;
+
+ # AMD and Intel x86 configurations
+ [i?86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*])
+ # Rumour has it -O2 used to give worse register allocation than just -O.
+ gmp_cflags_gcc="-g -O -fomit-frame-pointer"
+
+ case "${target}" in
+ i386*-*-*) gmp_optcflags_gcc="-mcpu=i386 -march=i386";;
+ i486*-*-*) gmp_optcflags_gcc="-mcpu=i486 -march=i486";;
+ i586*-*-* | pentium-*-* | pentiummmx-*-*)
+ gmp_optcflags_gcc="-mcpu=pentium -march=pentium";;
+
+ # -march=pentiumpro not used because mpz/powm.c (swox cvs rev 1.4)
+ # tickles a bug in gcc 2.95.2 (believed fixed in 2.96).
+ [i686*-*-* | pentiumpro-*-* | pentium[23]-*-*])
+ gmp_optcflags_gcc="-mcpu=pentiumpro";;
+
+ k6*-*-*) gmp_optcflags_gcc="-mcpu=k6 -march=k6";;
+
+ # Athlon instruction costs are close to p6: 3 cycle load latency, 4-6
+ # cycle mul, 40 cycle div, pairable adc, ...
+ # FIXME: Change this when gcc gets something specific for Athlon.
+ # -march=pentiumpro not used, per i686 above.
+ athlon-*-*) gmp_optcflags_gcc="-mcpu=pentiumpro";;
+ esac
+ ;;
+
+ # Sparc
+ [ultrasparc*-*-solaris2.[7-9] | sparcv9-*-solaris2.[7-9]])
+ os_64bit=yes
+ gmp_cflags_gcc="$gmp_cflags_gcc -Wa,-xarch=v8plus"
+ gmp_xoptcflags_gcc="-mcpu=v9 -mcpu=v8 -mv8"
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -m64 -mptr64 -Wa,-xarch=v9 -mcpu=v9"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ gmp_cflags64_cc="-xtarget=native -xarch=v9 -xO4"
+ ;;
+ sparc64-*-linux*)
+ # Need to think more about the options passed here. This isn't good for
+ # some sparc64 linux distros, since we end up not optimizing when all the
+ # options below fail.
+ os_64bit=yes
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -m64 -mptr64 -Wa,-xarch=v9 -mcpu=v9"
+ gmp_cflags_gcc="$gmp_cflags_gcc -m32"
+ gmp_xoptflags_gcc="-mcpu=ultrasparc -mvis"
+ ;;
+ ultrasparc*-*-* | sparcv9-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -Wa,-xarch=v8plus"
+ gmp_xoptcflags_gcc="-mcpu=v9 -mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ ;;
+ sparcv8*-*-solaris2.* | microsparc*-*-solaris2.*)
+ gmp_cflags_gcc="$gmp_cflags_gcc"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ ;;
+ sparcv8*-*-* | microsparc*-*-*) # SunOS, Linux, *BSD
+ cclist="gcc acc cc"
+ gmp_cflags_gcc="$gmp_cflags_gcc"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_acc="-g -O2 -cg92"
+ gmp_cflags_cc="-O2" # FIXME: Flag for v8?
+ ;;
+ supersparc*-*-solaris2.*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -DSUPERSPARC"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4 -DSUPERSPARC"
+ ;;
+ supersparc*-*-*) # SunOS, Linux, *BSD
+ cclist="gcc acc cc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -DSUPERSPARC"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_acc="-g -O2 -cg92 -DSUPERSPARC"
+ gmp_cflags_cc="-O2 -DSUPERSPARC" # FIXME: Flag for v8?
+ ;;
+ *sparc*-*-*)
+ cclist="gcc acc cc"
+ gmp_cflags_acc="-g -O2"
+ gmp_cflags_cc="-g -O2"
+ ;;
+
+ # POWER/PowerPC
+ powerpc64-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -maix64 -mpowerpc64"
+ gmp_cflags_xlc="-g -O2 -q64 -qtune=pwr3"
+ ;;
+ powerpc*-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc"
+ gmp_cflags_xlc="$gmp_cflags_cc -qarch=ppc -O2"
+ ;;
+ power-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpower"
+ gmp_cflags_xlc="$gmp_cflags_cc -qarch=pwr -O2"
+ ;;
+ powerpc64*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc64"
+ AC_DEFINE(_LONG_LONG_LIMB) dnl FIXME: Remove.
+ ;;
+ powerpc-apple-darwin* | powerpc-apple-macosx*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc -traditional-cpp"
+ ;;
+ powerpc*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc"
+ ;;
+
+ # MIPS
+ mips-sgi-irix6.*)
+ os_64bit=yes
+ gmp_cflags64_gcc="-g -O2 -mabi=n32"
+ gmp_cflags64_cc="$gmp_cflags64_cc -O2 -n32"
+ ;;
+
+ # Motorola 68k family
+ m88110*-*-*)
+ gmp_cflags_gcc="-g -O -m88110" dnl FIXME: Use `-O2'?
+ ;;
+ m68*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -fomit-frame-pointer"
+ ;;
+
+ # HP
+ hppa1.0*-*-*)
+ cclist="gcc c89 cc"
+ gmp_cflags_c89="$gmp_cflags_cc +O2"
+ gmp_cflags_cc="$gmp_cflags_cc +O2"
+ ;;
+ hppa2.0w*-*-*)
+ cclist="c89 cc"
+ gmp_cflags_c89="+DD64 +O3"
+ gmp_cflags_cc="+DD64 +O3"
+ ;;
+ hppa2.0*-*-*)
+ os_64bit=yes
+ cclist="gcc c89 cc"
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -mWHAT -D_LONG_LONG_LIMB"
+ # +O2 to cc triggers bug in mpz/powm.c (1.4)
+ gmp_cflags64_c89="+DA2.0 +e +O3 -D_LONG_LONG_LIMB"
+ gmp_cflags64_cc="+DA2.0 +e +O3 -D_LONG_LONG_LIMB"
+ gmp_cflags_c89="$gmp_cflags_cc +O2"
+ gmp_cflags_cc="$gmp_cflags_cc +O2"
+ ;;
+
+ # VAX
+ vax*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -fomit-frame-pointer"
+ ;;
+
+ # Fujitsu
+ [f30[01]-fujitsu-sysv*])
+ cclist="gcc vcc"
+ gmp_cflags_vcc="-g" # FIXME: flags for vcc?
+ ;;
+esac
+
+case "${target}" in
+ *-*-mingw32) gmp_cflags_gcc="$gmp_cflags_gcc -mno-cygwin";;
+esac
+
+dnl Check for programs needed by macros for finding compiler.
+dnl More programs are checked for below, when a compiler is found.
+AC_PROG_NM dnl Macro from Libtool.
+# nm on 64-bit AIX needs to know the object file format
+case "$target" in
+ powerpc64*-*-aix*)
+ NM="$NM -X 64"
+ ;;
+esac
+
+# Save CFLAGS given on command line.
+gmp_user_CFLAGS="$CFLAGS"
+
+if test -z "$CC"; then
+ # Find compiler.
+ GMP_PROG_CC_FIND($cclist, $os_64bit)
+
+ # If 64-bit OS and we have a 64-bit compiler, use it.
+ if test -n "$os_64bit" && test -n "$CC64"; then
+ CC=$CC64
+ CFLAGS=$CFLAGS64
+ else
+ eval CFLAGS=\$gmp_cflags_$CC
+ fi
+
+ # Try compiler flags that may work with only some compiler versions.
+ # gmp_optcflags: All or nothing.
+ eval optcflags=\$gmp_optcflags_$CC
+ if test -n "$optcflags"; then
+ CFLAGS_save="$CFLAGS"
+ CFLAGS="$CFLAGS $optcflags"
+ AC_MSG_CHECKING([whether $CC accepts $optcflags])
+ AC_LANG_C
+ AC_TRY_COMPILER([int main(){return(0);}], optok, cross)
+ if test "$optok" = "yes"; then
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ CFLAGS="$CFLAGS_save"
+ fi
+ fi
+ # gmp_xoptcflags: First is best, one has to work.
+ eval xoptcflags=\$gmp_xoptcflags_$CC
+ if test -n "$xoptcflags"; then
+ gmp_found="no"
+ for xopt in $xoptcflags; do
+ CFLAGS_save="$CFLAGS"
+ CFLAGS="$CFLAGS $xopt"
+ AC_MSG_CHECKING([whether $CC accepts $xopt])
+ AC_LANG_C
+ AC_TRY_COMPILER([int main(){return(0);}], optok, cross)
+ if test "$optok" = "yes"; then
+ AC_MSG_RESULT([yes])
+ gmp_found="yes"
+ break
+ else
+ AC_MSG_RESULT([no])
+ CFLAGS="$CFLAGS_save"
+ fi
+ done
+ if test "$gmp_found" = "no"; then
+ echo ["$0: fatal: need a compiler that understands one of $xoptcflags"]
+ exit 1
+ fi
+ fi
+fi
+
+# Restore CFLAGS given on command line.
+# FIXME: We've run through quite some unnecessary code looking for a
+# nice compiler and working flags for it, just to spoil that with user
+# supplied flags.
+test -n "$gmp_user_CFLAGS" && CFLAGS="$gmp_user_CFLAGS"
+
+# Select chosen compiler.
+GMP_PROG_CC_SELECT
+
+# How to assemble.
+CCAS="$CC -c"
+AC_SUBST(CCAS)
+
+dnl Checks for programs.
+dnl --------------------
+AC_PROG_CPP
+AC_PROG_INSTALL
+AC_PROG_LN_S
+GMP_PROG_M4
+AC_CHECK_PROG(AR, ar, ar)
+# ar on AIX needs to know the object file format
+case "$target" in
+ powerpc64*-*-aix*)
+ AR="$AR -X 64"
+ ;;
+esac
+dnl FIXME: Find good ld? /usr/ucb/ld on Solaris won't work.
+
+dnl Checks for assembly syntax.
+if test "$gmp_no_asm_syntax_testing" != "yes"; then
+ GMP_CHECK_ASM_TEXT
+ GMP_CHECK_ASM_DATA
+ GMP_CHECK_ASM_GLOBL
+ GMP_CHECK_ASM_LABEL_SUFFIX
+ GMP_CHECK_ASM_TYPE
+ GMP_CHECK_ASM_SIZE
+ GMP_CHECK_ASM_LSYM_PREFIX
+ GMP_CHECK_ASM_W32
+ GMP_CHECK_ASM_UNDERSCORE(underscore=yes, underscore=no)
+ GMP_CHECK_ASM_ALIGN_LOG(asm_align=log, asm_align=nolog)
+fi
+
+dnl FIXME: Check for FPU and set `floating_point' appropriately.
+
+dnl ========================================
+dnl Configuring mpn.
+dnl ----------------------------------------
+dnl Set the following target specific variables:
+dnl path where to search for source files
+dnl family processor family (Needed for building
+dnl asm-syntax.h for now. FIXME: Remove.)
+dnl extra_functions extra functions
+
+family=generic
+
+case ${target} in
+ arm*-*-*)
+ path="arm"
+ ;;
+ [sparcv9*-*-solaris2.[789]* | sparc64*-*-solaris2.[789]* | ultrasparc*-*-solaris2.[789]*])
+ if test -n "$CC64"
+ then path="sparc64"
+ else path="sparc32/v9 sparc32/v8 sparc32"
+ fi
+ ;;
+ sparc64-*-linux*)
+ if test -n "$CC64"
+ then path="sparc64"
+ else path="sparc32/v9 sparc32/v8 sparc32"
+ fi
+ ;;
+ sparcv8*-*-* | microsparc*-*-*)
+ path="sparc32/v8 sparc32"
+ if test x${floating_point} = xno
+ then extra_functions="udiv_nfp"
+ else extra_functions="udiv_fp"
+ fi
+ ;;
+ sparcv9*-*-* | ultrasparc*-*-*)
+ path="sparc32/v9 sparc32/v8 sparc32"
+ extra_functions="udiv_fp"
+ ;;
+ supersparc*-*-*)
+ path="sparc32/v8/supersparc sparc32/v8 sparc32"
+ extra_functions="udiv"
+ ;;
+ sparc*-*-*) path="sparc32"
+ if test x${floating_point} = xno
+ then extra_functions="udiv_nfp"
+ else extra_functions="udiv_fp"
+ fi
+ ;;
+ hppa7000*-*-*)
+ path="hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd"
+ ;;
+ hppa1.0*-*-*)
+ path="hppa"
+ extra_functions="udiv_qrnnd"
+ ;;
+ hppa2.0w-*-*)
+ path="pa64w"
+ extra_functions="umul_ppmm udiv_qrnnd"
+ ;;
+ hppa2.0*-*-*)
+ if test -n "$CC64"; then
+ path="pa64"
+ extra_functions="umul_ppmm udiv_qrnnd"
+ # We need to use the system compiler, or actually the system assembler,
+ # since GAS has not been ported to understand the 2.0 instructions.
+ CCAS="$CC64 -c"
+ else
+ # FIXME: path should be "hppa/hppa2_0 hppa/hppa1_1 hppa"
+ path="hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd"
+ fi
+ ;;
+ hppa*-*-*) #assume pa7100
+ path="hppa/hppa1_1/pa7100 hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd";;
+ [f30[01]-fujitsu-sysv*])
+ path=fujitsu;;
+ alphaev6*-*-*) path="alpha/ev6 alpha"; extra_functions="invert_limb cntlz";;
+ alphaev5*-*-*) path="alpha/ev5 alpha"; extra_functions="invert_limb cntlz";;
+ alpha*-*-*) path="alpha"; extra_functions="invert_limb cntlz";;
+ # Cray vector machines. This must come after alpha* so that we can
+ # recognize present and future vector processors with a wildcard.
+ *-cray-unicos*)
+ path="cray"
+ extra_functions="mulww";;
+ am29000*-*-*) path="a29k";;
+ a29k*-*-*) path="a29k";;
+
+ # AMD and Intel x86 configurations
+
+ [i?86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*])
+ gmp_m4postinc="x86/x86-defs.m4"
+ extra_functions="udiv umul"
+ CALLING_CONVENTIONS_OBJS="x86call.o x86check.o"
+
+ GMP_CHECK_ASM_SHLDL_CL(
+ [GMP_DEFINE(WANT_SHLDL_CL,1)],
+ [GMP_DEFINE(WANT_SHLDL_CL,0)])
+ GMP_CHECK_ASM_ALIGN_FILL_0x90
+
+ # the CPUs below wanting to know about mmx
+ case ${target} in
+ [pentiummmx-*-* | pentium[23]-*-* | k6*-*-* | athlon-*-*])
+ GMP_CHECK_ASM_MMX(tmp_mmx=yes, tmp_mmx=no)
+ ;;
+ esac
+
+ # default for anything not otherwise mentioned
+ path="x86"
+
+ case ${target} in
+ [i[34]86*-*-*])
+ path="x86"
+ ;;
+ k5*-*-*)
+ # don't know what best suits k5
+ path="x86"
+ ;;
+ i586*-*-* | pentium-*-*)
+ path="x86/pentium x86"
+ ;;
+ pentiummmx-*-*)
+ path="x86/pentium x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/pentium/mmx $path"
+ fi
+ ;;
+ i686*-*-* | pentiumpro-*-*)
+ path="x86/p6 x86"
+ ;;
+ pentium2-*-*)
+ path="x86/p6 x86"
+ # The pentium/mmx lshift and rshift are good on p6 and can be used
+ # until there's something specific for p6.
+ if test "$tmp_mmx" = yes; then
+ path="x86/p6/mmx x86/pentium/mmx $path"
+ fi
+ ;;
+ pentium3-*-*)
+ path="x86/p6 x86"
+ # The pentium/mmx lshift and rshift are good on p6 and can be used
+ # until there's something specific for p6.
+ if test "$tmp_mmx" = yes; then
+ path="x86/p6/p3mmx x86/p6/mmx x86/pentium/mmx $path"
+ fi
+ ;;
+ [k6[23]*-*-*])
+ path="x86/k6 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k6/k62mmx x86/k6/mmx $path"
+ fi
+ ;;
+ k6*-*-*)
+ path="x86/k6 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k6/mmx $path"
+ fi
+ ;;
+ athlon-*-*)
+ path="x86/k7 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k7/mmx $path"
+ fi
+ ;;
+ esac
+ ;;
+
+
+ i960*-*-*) path="i960";;
+
+ ia64*-*-*) path="ia64";;
+
+# Motorola 68k configurations. Let m68k mean 68020-68040.
+ [m680[234]0*-*-* | m68k*-*-* | \
+ m68*-next-nextstep*]) # Nexts are at least '020
+ path="m68k/mc68020 m68k"
+ family=m68k
+ ;;
+ m68000*-*-*)
+ path="m68k"
+ family=m68k
+ ;;
+
+ m88k*-*-* | m88k*-*-*) path="m88k";;
+ m88110*-*-*) path="m88k/mc88110 m88k";;
+ ns32k*-*-*) path="ns32k";;
+
+ pyramid-*-*) path="pyr";;
+
+ ppc601-*-*) path="power powerpc32";;
+ powerpc64*-*-*) path="powerpc64";;
+ powerpc*-*-*) path="powerpc32";;
+ rs6000-*-* | power-*-* | power2-*-*)
+ path="power"
+ extra_functions="udiv_w_sdiv"
+ ;;
+
+ sh-*-*) path="sh";;
+ sh2-*-*) path="sh/sh2 sh";;
+
+ [mips[34]*-*-*]) path="mips3";;
+ mips*-*-irix6*) path="mips3";;
+ mips*-*-*) path="mips2";;
+
+ vax*-*-*) path="vax"; extra_functions="udiv_w_sdiv";;
+
+ z8000x*-*-*) path="z8000x"; extra_functions="udiv_w_sdiv";;
+ z8000*-*-*) path="z8000"; extra_functions="udiv_w_sdiv";;
+
+ clipper*-*-*) path="clipper";;
+esac
+
+AC_SUBST(CALLING_CONVENTIONS_OBJS)
+if test -n "$CALLING_CONVENTIONS_OBJS"; then
+ AC_DEFINE(HAVE_CALLING_CONVENTIONS,1,
+ [Define if mpn/tests has calling conventions checking for the CPU])
+fi
+
+
+case ${target} in
+ [i[5-8]86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*])
+ # rdtsc is in pentium and up, not in i386 and i486
+ SPEED_CYCLECOUNTER_OBJS=pentium.lo
+ ;;
+ alpha*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=alpha.lo
+ ;;
+ sparcv9*-*-* | ultrasparc*-*-* | sparc64*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=sparcv9.lo
+ ;;
+ hppa2*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=hppa2.lo
+ ;;
+ hppa*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=hppa.lo
+ ;;
+esac
+
+AC_SUBST(SPEED_CYCLECOUNTER_OBJS)
+
+if test -n "$SPEED_CYCLECOUNTER_OBJS"
+then
+ AC_DEFINE(HAVE_SPEED_CYCLECOUNTER, 1,
+ [Define if a speed_cyclecounter exists (for the tune programs)])
+fi
+
+
+dnl Extensions for executable and object files.
+dnl -------------------------------------------
+AC_EXEEXT
+AC_OBJEXT
+
+dnl Use Libtool.
+dnl ------------
+dnl FIXME: Shared libs seem to fail on aix4.3.
+dnl FIXME: Should invoke [AC_DISABLE_SHARED], but m4 recurses to death.
+case "$target" in
+ [*-*-aix4.[3-9]*]) enable_shared=no ;;
+esac
+AC_PROG_LIBTOOL
+
+dnl Checks for libraries.
+dnl ---------------------
+AC_CHECK_DECLS((optarg))
+
+dnl Checks for header files.
+dnl ------------------------
+AC_HEADER_STDC
+AC_CHECK_HEADERS(getopt.h unistd.h sys/sysctl.h sys/time.h)
+
+dnl Checks for typedefs, structures, and compiler characteristics.
+dnl --------------------------------------------------------------
+AC_CHECK_TYPES((void))
+AC_C_STRINGIZE
+
+dnl Checks for library functions.
+dnl -----------------------------
+dnl Most of these are only for the benefit of supplementary programs. The
+dnl library itself doesn't use anything weird.
+dnl AC_FUNC_MEMCMP
+dnl AC_TYPE_SIGNAL
+dnl AC_CHECK_FUNCS(strtol)
+AC_CHECK_FUNCS(getopt_long getpagesize popen processor_info strtoul sysconf sysctlbyname)
+
+dnl Trick automake into thinking we've run AM_C_PROTOTYPES which it wants
+dnl for ansi2knr, and instead use our own test. (It's only a warning
+dnl automake prints, but it's good to suppress it.)
+ifelse(0,1,[
+AM_C_PROTOTYPES
+])
+GMP_C_ANSI2KNR
+
+
+dnl Set `syntax' to one of <blank>, "mit", "elf", "aix", "macho".
+syntax=
+# For now, we use the old switch for setting syntax.
+# FIXME: Remove when conversion to .asm is completed.
+changequote(,)dnl
+case "${target}" in
+ m680[234]0*-*-linuxaout* | m68k*-*-linuxaout* | \
+ m68k-next-nextstep* | \
+ m68000*-*-*)
+ syntax=mit
+ ;;
+ m680[234]0*-*-linux* | m68k*-*-linux*)
+ syntax=elf
+ ;;
+ m680[234]0*-*-* | m68k*-*-*)
+ syntax=mit
+ ;;
+esac
+changequote([,])dnl
+
+dnl ----------------------------------------
+# Now build an asm-syntax.h file for targets that include that from the
+# assembly files.
+# FIXME: Remove when conversion to .asm is completed.
+case "${family}-${underscore}-${asm_align}-${syntax}" in
+ m68k-yes-log-mit)
+ echo '#define MIT_SYNTAX' >asm-syntax.h
+ cat $srcdir/mpn/underscore.h >>asm-syntax.h
+ echo '#include "'$srcdir'/mpn/m68k/syntax.h"' >>asm-syntax.h;;
+ m68k-no-nolog-elf)
+ echo '#define ELF_SYNTAX' >asm-syntax.h
+ echo '#define C_SYMBOL_NAME(name) name' >>asm-syntax.h
+ echo '#include "'$srcdir'/mpn/m68k/syntax.h"' >>asm-syntax.h;;
+esac
+
+
+# The pattern here tests for an absolute path the same way as
+# _AC_OUTPUT_FILES in autoconf acgeneral.m4.
+GMP_DEFINE_RAW(["dnl CONFIG_TOP_SRCDIR is a path from the mpn builddir to the top srcdir"])
+case "$srcdir" in
+[[\\/]]* | ?:[[\\/]]* )
+ GMP_DEFINE_RAW(["define(<CONFIG_TOP_SRCDIR>,<\`$srcdir'>)"]) ;;
+*) GMP_DEFINE_RAW(["define(<CONFIG_TOP_SRCDIR>,<\`../$srcdir'>)"]) ;;
+esac
+
+GMP_DEFINE_RAW(["include(CONFIG_TOP_SRCDIR\`/mpn/asm-defs.m4')"], POST)
+
+# Must be after asm-defs.m4
+GMP_DEFINE_RAW("define_not_for_expansion(\`HAVE_TARGET_CPU_$target_cpu')", POST)
+
+
+dnl config.m4 post-includes
+dnl -----------------------
+dnl (Note x86 post include set with $path above.)
+changequote(,)dnl
+case "$target" in
+ alpha*-cray-unicos*)
+ gmp_m4postinc="alpha/unicos.m4"
+ ;;
+ alpha*-*-*)
+ gmp_m4postinc="alpha/default.m4"
+ ;;
+ power*-*-*)
+ case "$target" in
+ *-*-mach* | *-*-rhapsody* | *-*-nextstep* | *-*-darwin* | *-*-macosx*)
+ ;; # these use non-conventional assembly syntax.
+ powerpc64-*-aix*)
+ gmp_m4postinc="powerpc32/regmap.m4 powerpc64/aix.m4"
+ ;;
+ *-*-aix*)
+ gmp_m4postinc="powerpc32/regmap.m4 powerpc32/aix.m4"
+ ;;
+ *)
+ gmp_m4postinc="powerpc32/regmap.m4"
+ ;;
+ esac
+ ;;
+esac
+changequote([, ])dnl
+
+for tmp_f in $gmp_m4postinc; do
+ GMP_DEFINE_RAW(["include_mpn(\`$tmp_f')"], POST)
+done
+
+
+# Set up `gmp_links'. It's a list of link:file pairs that configure will
+# process to create link -> file.
+gmp_links=
+
+# If the user specified `MPN_PATH', use that instead of the path we've
+# come up with.
+if test -z "$MPN_PATH"; then
+ path="$path generic"
+else
+ path="$MPN_PATH"
+fi
+
+# Pick the correct source files in $path and link them to mpn/.
+# $gmp_mpn_functions lists all functions we need.
+#
+# The rule is to find a file with the function name and a .asm, .S,
+# .s, or .c extension. Certain multi-function files with special names
+# can provide some functions too. (mpn/Makefile.am passes
+# -DOPERATION_<func> to get them to generate the right code.)
+
+# FIXME: udiv and umul aren't in $gmp_mpn_functions_optional yet since
+# there's some versions of those files which should be checked for bit
+# rot first. Put them in $extra_functions for each target for now,
+# change to standard optionals when all are ready.
+
+# Note: The following lines defining $gmp_mpn_functions_optional
+# and $gmp_mpn_functions are parsed by the "macos/configure"
+# Perl script. So if you change the lines in a major way
+# make sure to run and examine the output from
+#
+# % (cd macos; perl configure)
+
+gmp_mpn_functions_optional="copyi copyd com_n \
+ and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n"
+
+gmp_mpn_functions="${extra_functions} inlines add_n sub_n mul_1 addmul_1 \
+ submul_1 lshift rshift diveby3 divrem divrem_1 divrem_2 \
+ mod_1 mod_1_rs pre_mod_1 dump \
+ mul mul_fft mul_n mul_basecase sqr_basecase random \
+ random2 sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
+ bdivmod gcd_1 gcd gcdext tdiv_qr bz_divrem_n sb_divrem_mn jacbase \
+ $gmp_mpn_functions_optional"
+
+# the list of all object files used by mpn/Makefile.in and the
+# top-level Makefile.in, respectively
+mpn_objects=
+mpn_objs_in_libgmp="mpn/mp_bases.lo"
+
+for tmp_fn in ${gmp_mpn_functions} ; do
+ [rm -f mpn/${tmp_fn}.[Ssc] mpn/${tmp_fn}.asm]
+
+ # functions that can be provided by multi-function files
+ tmp_mulfunc=
+ case $tmp_fn in
+ add_n|sub_n) tmp_mulfunc="aors_n" ;;
+ addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
+ popcount|hamdist) tmp_mulfunc="popham" ;;
+ and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
+ tmp_mulfunc="logops_n" ;;
+ esac
+
+ found=no
+ for tmp_dir in $path; do
+ for tmp_base in $tmp_fn $tmp_mulfunc; do
+ for tmp_ext in asm S s c; do
+ tmp_file=$srcdir/mpn/$tmp_dir/$tmp_base.$tmp_ext
+ if test -f $tmp_file; then
+ found=yes
+
+ mpn_objects="$mpn_objects ${tmp_fn}.lo"
+ mpn_objs_in_libgmp="$mpn_objs_in_libgmp mpn/${tmp_fn}.lo"
+ gmp_links="$gmp_links mpn/$tmp_fn.$tmp_ext:mpn/$tmp_dir/$tmp_base.$tmp_ext"
+
+ # duplicate AC_DEFINEs are harmless, so it doesn't matter
+ # that multi-function files get grepped here repeatedly
+ gmp_ep=["`
+ sed -n 's/^[ ]*MULFUNC_PROLOGUE(\(.*\))/\1/p' $tmp_file ;
+ sed -n 's/^[ ]*PROLOGUE.*(\(.*\))/\1/p' $tmp_file
+ `"]
+ for gmp_tmp in $gmp_ep; do
+ AC_DEFINE_UNQUOTED(HAVE_NATIVE_${gmp_tmp})
+ done
+
+ break
+ fi
+ done
+ if test $found = yes; then break ; fi
+ done
+ if test $found = yes; then break ; fi
+ done
+
+ if test $found = no; then
+ for tmp_optional in $gmp_mpn_functions_optional; do
+ if test $tmp_optional = $tmp_fn; then
+ found=yes
+ fi
+ done
+ if test $found = no; then
+ AC_MSG_ERROR([no version of $tmp_fn found in path: $path])
+ fi
+ fi
+done
+
+# Create link for gmp-mparam.h.
+for tmp_dir in $path ; do
+ rm -f gmp-mparam.h
+ if test -f $srcdir/mpn/${tmp_dir}/gmp-mparam.h ; then
+ gmp_links="$gmp_links gmp-mparam.h:mpn/${tmp_dir}/gmp-mparam.h"
+
+ # Copy any KARATSUBA_SQR_THRESHOLD in gmp-mparam.h to config.m4.
+ # Some versions of sqr_basecase.asm use this.
+ tmp_gmp_karatsuba_sqr_threshold="`sed -n 's/^#define KARATSUBA_SQR_THRESHOLD[ ]*\([0-9][0-9]*\).*$/\1/p' $srcdir/mpn/${tmp_dir}/gmp-mparam.h`"
+ if test -n "$tmp_gmp_karatsuba_sqr_threshold"; then
+ GMP_DEFINE_RAW(["define(<KARATSUBA_SQR_THRESHOLD>,<$tmp_gmp_karatsuba_sqr_threshold>)"])
+ fi
+
+ break
+ fi
+done
+
+# Dig out the links from `gmp_links' for inclusion in DISTCLEANFILES.
+gmp_srclinks=
+for f in $gmp_links; do
+ gmp_srclinks="$gmp_srclinks `echo $f | sed 's/\(.*\):.*/\1/'`"
+done
+
+AC_SUBST(mpn_objects)
+AC_SUBST(mpn_objs_in_libgmp)
+AC_SUBST(gmp_srclinks)
+
+dnl ----------------------------------------
+dnl Make links.
+AC_CONFIG_LINKS($gmp_links)
+
+dnl Create config.m4.
+GMP_FINISH
+
+dnl Create Makefiles
+dnl FIXME: Upcoming version of autoconf/automake may not like broken lines.
+AC_OUTPUT(Makefile mpz/Makefile mpn/Makefile)
diff --git a/rts/gmp/depcomp b/rts/gmp/depcomp
new file mode 100644
index 0000000000..7906096738
--- /dev/null
+++ b/rts/gmp/depcomp
@@ -0,0 +1,269 @@
+#! /bin/sh
+
+# depcomp - compile a program generating dependencies as side-effects
+# Copyright (C) 1999 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+
+# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>.
+
+if test -z "$depmode" || test -z "$source" || test -z "$object"; then
+ echo "depcomp: Variables source, object and depmode must be set" 1>&2
+ exit 1
+fi
+# `libtool' can also be set to `yes' or `no'.
+
+depfile=${depfile-`echo "$object" | sed 's,\([^/]*\)$,.deps/\1,;s/\.\([^.]*\)$/.P\1/'`}
+tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
+
+rm -f "$tmpdepfile"
+
+# Some modes work just like other modes, but use different flags. We
+# parameterize here, but still list the modes in the big case below,
+# to make depend.m4 easier to write. Note that we *cannot* use a case
+# here, because this file can only contain one case statement.
+if test "$depmode" = hp; then
+ # HP compiler uses -M and no extra arg.
+ gccflag=-M
+ depmode=gcc
+fi
+
+if test "$depmode" = dashXmstdout; then
+ # This is just like dashmstdout with a different argument.
+ dashmflag=-xM
+ depmode=dashmstdout
+fi
+
+case "$depmode" in
+gcc)
+## There are various ways to get dependency output from gcc. Here's
+## why we pick this rather obscure method:
+## - Don't want to use -MD because we'd like the dependencies to end
+## up in a subdir. Having to rename by hand is ugly.
+## (We might end up doing this anyway to support other compilers.)
+## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
+## -MM, not -M (despite what the docs say).
+## - Using -M directly means running the compiler twice (even worse
+## than renaming).
+ if test -z "$gccflag"; then
+ gccflag=-MD,
+ fi
+ if "$@" -Wp,"$gccflag$tmpdepfile"; then :
+ else
+ stat=$?
+ rm -f "$tmpdepfile"
+ exit $stat
+ fi
+ rm -f "$depfile"
+ echo "$object : \\" > "$depfile"
+ sed 's/^[^:]*: / /' < "$tmpdepfile" >> "$depfile"
+## This next piece of magic avoids the `deleted header file' problem.
+## The problem is that when a header file which appears in a .P file
+## is deleted, the dependency causes make to die (because there is
+## typically no way to rebuild the header). We avoid this by adding
+## dummy dependencies for each header file. Too bad gcc doesn't do
+## this for us directly.
+ tr ' ' '
+' < "$tmpdepfile" |
+## Some versions of gcc put a space before the `:'. On the theory
+## that the space means something, we add a space to the output as
+## well.
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly. Breaking it into two sed invocations is a workaround.
+ sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+ rm -f "$tmpdepfile"
+ ;;
+
+hp)
+ # This case exists only to let depend.m4 do its work. It works by
+ # looking at the text of this script. This case will never be run,
+ # since it is checked for above.
+ exit 1
+ ;;
+
+dashmd)
+ # The Java front end to gcc doesn't run cpp, so we can't use the -Wp
+ # trick. Instead we must use -M and then rename the resulting .d
+ # file. This is also the case for older versions of gcc, which
+ # don't implement -Wp.
+ if "$@" -MD; then :
+ else
+ stat=$?
+ rm -f FIXME
+ exit $stat
+ fi
+ FIXME: rewrite the file
+ ;;
+
+sgi)
+ if test "$libtool" = yes; then
+ "$@" "-Wc,-MDupdate,$tmpdepfile"
+ else
+ "$@" -MDupdate "$tmpdepfile"
+ fi
+ stat=$?
+ if test $stat -eq 0; then :
+ else
+ stat=$?
+ rm -f "$tmpdepfile"
+ exit $stat
+ fi
+ rm -f "$depfile"
+ echo "$object : \\" > "$depfile"
+ sed 's/^[^:]*: / /' < "$tmpdepfile" >> "$depfile"
+ tr ' ' '
+' < "$tmpdepfile" | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly. Breaking it into two sed invocations is a workaround.
+ sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+ rm -f "$tmpdepfile"
+ ;;
+
+#nosideeffect)
+ # This comment above is used by automake to tell side-effect
+ # dependency tracking mechanisms from slower ones.
+
+dashmstdout)
+ # Important note: in order to support this mode, a compiler *must*
+ # always write the proprocessed file to stdout, regardless of -o,
+ # because we must use -o when running libtool.
+ test -z "$dashmflag" && dashmflag=-M
+ ( IFS=" "
+ case " $* " in
+ *" --mode=compile "*) # this is libtool, let us make it quiet
+ for arg
+ do # cycle over the arguments
+ case "$arg" in
+ "--mode=compile")
+ # insert --quiet before "--mode=compile"
+ set fnord "$@" --quiet
+ shift # fnord
+ ;;
+ esac
+ set fnord "$@" "$arg"
+ shift # fnord
+ shift # "$arg"
+ done
+ ;;
+ esac
+ "$@" $dashmflag | sed 's:^[^:]*\:[ ]*:'"$object"'\: :' > "$tmpdepfile"
+ ) &
+ proc=$!
+ "$@"
+ stat=$?
+ wait "$proc"
+ if test "$stat" != 0; then exit $stat; fi
+ rm -f "$depfile"
+ cat < "$tmpdepfile" > "$depfile"
+ tr ' ' '
+' < "$tmpdepfile" | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly. Breaking it into two sed invocations is a workaround.
+ sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+ rm -f "$tmpdepfile"
+ ;;
+
+dashXmstdout)
+ # This case only exists to satisfy depend.m4. It is never actually
+ # run, as this mode is specially recognized in the preamble.
+ exit 1
+ ;;
+
+makedepend)
+ # X makedepend
+ (
+ shift
+ cleared=no
+ for arg in "$@"; do
+ case $cleared in no)
+ set ""; shift
+ cleared=yes
+ esac
+ case "$arg" in
+ -D*|-I*)
+ set fnord "$@" "$arg"; shift;;
+ -*)
+ ;;
+ *)
+ set fnord "$@" "$arg"; shift;;
+ esac
+ done
+ obj_suffix="`echo $object | sed 's/^.*\././'`"
+ touch "$tmpdepfile"
+ ${MAKEDEPEND-makedepend} 2>/dev/null -o"$obj_suffix" -f"$tmpdepfile" "$@"
+ ) &
+ proc=$!
+ "$@"
+ stat=$?
+ wait "$proc"
+ if test "$stat" != 0; then exit $stat; fi
+ rm -f "$depfile"
+ cat < "$tmpdepfile" > "$depfile"
+ tail +3 "$tmpdepfile" | tr ' ' '
+' | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly. Breaking it into two sed invocations is a workaround.
+ sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+ rm -f "$tmpdepfile" "$tmpdepfile".bak
+ ;;
+
+cpp)
+ # Important note: in order to support this mode, a compiler *must*
+ # always write the proprocessed file to stdout, regardless of -o,
+ # because we must use -o when running libtool.
+ ( IFS=" "
+ case " $* " in
+ *" --mode=compile "*)
+ for arg
+ do # cycle over the arguments
+ case "$arg" in
+ "--mode=compile")
+ # insert --quiet before "--mode=compile"
+ set fnord "$@" --quiet
+ shift # fnord
+ ;;
+ esac
+ set fnord "$@" "$arg"
+ shift # fnord
+ shift # "$arg"
+ done
+ ;;
+ esac
+ "$@" -E |
+ sed -n '/^# [0-9][0-9]* "\([^"]*\)"/ s::'"$object"'\: \1:p' > "$tmpdepfile"
+ ) &
+ proc=$!
+ "$@"
+ stat=$?
+ wait "$proc"
+ if test "$stat" != 0; then exit $stat; fi
+ rm -f "$depfile"
+ cat < "$tmpdepfile" > "$depfile"
+ sed < "$tmpdepfile" -e 's/^[^:]*: //' -e 's/$/ :/' >> "$depfile"
+ rm -f "$tmpdepfile"
+ ;;
+
+none)
+ exec "$@"
+ ;;
+
+*)
+ echo "Unknown depmode $depmode" 1>&2
+ exit 1
+ ;;
+esac
+
+exit 0
diff --git a/rts/gmp/errno.c b/rts/gmp/errno.c
new file mode 100644
index 0000000000..7dd223c19c
--- /dev/null
+++ b/rts/gmp/errno.c
@@ -0,0 +1,26 @@
+/* gmp_errno -- The largest and most complex file in GMP.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int gmp_errno = 0;
diff --git a/rts/gmp/extract-dbl.c b/rts/gmp/extract-dbl.c
new file mode 100644
index 0000000000..2d70d9a3b2
--- /dev/null
+++ b/rts/gmp/extract-dbl.c
@@ -0,0 +1,187 @@
+/* __gmp_extract_double -- convert from double to array of mp_limb_t.
+
+Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifdef XDEBUG
+#undef _GMP_IEEE_FLOATS
+#endif
+
+#ifndef _GMP_IEEE_FLOATS
+#define _GMP_IEEE_FLOATS 0
+#endif
+
+/* Extract a non-negative double in d. */
+
+int
+#if __STDC__
+__gmp_extract_double (mp_ptr rp, double d)
+#else
+__gmp_extract_double (rp, d)
+ mp_ptr rp;
+ double d;
+#endif
+{
+ long exp;
+ unsigned sc;
+ mp_limb_t manh, manl;
+
+ /* BUGS
+
+ 1. Should handle Inf and NaN in IEEE specific code.
+ 2. Handle Inf and NaN also in default code, to avoid hangs.
+ 3. Generalize to handle all BITS_PER_MP_LIMB >= 32.
+ 4. This lits is incomplete and misspelled.
+ */
+
+ if (d == 0.0)
+ {
+ rp[0] = 0;
+ rp[1] = 0;
+#if BITS_PER_MP_LIMB == 32
+ rp[2] = 0;
+#endif
+ return 0;
+ }
+
+#if _GMP_IEEE_FLOATS
+ {
+#if defined (__alpha) && __GNUC__ == 2 && __GNUC_MINOR__ == 8
+ /* Work around alpha-specific bug in GCC 2.8.x. */
+ volatile
+#endif
+ union ieee_double_extract x;
+ x.d = d;
+ exp = x.s.exp;
+#if BITS_PER_MP_LIMB == 64
+ manl = (((mp_limb_t) 1 << 63)
+ | ((mp_limb_t) x.s.manh << 43) | ((mp_limb_t) x.s.manl << 11));
+ if (exp == 0)
+ {
+ /* Denormalized number. Don't try to be clever about this,
+ since it is not an important case to make fast. */
+ exp = 1;
+ do
+ {
+ manl = manl << 1;
+ exp--;
+ }
+ while ((mp_limb_signed_t) manl >= 0);
+ }
+#else
+ manh = ((mp_limb_t) 1 << 31) | (x.s.manh << 11) | (x.s.manl >> 21);
+ manl = x.s.manl << 11;
+ if (exp == 0)
+ {
+ /* Denormalized number. Don't try to be clever about this,
+ since it is not an important case to make fast. */
+ exp = 1;
+ do
+ {
+ manh = (manh << 1) | (manl >> 31);
+ manl = manl << 1;
+ exp--;
+ }
+ while ((mp_limb_signed_t) manh >= 0);
+ }
+#endif
+ exp -= 1022; /* Remove IEEE bias. */
+ }
+#else
+ {
+ /* Unknown (or known to be non-IEEE) double format. */
+ exp = 0;
+ if (d >= 1.0)
+ {
+ if (d * 0.5 == d)
+ abort ();
+
+ while (d >= 32768.0)
+ {
+ d *= (1.0 / 65536.0);
+ exp += 16;
+ }
+ while (d >= 1.0)
+ {
+ d *= 0.5;
+ exp += 1;
+ }
+ }
+ else if (d < 0.5)
+ {
+ while (d < (1.0 / 65536.0))
+ {
+ d *= 65536.0;
+ exp -= 16;
+ }
+ while (d < 0.5)
+ {
+ d *= 2.0;
+ exp -= 1;
+ }
+ }
+
+ d *= MP_BASE_AS_DOUBLE;
+#if BITS_PER_MP_LIMB == 64
+ manl = d;
+#else
+ manh = d;
+ manl = (d - manh) * MP_BASE_AS_DOUBLE;
+#endif
+ }
+#endif
+
+ sc = (unsigned) exp % BITS_PER_MP_LIMB;
+
+ /* We add something here to get rounding right. */
+ exp = (exp + 2048) / BITS_PER_MP_LIMB - 2048 / BITS_PER_MP_LIMB + 1;
+
+#if BITS_PER_MP_LIMB == 64
+ if (sc != 0)
+ {
+ rp[1] = manl >> (BITS_PER_MP_LIMB - sc);
+ rp[0] = manl << sc;
+ }
+ else
+ {
+ rp[1] = manl;
+ rp[0] = 0;
+ exp--;
+ }
+#else
+ if (sc != 0)
+ {
+ rp[2] = manh >> (BITS_PER_MP_LIMB - sc);
+ rp[1] = (manl >> (BITS_PER_MP_LIMB - sc)) | (manh << sc);
+ rp[0] = manl << sc;
+ }
+ else
+ {
+ rp[2] = manh;
+ rp[1] = manl;
+ rp[0] = 0;
+ exp--;
+ }
+#endif
+
+ return exp;
+}
diff --git a/rts/gmp/gmp-impl.h b/rts/gmp/gmp-impl.h
new file mode 100644
index 0000000000..3c7ac26e7d
--- /dev/null
+++ b/rts/gmp/gmp-impl.h
@@ -0,0 +1,1072 @@
+/* Include file for internal GNU MP types and definitions.
+
+ THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND ARE ALMOST CERTAIN TO
+ BE SUBJECT TO INCOMPATIBLE CHANGES IN FUTURE GNU MP RELEASES.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "config.h"
+#include "gmp-mparam.h"
+/* #include "longlong.h" */
+
+/* When using gcc, make sure to use its builtin alloca. */
+#if ! defined (alloca) && defined (__GNUC__)
+#define alloca __builtin_alloca
+#define HAVE_ALLOCA 1
+#endif
+
+/* When using cc, do whatever necessary to allow use of alloca. For many
+ machines, this means including alloca.h. IBM's compilers need a #pragma
+ in "each module that needs to use alloca". */
+#if ! defined (alloca)
+/* We need lots of variants for MIPS, to cover all versions and perversions
+ of OSes for MIPS. */
+#if defined (__mips) || defined (MIPSEL) || defined (MIPSEB) \
+ || defined (_MIPSEL) || defined (_MIPSEB) || defined (__sgi) \
+ || defined (__alpha) || defined (__sparc) || defined (sparc) \
+ || defined (__ksr__)
+#include <alloca.h>
+#define HAVE_ALLOCA
+#endif
+#if defined (_IBMR2)
+#pragma alloca
+#define HAVE_ALLOCA
+#endif
+#if defined (__DECC)
+#define alloca(x) __ALLOCA(x)
+#define HAVE_ALLOCA
+#endif
+#endif
+
+#if defined (alloca)
+# ifndef HAVE_ALLOCA
+#define HAVE_ALLOCA
+# endif
+#endif
+
+#if ! defined (HAVE_ALLOCA) || USE_STACK_ALLOC
+#include "stack-alloc.h"
+#else
+#define TMP_DECL(m)
+#define TMP_ALLOC(x) alloca(x)
+#define TMP_MARK(m)
+#define TMP_FREE(m)
+#endif
+
+/* Allocating various types. */
+#define TMP_ALLOC_TYPE(n,type) ((type *) TMP_ALLOC ((n) * sizeof (type)))
+#define TMP_ALLOC_LIMBS(n) TMP_ALLOC_TYPE(n,mp_limb_t)
+#define TMP_ALLOC_MP_PTRS(n) TMP_ALLOC_TYPE(n,mp_ptr)
+
+
+#if ! defined (__GNUC__) /* FIXME: Test for C++ compilers here,
+ __DECC understands __inline */
+#define inline /* Empty */
+#endif
+
+#define ABS(x) (x >= 0 ? x : -x)
+#define MIN(l,o) ((l) < (o) ? (l) : (o))
+#define MAX(h,i) ((h) > (i) ? (h) : (i))
+#define numberof(x) (sizeof (x) / sizeof ((x)[0]))
+
+/* Field access macros. */
+#define SIZ(x) ((x)->_mp_size)
+#define ABSIZ(x) ABS (SIZ (x))
+#define PTR(x) ((x)->_mp_d)
+#define LIMBS(x) ((x)->_mp_d)
+#define EXP(x) ((x)->_mp_exp)
+#define PREC(x) ((x)->_mp_prec)
+#define ALLOC(x) ((x)->_mp_alloc)
+
+/* Extra casts because shorts are promoted to ints by "~" and "<<". "-1"
+ rather than "1" in SIGNED_TYPE_MIN avoids warnings from some compilers
+ about arithmetic overflow. */
+#define UNSIGNED_TYPE_MAX(type) ((type) ~ (type) 0)
+#define UNSIGNED_TYPE_HIGHBIT(type) ((type) ~ (UNSIGNED_TYPE_MAX(type) >> 1))
+#define SIGNED_TYPE_MIN(type) (((type) -1) << (8*sizeof(type)-1))
+#define SIGNED_TYPE_MAX(type) ((type) ~ SIGNED_TYPE_MIN(type))
+#define SIGNED_TYPE_HIGHBIT(type) SIGNED_TYPE_MIN(type)
+
+#define MP_LIMB_T_MAX UNSIGNED_TYPE_MAX (mp_limb_t)
+#define MP_LIMB_T_HIGHBIT UNSIGNED_TYPE_HIGHBIT (mp_limb_t)
+
+#define MP_SIZE_T_MAX SIGNED_TYPE_MAX (mp_size_t)
+
+#ifndef ULONG_MAX
+#define ULONG_MAX UNSIGNED_TYPE_MAX (unsigned long)
+#endif
+#define ULONG_HIGHBIT UNSIGNED_TYPE_HIGHBIT (unsigned long)
+#define LONG_HIGHBIT SIGNED_TYPE_HIGHBIT (long)
+#ifndef LONG_MAX
+#define LONG_MAX SIGNED_TYPE_MAX (long)
+#endif
+
+#ifndef USHORT_MAX
+#define USHORT_MAX UNSIGNED_TYPE_MAX (unsigned short)
+#endif
+#define USHORT_HIGHBIT UNSIGNED_TYPE_HIGHBIT (unsigned short)
+#define SHORT_HIGHBIT SIGNED_TYPE_HIGHBIT (short)
+#ifndef SHORT_MAX
+#define SHORT_MAX SIGNED_TYPE_MAX (short)
+#endif
+
+
+/* Swap macros. */
+
+#define MP_LIMB_T_SWAP(x, y) \
+ do { \
+ mp_limb_t __mp_limb_t_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mp_limb_t_swap__tmp; \
+ } while (0)
+#define MP_SIZE_T_SWAP(x, y) \
+ do { \
+ mp_size_t __mp_size_t_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mp_size_t_swap__tmp; \
+ } while (0)
+
+#define MP_PTR_SWAP(x, y) \
+ do { \
+ mp_ptr __mp_ptr_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mp_ptr_swap__tmp; \
+ } while (0)
+#define MP_SRCPTR_SWAP(x, y) \
+ do { \
+ mp_srcptr __mp_srcptr_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mp_srcptr_swap__tmp; \
+ } while (0)
+
+#define MPN_PTR_SWAP(xp,xs, yp,ys) \
+ do { \
+ MP_PTR_SWAP (xp, yp); \
+ MP_SIZE_T_SWAP (xs, ys); \
+ } while(0)
+#define MPN_SRCPTR_SWAP(xp,xs, yp,ys) \
+ do { \
+ MP_SRCPTR_SWAP (xp, yp); \
+ MP_SIZE_T_SWAP (xs, ys); \
+ } while(0)
+
+#define MPZ_PTR_SWAP(x, y) \
+ do { \
+ mpz_ptr __mpz_ptr_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mpz_ptr_swap__tmp; \
+ } while (0)
+#define MPZ_SRCPTR_SWAP(x, y) \
+ do { \
+ mpz_srcptr __mpz_srcptr_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mpz_srcptr_swap__tmp; \
+ } while (0)
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* FIXME: These are purely internal, so do a search and replace to change
+ them to __gmp forms, rather than using these macros. */
+#define _mp_allocate_func __gmp_allocate_func
+#define _mp_reallocate_func __gmp_reallocate_func
+#define _mp_free_func __gmp_free_func
+#define _mp_default_allocate __gmp_default_allocate
+#define _mp_default_reallocate __gmp_default_reallocate
+#define _mp_default_free __gmp_default_free
+
+extern void * (*_mp_allocate_func) _PROTO ((size_t));
+extern void * (*_mp_reallocate_func) _PROTO ((void *, size_t, size_t));
+extern void (*_mp_free_func) _PROTO ((void *, size_t));
+
+void *_mp_default_allocate _PROTO ((size_t));
+void *_mp_default_reallocate _PROTO ((void *, size_t, size_t));
+void _mp_default_free _PROTO ((void *, size_t));
+
+#define _MP_ALLOCATE_FUNC_TYPE(n,type) \
+ ((type *) (*_mp_allocate_func) ((n) * sizeof (type)))
+#define _MP_ALLOCATE_FUNC_LIMBS(n) _MP_ALLOCATE_FUNC_TYPE(n,mp_limb_t)
+
+#define _MP_FREE_FUNC_TYPE(p,n,type) (*_mp_free_func) (p, (n) * sizeof (type))
+#define _MP_FREE_FUNC_LIMBS(p,n) _MP_FREE_FUNC_TYPE(p,n,mp_limb_t)
+
+
+#if (__STDC__-0) || defined (__cplusplus)
+
+#else
+
+#define const /* Empty */
+#define signed /* Empty */
+
+#endif
+
+#if defined (__GNUC__) && defined (__i386__)
+#if 0 /* check that these actually improve things */
+#define MPN_COPY_INCR(DST, SRC, N) \
+ __asm__ ("cld\n\trep\n\tmovsl" : : \
+ "D" (DST), "S" (SRC), "c" (N) : \
+ "cx", "di", "si", "memory")
+#define MPN_COPY_DECR(DST, SRC, N) \
+ __asm__ ("std\n\trep\n\tmovsl" : : \
+ "D" ((DST) + (N) - 1), "S" ((SRC) + (N) - 1), "c" (N) : \
+ "cx", "di", "si", "memory")
+#define MPN_NORMALIZE_NOT_ZERO(P, N) \
+ do { \
+ __asm__ ("std\n\trepe\n\tscasl" : "=c" (N) : \
+ "a" (0), "D" ((P) + (N) - 1), "0" (N) : \
+ "cx", "di"); \
+ (N)++; \
+ } while (0)
+#endif
+#endif
+
+#if HAVE_NATIVE_mpn_copyi
+#define mpn_copyi __MPN(copyi)
+void mpn_copyi _PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#endif
+
+/* Remap names of internal mpn functions. */
+#define __clz_tab __MPN(clz_tab)
+#define mpn_udiv_w_sdiv __MPN(udiv_w_sdiv)
+#define mpn_reciprocal __MPN(reciprocal)
+
+#define mpn_sb_divrem_mn __MPN(sb_divrem_mn)
+#define mpn_bz_divrem_n __MPN(bz_divrem_n)
+/* #define mpn_tdiv_q __MPN(tdiv_q) */
+
+#define mpn_kara_mul_n __MPN(kara_mul_n)
+void mpn_kara_mul_n _PROTO((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_ptr));
+
+#define mpn_kara_sqr_n __MPN(kara_sqr_n)
+void mpn_kara_sqr_n _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_ptr));
+
+#define mpn_toom3_mul_n __MPN(toom3_mul_n)
+void mpn_toom3_mul_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t,mp_ptr));
+
+#define mpn_toom3_sqr_n __MPN(toom3_sqr_n)
+void mpn_toom3_sqr_n _PROTO((mp_ptr, mp_srcptr, mp_size_t, mp_ptr));
+
+#define mpn_fft_best_k __MPN(fft_best_k)
+int mpn_fft_best_k _PROTO ((mp_size_t n, int sqr));
+
+#define mpn_mul_fft __MPN(mul_fft)
+void mpn_mul_fft _PROTO ((mp_ptr op, mp_size_t pl,
+ mp_srcptr n, mp_size_t nl,
+ mp_srcptr m, mp_size_t ml,
+ int k));
+
+#define mpn_mul_fft_full __MPN(mul_fft_full)
+void mpn_mul_fft_full _PROTO ((mp_ptr op,
+ mp_srcptr n, mp_size_t nl,
+ mp_srcptr m, mp_size_t ml));
+
+#define mpn_fft_next_size __MPN(fft_next_size)
+mp_size_t mpn_fft_next_size _PROTO ((mp_size_t pl, int k));
+
+mp_limb_t mpn_sb_divrem_mn _PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
+mp_limb_t mpn_bz_divrem_n _PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t));
+/* void mpn_tdiv_q _PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t)); */
+
+/* Copy NLIMBS *limbs* from SRC to DST, NLIMBS==0 allowed. */
+#ifndef MPN_COPY_INCR
+#if HAVE_NATIVE_mpn_copyi
+#define MPN_COPY_INCR(DST, SRC, NLIMBS) mpn_copyi (DST, SRC, NLIMBS)
+#else
+#define MPN_COPY_INCR(DST, SRC, NLIMBS) \
+ do { \
+ mp_size_t __i; \
+ for (__i = 0; __i < (NLIMBS); __i++) \
+ (DST)[__i] = (SRC)[__i]; \
+ } while (0)
+#endif
+#endif
+
+#if HAVE_NATIVE_mpn_copyd
+#define mpn_copyd __MPN(copyd)
+void mpn_copyd _PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#endif
+
+/* NLIMBS==0 allowed */
+#ifndef MPN_COPY_DECR
+#if HAVE_NATIVE_mpn_copyd
+#define MPN_COPY_DECR(DST, SRC, NLIMBS) mpn_copyd (DST, SRC, NLIMBS)
+#else
+#define MPN_COPY_DECR(DST, SRC, NLIMBS) \
+ do { \
+ mp_size_t __i; \
+ for (__i = (NLIMBS) - 1; __i >= 0; __i--) \
+ (DST)[__i] = (SRC)[__i]; \
+ } while (0)
+#endif
+#endif
+
+/* Define MPN_COPY for vector computers. Since #pragma cannot be in a macro,
+ rely on function inlining. */
+#if defined (_CRAY) || defined (__uxp__)
+static inline void
+_MPN_COPY (d, s, n) mp_ptr d; mp_srcptr s; mp_size_t n;
+{
+ int i; /* Faster for Cray with plain int */
+#pragma _CRI ivdep /* Cray PVP systems */
+#pragma loop noalias d,s /* Fujitsu VPP systems */
+ for (i = 0; i < n; i++)
+ d[i] = s[i];
+}
+#define MPN_COPY _MPN_COPY
+#endif
+
+#ifndef MPN_COPY
+#define MPN_COPY MPN_COPY_INCR
+#endif
+
+/* Zero NLIMBS *limbs* AT DST. */
+#ifndef MPN_ZERO
+#define MPN_ZERO(DST, NLIMBS) \
+ do { \
+ mp_size_t __i; \
+ for (__i = 0; __i < (NLIMBS); __i++) \
+ (DST)[__i] = 0; \
+ } while (0)
+#endif
+
+#ifndef MPN_NORMALIZE
+#define MPN_NORMALIZE(DST, NLIMBS) \
+ do { \
+ while (NLIMBS > 0) \
+ { \
+ if ((DST)[(NLIMBS) - 1] != 0) \
+ break; \
+ NLIMBS--; \
+ } \
+ } while (0)
+#endif
+#ifndef MPN_NORMALIZE_NOT_ZERO
+#define MPN_NORMALIZE_NOT_ZERO(DST, NLIMBS) \
+ do { \
+ while (1) \
+ { \
+ if ((DST)[(NLIMBS) - 1] != 0) \
+ break; \
+ NLIMBS--; \
+ } \
+ } while (0)
+#endif
+
+/* Strip least significant zero limbs from ptr,size by incrementing ptr and
+ decrementing size. The number in ptr,size must be non-zero, ie. size!=0
+ and somewhere a non-zero limb. */
+#define MPN_STRIP_LOW_ZEROS_NOT_ZERO(ptr, size) \
+ do \
+ { \
+ ASSERT ((size) != 0); \
+ while ((ptr)[0] == 0) \
+ { \
+ (ptr)++; \
+ (size)--; \
+ ASSERT (size >= 0); \
+ } \
+ } \
+ while (0)
+
+/* Initialize X of type mpz_t with space for NLIMBS limbs. X should be a
+ temporary variable; it will be automatically cleared out at function
+ return. We use __x here to make it possible to accept both mpz_ptr and
+ mpz_t arguments. */
+#define MPZ_TMP_INIT(X, NLIMBS) \
+ do { \
+ mpz_ptr __x = (X); \
+ __x->_mp_alloc = (NLIMBS); \
+ __x->_mp_d = (mp_ptr) TMP_ALLOC ((NLIMBS) * BYTES_PER_MP_LIMB); \
+ } while (0)
+
+/* Realloc for an mpz_t WHAT if it has less than NEEDED limbs. */
+#define MPZ_REALLOC(what,needed) \
+ do { \
+ if ((needed) > ALLOC (what)) \
+ _mpz_realloc (what, needed); \
+ } while (0)
+
+/* If KARATSUBA_MUL_THRESHOLD is not already defined, define it to a
+ value which is good on most machines. */
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 32
+#endif
+
+/* If TOOM3_MUL_THRESHOLD is not already defined, define it to a
+ value which is good on most machines. */
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 256
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD (2*KARATSUBA_MUL_THRESHOLD)
+#endif
+
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD (2*TOOM3_MUL_THRESHOLD)
+#endif
+
+/* First k to use for an FFT modF multiply. A modF FFT is an order
+ log(2^k)/log(2^(k-1)) algorithm, so k=3 is merely 1.5 like karatsuba,
+ whereas k=4 is 1.33 which is faster than toom3 at 1.485. */
+#define FFT_FIRST_K 4
+
+/* Threshold at which FFT should be used to do a modF NxN -> N multiply. */
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD (TOOM3_MUL_THRESHOLD * 3)
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD (TOOM3_SQR_THRESHOLD * 3)
+#endif
+
+/* Threshold at which FFT should be used to do an NxN -> 2N multiply. This
+ will be a size where FFT is using k=7 or k=8, since an FFT-k used for an
+ NxN->2N multiply and not recursing into itself is an order
+ log(2^k)/log(2^(k-2)) algorithm, so it'll be at least k=7 at 1.39 which
+ is the first better than toom3. */
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD (FFT_MODF_MUL_THRESHOLD * 10)
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD (FFT_MODF_SQR_THRESHOLD * 10)
+#endif
+
+/* Table of thresholds for successive modF FFT "k"s. The first entry is
+ where FFT_FIRST_K+1 should be used, the second FFT_FIRST_K+2,
+ etc. See mpn_fft_best_k(). */
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE \
+ { TOOM3_MUL_THRESHOLD * 4, /* k=5 */ \
+ TOOM3_MUL_THRESHOLD * 8, /* k=6 */ \
+ TOOM3_MUL_THRESHOLD * 16, /* k=7 */ \
+ TOOM3_MUL_THRESHOLD * 32, /* k=8 */ \
+ TOOM3_MUL_THRESHOLD * 96, /* k=9 */ \
+ TOOM3_MUL_THRESHOLD * 288, /* k=10 */ \
+ 0 }
+#endif
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE \
+ { TOOM3_SQR_THRESHOLD * 4, /* k=5 */ \
+ TOOM3_SQR_THRESHOLD * 8, /* k=6 */ \
+ TOOM3_SQR_THRESHOLD * 16, /* k=7 */ \
+ TOOM3_SQR_THRESHOLD * 32, /* k=8 */ \
+ TOOM3_SQR_THRESHOLD * 96, /* k=9 */ \
+ TOOM3_SQR_THRESHOLD * 288, /* k=10 */ \
+ 0 }
+#endif
+
+#ifndef FFT_TABLE_ATTRS
+#define FFT_TABLE_ATTRS static const
+#endif
+
+#define MPN_FFT_TABLE_SIZE 16
+
+
+/* Return non-zero if xp,xsize and yp,ysize overlap.
+ If xp+xsize<=yp there's no overlap, or if yp+ysize<=xp there's no
+ overlap. If both these are false, there's an overlap. */
+#define MPN_OVERLAP_P(xp, xsize, yp, ysize) \
+ ((xp) + (xsize) > (yp) && (yp) + (ysize) > (xp))
+
+
+/* ASSERT() is a private assertion checking scheme, similar to <assert.h>.
+ ASSERT() does the check only if WANT_ASSERT is selected, ASSERT_ALWAYS()
+ does it always. Generally assertions are meant for development, but
+ might help when looking for a problem later too.
+
+ ASSERT_NOCARRY() uses ASSERT() to check the expression is zero, but if
+ assertion checking is disabled, the expression is still evaluated. This
+ is meant for use with routines like mpn_add_n() where the return value
+ represents a carry or whatever that shouldn't occur. For example,
+ ASSERT_NOCARRY (mpn_add_n (rp, s1p, s2p, size)); */
+
+#ifdef __LINE__
+#define ASSERT_LINE __LINE__
+#else
+#define ASSERT_LINE -1
+#endif
+
+#ifdef __FILE__
+#define ASSERT_FILE __FILE__
+#else
+#define ASSERT_FILE ""
+#endif
+
+int __gmp_assert_fail _PROTO((const char *filename, int linenum,
+ const char *expr));
+
+#if HAVE_STRINGIZE
+#define ASSERT_FAIL(expr) __gmp_assert_fail (ASSERT_FILE, ASSERT_LINE, #expr)
+#else
+#define ASSERT_FAIL(expr) __gmp_assert_fail (ASSERT_FILE, ASSERT_LINE, "expr")
+#endif
+
+#if HAVE_VOID
+#define CAST_TO_VOID (void)
+#else
+#define CAST_TO_VOID
+#endif
+
+#define ASSERT_ALWAYS(expr) ((expr) ? 0 : ASSERT_FAIL (expr))
+
+#if WANT_ASSERT
+#define ASSERT(expr) ASSERT_ALWAYS (expr)
+#define ASSERT_NOCARRY(expr) ASSERT_ALWAYS ((expr) == 0)
+
+#else
+#define ASSERT(expr) (CAST_TO_VOID 0)
+#define ASSERT_NOCARRY(expr) (expr)
+#endif
+
+
+#if HAVE_NATIVE_mpn_com_n
+#define mpn_com_n __MPN(com_n)
+void mpn_com_n _PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_com_n(d,s,n) \
+ do \
+ { \
+ mp_ptr __d = (d); \
+ mp_srcptr __s = (s); \
+ mp_size_t __n = (n); \
+ do \
+ *__d++ = ~ *__s++; \
+ while (--__n); \
+ } \
+ while (0)
+#endif
+
+#define MPN_LOGOPS_N_INLINE(d,s1,s2,n,dop,op,s2op) \
+ do \
+ { \
+ mp_ptr __d = (d); \
+ mp_srcptr __s1 = (s1); \
+ mp_srcptr __s2 = (s2); \
+ mp_size_t __n = (n); \
+ do \
+ *__d++ = dop (*__s1++ op s2op *__s2++); \
+ while (--__n); \
+ } \
+ while (0)
+
+#if HAVE_NATIVE_mpn_and_n
+#define mpn_and_n __MPN(and_n)
+void mpn_and_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_and_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n, ,&, )
+#endif
+
+#if HAVE_NATIVE_mpn_andn_n
+#define mpn_andn_n __MPN(andn_n)
+void mpn_andn_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_andn_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n, ,&,~)
+#endif
+
+#if HAVE_NATIVE_mpn_nand_n
+#define mpn_nand_n __MPN(nand_n)
+void mpn_nand_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_nand_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n,~,&, )
+#endif
+
+#if HAVE_NATIVE_mpn_ior_n
+#define mpn_ior_n __MPN(ior_n)
+void mpn_ior_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_ior_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n, ,|, )
+#endif
+
+#if HAVE_NATIVE_mpn_iorn_n
+#define mpn_iorn_n __MPN(iorn_n)
+void mpn_iorn_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_iorn_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n, ,|,~)
+#endif
+
+#if HAVE_NATIVE_mpn_nior_n
+#define mpn_nior_n __MPN(nior_n)
+void mpn_nior_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_nior_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n,~,|, )
+#endif
+
+#if HAVE_NATIVE_mpn_xor_n
+#define mpn_xor_n __MPN(xor_n)
+void mpn_xor_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_xor_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n, ,^, )
+#endif
+
+#if HAVE_NATIVE_mpn_xnor_n
+#define mpn_xnor_n __MPN(xnor_n)
+void mpn_xnor_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_xnor_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n,~,^, )
+#endif
+
+/* Structure for conversion between internal binary format and
+ strings in base 2..36. */
+struct bases
+{
+ /* Number of digits in the conversion base that always fits in an mp_limb_t.
+ For example, for base 10 on a machine where a mp_limb_t has 32 bits this
+ is 9, since 10**9 is the largest number that fits into a mp_limb_t. */
+ int chars_per_limb;
+
+ /* log(2)/log(conversion_base) */
+ double chars_per_bit_exactly;
+
+ /* base**chars_per_limb, i.e. the biggest number that fits a word, built by
+ factors of base. Exception: For 2, 4, 8, etc, big_base is log2(base),
+ i.e. the number of bits used to represent each digit in the base. */
+ mp_limb_t big_base;
+
+ /* A BITS_PER_MP_LIMB bit approximation to 1/big_base, represented as a
+ fixed-point number. Instead of dividing by big_base an application can
+ choose to multiply by big_base_inverted. */
+ mp_limb_t big_base_inverted;
+};
+
+#define __mp_bases __MPN(mp_bases)
+extern const struct bases __mp_bases[];
+extern mp_size_t __gmp_default_fp_limb_precision;
+
+#if defined (__i386__)
+#define TARGET_REGISTER_STARVED 1
+#else
+#define TARGET_REGISTER_STARVED 0
+#endif
+
+/* Use a library function for invert_limb, if available. */
+#if ! defined (invert_limb) && HAVE_NATIVE_mpn_invert_limb
+#define mpn_invert_limb __MPN(invert_limb)
+mp_limb_t mpn_invert_limb _PROTO ((mp_limb_t));
+#define invert_limb(invxl,xl) (invxl = __MPN(invert_limb) (xl))
+#endif
+
+#ifndef invert_limb
+#define invert_limb(invxl,xl) \
+ do { \
+ mp_limb_t dummy; \
+ if (xl << 1 == 0) \
+ invxl = ~(mp_limb_t) 0; \
+ else \
+ udiv_qrnnd (invxl, dummy, -xl, 0, xl); \
+ } while (0)
+#endif
+
+/* Divide the two-limb number in (NH,,NL) by D, with DI being the largest
+ limb not larger than (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB).
+ If this would yield overflow, DI should be the largest possible number
+ (i.e., only ones). For correct operation, the most significant bit of D
+ has to be set. Put the quotient in Q and the remainder in R. */
+#define udiv_qrnnd_preinv(q, r, nh, nl, d, di) \
+ do { \
+ mp_limb_t _q, _ql, _r; \
+ mp_limb_t _xh, _xl; \
+ umul_ppmm (_q, _ql, (nh), (di)); \
+ _q += (nh); /* DI is 2**BITS_PER_MP_LIMB too small */\
+ umul_ppmm (_xh, _xl, _q, (d)); \
+ sub_ddmmss (_xh, _r, (nh), (nl), _xh, _xl); \
+ if (_xh != 0) \
+ { \
+ sub_ddmmss (_xh, _r, _xh, _r, 0, (d)); \
+ _q += 1; \
+ if (_xh != 0) \
+ { \
+ sub_ddmmss (_xh, _r, _xh, _r, 0, (d)); \
+ _q += 1; \
+ } \
+ } \
+ if (_r >= (d)) \
+ { \
+ _r -= (d); \
+ _q += 1; \
+ } \
+ (r) = _r; \
+ (q) = _q; \
+ } while (0)
+/* Like udiv_qrnnd_preinv, but for any value D. DNORM is D shifted left
+ so that its most significant bit is set. LGUP is ceil(log2(D)). */
+#define udiv_qrnnd_preinv2gen(q, r, nh, nl, d, di, dnorm, lgup) \
+ do { \
+ mp_limb_t _n2, _n10, _n1, _nadj, _q1; \
+ mp_limb_t _xh, _xl; \
+ _n2 = ((nh) << (BITS_PER_MP_LIMB - (lgup))) + ((nl) >> 1 >> (l - 1));\
+ _n10 = (nl) << (BITS_PER_MP_LIMB - (lgup)); \
+ _n1 = ((mp_limb_signed_t) _n10 >> (BITS_PER_MP_LIMB - 1)); \
+ _nadj = _n10 + (_n1 & (dnorm)); \
+ umul_ppmm (_xh, _xl, di, _n2 - _n1); \
+ add_ssaaaa (_xh, _xl, _xh, _xl, 0, _nadj); \
+ _q1 = ~(_n2 + _xh); \
+ umul_ppmm (_xh, _xl, _q1, d); \
+ add_ssaaaa (_xh, _xl, _xh, _xl, nh, nl); \
+ _xh -= (d); \
+ (r) = _xl + ((d) & _xh); \
+ (q) = _xh - _q1; \
+ } while (0)
+/* Exactly like udiv_qrnnd_preinv, but branch-free. It is not clear which
+ version to use. */
+#define udiv_qrnnd_preinv2norm(q, r, nh, nl, d, di) \
+ do { \
+ mp_limb_t _n2, _n10, _n1, _nadj, _q1; \
+ mp_limb_t _xh, _xl; \
+ _n2 = (nh); \
+ _n10 = (nl); \
+ _n1 = ((mp_limb_signed_t) _n10 >> (BITS_PER_MP_LIMB - 1)); \
+ _nadj = _n10 + (_n1 & (d)); \
+ umul_ppmm (_xh, _xl, di, _n2 - _n1); \
+ add_ssaaaa (_xh, _xl, _xh, _xl, 0, _nadj); \
+ _q1 = ~(_n2 + _xh); \
+ umul_ppmm (_xh, _xl, _q1, d); \
+ add_ssaaaa (_xh, _xl, _xh, _xl, nh, nl); \
+ _xh -= (d); \
+ (r) = _xl + ((d) & _xh); \
+ (q) = _xh - _q1; \
+ } while (0)
+
+
+/* modlimb_invert() sets "inv" to the multiplicative inverse of "n" modulo
+ 2^BITS_PER_MP_LIMB, ie. so that inv*n == 1 mod 2^BITS_PER_MP_LIMB.
+ "n" must be odd (otherwise such an inverse doesn't exist).
+
+ This is not to be confused with invert_limb(), which is completely
+ different.
+
+ The table lookup gives an inverse with the low 8 bits valid, and each
+ multiply step doubles the number of bits. See Jebelean's exact division
+ paper, end of section 4 (reference in gmp.texi). */
+
+#define modlimb_invert_table __gmp_modlimb_invert_table
+extern const unsigned char modlimb_invert_table[128];
+
+#if BITS_PER_MP_LIMB <= 32
+#define modlimb_invert(inv,n) \
+ do { \
+ mp_limb_t __n = (n); \
+ mp_limb_t __inv; \
+ ASSERT ((__n & 1) == 1); \
+ __inv = modlimb_invert_table[(__n&0xFF)/2]; /* 8 */ \
+ __inv = 2 * __inv - __inv * __inv * __n; /* 16 */ \
+ __inv = 2 * __inv - __inv * __inv * __n; /* 32 */ \
+ ASSERT (__inv * __n == 1); \
+ (inv) = __inv; \
+ } while (0)
+#endif
+
+#if BITS_PER_MP_LIMB > 32 && BITS_PER_MP_LIMB <= 64
+#define modlimb_invert(inv,n) \
+ do { \
+ mp_limb_t __n = (n); \
+ mp_limb_t __inv; \
+ ASSERT ((__n & 1) == 1); \
+ __inv = modlimb_invert_table[(__n&0xFF)/2]; /* 8 */ \
+ __inv = 2 * __inv - __inv * __inv * __n; /* 16 */ \
+ __inv = 2 * __inv - __inv * __inv * __n; /* 32 */ \
+ __inv = 2 * __inv - __inv * __inv * __n; /* 64 */ \
+ ASSERT (__inv * __n == 1); \
+ (inv) = __inv; \
+ } while (0)
+#endif
+
+
+/* The `mode' attribute was introduced in GCC 2.2, but we can only distinguish
+ between GCC 2 releases from 2.5, since __GNUC_MINOR__ wasn't introduced
+ until then. */
+#if (__GNUC__ - 0 > 2 || defined (__GNUC_MINOR__)) && ! defined (__APPLE_CC__)
+/* Define stuff for longlong.h. */
+typedef unsigned int UQItype __attribute__ ((mode (QI)));
+typedef int SItype __attribute__ ((mode (SI)));
+typedef unsigned int USItype __attribute__ ((mode (SI)));
+typedef int DItype __attribute__ ((mode (DI)));
+typedef unsigned int UDItype __attribute__ ((mode (DI)));
+#else
+typedef unsigned char UQItype;
+typedef long SItype;
+typedef unsigned long USItype;
+#if defined _LONGLONG || defined _LONG_LONG_LIMB
+typedef long long int DItype;
+typedef unsigned long long int UDItype;
+#else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. */
+typedef long int DItype;
+typedef unsigned long int UDItype;
+#endif
+#endif
+
+typedef mp_limb_t UWtype;
+typedef unsigned int UHWtype;
+#define W_TYPE_SIZE BITS_PER_MP_LIMB
+
+/* Define ieee_double_extract and _GMP_IEEE_FLOATS. */
+
+#if (defined (__arm__) && (defined (__ARMWEL__) || defined (__linux__)))
+/* Special case for little endian ARM since floats remain in big-endian. */
+#define _GMP_IEEE_FLOATS 1
+union ieee_double_extract
+{
+ struct
+ {
+ unsigned int manh:20;
+ unsigned int exp:11;
+ unsigned int sig:1;
+ unsigned int manl:32;
+ } s;
+ double d;
+};
+#else
+#if defined (_LITTLE_ENDIAN) || defined (__LITTLE_ENDIAN__) \
+ || defined (__alpha) \
+ || defined (__clipper__) \
+ || defined (__cris) \
+ || defined (__i386__) \
+ || defined (__i860__) \
+ || defined (__i960__) \
+ || defined (MIPSEL) || defined (_MIPSEL) \
+ || defined (__ns32000__) \
+ || defined (__WINNT) || defined (_WIN32)
+#define _GMP_IEEE_FLOATS 1
+union ieee_double_extract
+{
+ struct
+ {
+ unsigned int manl:32;
+ unsigned int manh:20;
+ unsigned int exp:11;
+ unsigned int sig:1;
+ } s;
+ double d;
+};
+#else /* Need this as an #else since the tests aren't made exclusive. */
+#if defined (_BIG_ENDIAN) || defined (__BIG_ENDIAN__) \
+ || defined (__a29k__) || defined (_AM29K) \
+ || defined (__arm__) \
+ || (defined (__convex__) && defined (_IEEE_FLOAT_)) \
+ || defined (_CRAYMPP) \
+ || defined (__i370__) || defined (__mvs__) \
+ || defined (__mc68000__) || defined (__mc68020__) || defined (__m68k__)\
+ || defined(mc68020) \
+ || defined (__m88000__) \
+ || defined (MIPSEB) || defined (_MIPSEB) \
+ || defined (__hppa) || defined (__hppa__) \
+ || defined (__pyr__) \
+ || defined (__ibm032__) \
+ || defined (_IBMR2) || defined (_ARCH_PPC) \
+ || defined (__sh__) \
+ || defined (__sparc) || defined (sparc) \
+ || defined (__we32k__)
+#define _GMP_IEEE_FLOATS 1
+union ieee_double_extract
+{
+ struct
+ {
+ unsigned int sig:1;
+ unsigned int exp:11;
+ unsigned int manh:20;
+ unsigned int manl:32;
+ } s;
+ double d;
+};
+#endif
+#endif
+#endif
+
+/* Using "(2.0 * ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1)))" doesn't work on
+ SunOS 4.1.4 native /usr/ucb/cc (K&R), it comes out as -4294967296.0,
+ presumably due to treating the mp_limb_t constant as signed rather than
+ unsigned. */
+#define MP_BASE_AS_DOUBLE (4.0 * ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 2)))
+#if BITS_PER_MP_LIMB == 64
+#define LIMBS_PER_DOUBLE 2
+#else
+#define LIMBS_PER_DOUBLE 3
+#endif
+
+double __gmp_scale2 _PROTO ((double, int));
+int __gmp_extract_double _PROTO ((mp_ptr, double));
+
+extern int __gmp_junk;
+extern const int __gmp_0;
+#define GMP_ERROR(code) (gmp_errno |= (code), __gmp_junk = 10/__gmp_0)
+#define DIVIDE_BY_ZERO GMP_ERROR(GMP_ERROR_DIVISION_BY_ZERO)
+#define SQRT_OF_NEGATIVE GMP_ERROR(GMP_ERROR_SQRT_OF_NEGATIVE)
+
+#if defined _LONG_LONG_LIMB
+#if defined (__STDC__)
+#define CNST_LIMB(C) C##LL
+#else
+#define CNST_LIMB(C) C/**/LL
+#endif
+#else /* not _LONG_LONG_LIMB */
+#if defined (__STDC__)
+#define CNST_LIMB(C) C##L
+#else
+#define CNST_LIMB(C) C/**/L
+#endif
+#endif /* _LONG_LONG_LIMB */
+
+/*** Stuff used by mpn/generic/prefsqr.c and mpn/generic/next_prime.c ***/
+#if BITS_PER_MP_LIMB == 32
+#define PP 0xC0CFD797L /* 3 x 5 x 7 x 11 x 13 x ... x 29 */
+#define PP_INVERTED 0x53E5645CL
+#define PP_MAXPRIME 29
+#define PP_MASK 0x208A28A8L
+#endif
+
+#if BITS_PER_MP_LIMB == 64
+#define PP CNST_LIMB(0xE221F97C30E94E1D) /* 3 x 5 x 7 x 11 x 13 x ... x 53 */
+#define PP_INVERTED CNST_LIMB(0x21CFE6CFC938B36B)
+#define PP_MAXPRIME 53
+#define PP_MASK CNST_LIMB(0x208A20A08A28A8)
+#endif
+
+
+/* BIT1 means a result value in bit 1 (second least significant bit), with a
+ zero bit representing +1 and a one bit representing -1. Bits other than
+ bit 1 are garbage.
+
+ JACOBI_TWOS_U_BIT1 and JACOBI_RECIP_UU_BIT1 are used in mpn_jacobi_base
+ and their speed is important. Expressions are used rather than
+ conditionals to accumulate sign changes, which effectively means XORs
+ instead of conditional JUMPs. */
+
+/* (a/0), with a signed; is 1 if a=+/-1, 0 otherwise */
+#define JACOBI_S0(a) \
+ (((a) == 1) | ((a) == -1))
+
+/* (a/0), with a unsigned; is 1 if a=+/-1, 0 otherwise */
+#define JACOBI_U0(a) \
+ ((a) == 1)
+
+/* (a/0), with a an mpz_t; is 1 if a=+/-1, 0 otherwise
+ An mpz_t always has at least one limb of allocated space, so the fetch of
+ the low limb is valid. */
+#define JACOBI_Z0(a) \
+ (((SIZ(a) == 1) | (SIZ(a) == -1)) & (PTR(a)[0] == 1))
+
+/* Convert a bit1 to +1 or -1. */
+#define JACOBI_BIT1_TO_PN(result_bit1) \
+ (1 - ((result_bit1) & 2))
+
+/* (2/b), with b unsigned and odd;
+ is (-1)^((b^2-1)/8) which is 1 if b==1,7mod8 or -1 if b==3,5mod8 and
+ hence obtained from (b>>1)^b */
+#define JACOBI_TWO_U_BIT1(b) \
+ (ASSERT (b & 1), (((b) >> 1) ^ (b)))
+
+/* (2/b)^twos, with b unsigned and odd */
+#define JACOBI_TWOS_U_BIT1(twos, b) \
+ (((twos) << 1) & JACOBI_TWO_U_BIT1 (b))
+
+/* (2/b)^twos, with b unsigned and odd */
+#define JACOBI_TWOS_U(twos, b) \
+ (JACOBI_BIT1_TO_PN (JACOBI_TWOS_U_BIT1 (twos, b)))
+
+/* (a/b) effect due to sign of a: signed/unsigned, b odd;
+ is (-1)^((b-1)/2) if a<0, or +1 if a>=0 */
+#define JACOBI_ASGN_SU_BIT1(a, b) \
+ ((((a) < 0) << 1) & (b))
+
+/* (a/b) effect due to sign of b: signed/mpz;
+ is -1 if a and b both negative, +1 otherwise */
+#define JACOBI_BSGN_SZ_BIT1(a, b) \
+ ((((a) < 0) & (SIZ(b) < 0)) << 1)
+
+/* (a/b) effect due to sign of b: mpz/signed */
+#define JACOBI_BSGN_ZS_BIT1(a, b) \
+ JACOBI_BSGN_SZ_BIT1(b, a)
+
+/* (a/b) reciprocity to switch to (b/a), a,b both unsigned and odd.
+ Is (-1)^((a-1)*(b-1)/4), which means +1 if either a,b==1mod4 or -1 if
+ both a,b==3mod4, achieved in bit 1 by a&b. No ASSERT()s about a,b odd
+ because this is used in a couple of places with only bit 1 of a or b
+ valid. */
+#define JACOBI_RECIP_UU_BIT1(a, b) \
+ ((a) & (b))
+
+
+/* For testing and debugging. */
+#define MPZ_CHECK_FORMAT(z) \
+ (ASSERT_ALWAYS (SIZ(z) == 0 || PTR(z)[ABSIZ(z) - 1] != 0), \
+ ASSERT_ALWAYS (ALLOC(z) >= ABSIZ(z)))
+#define MPZ_PROVOKE_REALLOC(z) \
+ do { ALLOC(z) = ABSIZ(z); } while (0)
+
+
+#if TUNE_PROGRAM_BUILD
+/* Some extras wanted when recompiling some .c files for use by the tune
+ program. Not part of a normal build. */
+
+extern mp_size_t mul_threshold[];
+extern mp_size_t fft_modf_mul_threshold;
+extern mp_size_t sqr_threshold[];
+extern mp_size_t fft_modf_sqr_threshold;
+extern mp_size_t bz_threshold[];
+extern mp_size_t fib_threshold[];
+extern mp_size_t powm_threshold[];
+extern mp_size_t gcd_accel_threshold[];
+extern mp_size_t gcdext_threshold[];
+
+#undef KARATSUBA_MUL_THRESHOLD
+#undef TOOM3_MUL_THRESHOLD
+#undef FFT_MUL_TABLE
+#undef FFT_MUL_THRESHOLD
+#undef FFT_MODF_MUL_THRESHOLD
+#undef KARATSUBA_SQR_THRESHOLD
+#undef TOOM3_SQR_THRESHOLD
+#undef FFT_SQR_TABLE
+#undef FFT_SQR_THRESHOLD
+#undef FFT_MODF_SQR_THRESHOLD
+#undef BZ_THRESHOLD
+#undef FIB_THRESHOLD
+#undef POWM_THRESHOLD
+#undef GCD_ACCEL_THRESHOLD
+#undef GCDEXT_THRESHOLD
+
+#define KARATSUBA_MUL_THRESHOLD mul_threshold[0]
+#define TOOM3_MUL_THRESHOLD mul_threshold[1]
+#define FFT_MUL_TABLE 0
+#define FFT_MUL_THRESHOLD mul_threshold[2]
+#define FFT_MODF_MUL_THRESHOLD fft_modf_mul_threshold
+#define KARATSUBA_SQR_THRESHOLD sqr_threshold[0]
+#define TOOM3_SQR_THRESHOLD sqr_threshold[1]
+#define FFT_SQR_TABLE 0
+#define FFT_SQR_THRESHOLD sqr_threshold[2]
+#define FFT_MODF_SQR_THRESHOLD fft_modf_sqr_threshold
+#define BZ_THRESHOLD bz_threshold[0]
+#define FIB_THRESHOLD fib_threshold[0]
+#define POWM_THRESHOLD powm_threshold[0]
+#define GCD_ACCEL_THRESHOLD gcd_accel_threshold[0]
+#define GCDEXT_THRESHOLD gcdext_threshold[0]
+
+#define TOOM3_MUL_THRESHOLD_LIMIT 700
+
+#undef FFT_TABLE_ATTRS
+#define FFT_TABLE_ATTRS
+extern mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE];
+
+#endif /* TUNE_PROGRAM_BUILD */
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/rts/gmp/gmp.h b/rts/gmp/gmp.h
new file mode 100644
index 0000000000..0f1b9510e9
--- /dev/null
+++ b/rts/gmp/gmp.h
@@ -0,0 +1,1083 @@
+/* gmp.h -- Definitions for GNU multiple precision functions.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#ifndef __GMP_H__
+
+#ifndef __GNU_MP__ /* to allow inclusion of both gmp.h and mp.h */
+#define __GNU_MP__ 2
+#define __need_size_t
+#include <stddef.h>
+#undef __need_size_t
+
+#ifndef STG_H
+/* Get DLL_IMPORT */
+#include "../../includes/ghcconfig.h"
+#include "../../includes/StgDLL.h"
+#endif
+
+#if defined (__mips) && defined (_ABIN32)
+/* Force the use of 64-bit limbs for all 64-bit MIPS CPUs if ABI permits. */
+#define _LONG_LONG_LIMB
+#endif
+
+#if (__STDC__-0) || defined (__cplusplus)
+#define __gmp_const const
+#define __gmp_signed signed
+#else
+#define __gmp_const
+#define __gmp_signed
+#endif
+
+#if defined (__GNUC__)
+#define __gmp_inline __inline__
+#else
+#define __gmp_inline
+#endif
+
+#ifndef _EXTERN_INLINE
+#ifdef __GNUC__
+#define _EXTERN_INLINE extern __inline__
+#else
+#define _EXTERN_INLINE static
+#endif
+#endif
+
+#ifdef _SHORT_LIMB
+typedef unsigned int mp_limb_t;
+typedef int mp_limb_signed_t;
+#else
+#ifdef _LONG_LONG_LIMB
+typedef unsigned long long int mp_limb_t;
+typedef long long int mp_limb_signed_t;
+#else
+typedef unsigned long int mp_limb_t;
+typedef long int mp_limb_signed_t;
+#endif
+#endif
+
+typedef mp_limb_t * mp_ptr;
+typedef __gmp_const mp_limb_t * mp_srcptr;
+#if defined (_CRAY) && ! defined (_CRAYMPP)
+/* plain `int' is much faster (48 bits) */
+typedef int mp_size_t;
+typedef int mp_exp_t;
+#else
+typedef long int mp_size_t;
+typedef long int mp_exp_t;
+#endif
+
+typedef struct
+{
+ int _mp_alloc; /* Number of *limbs* allocated and pointed
+ to by the _mp_d field. */
+ int _mp_size; /* abs(_mp_size) is the number of limbs the
+ last field points to. If _mp_size is
+ negative this is a negative number. */
+ mp_limb_t *_mp_d; /* Pointer to the limbs. */
+} __mpz_struct;
+#endif /* __GNU_MP__ */
+
+typedef __mpz_struct MP_INT;
+typedef __mpz_struct mpz_t[1];
+
+typedef struct
+{
+ __mpz_struct _mp_num;
+ __mpz_struct _mp_den;
+} __mpq_struct;
+
+typedef __mpq_struct MP_RAT;
+typedef __mpq_struct mpq_t[1];
+
+typedef struct
+{
+ int _mp_prec; /* Max precision, in number of `mp_limb_t's.
+ Set by mpf_init and modified by
+ mpf_set_prec. The area pointed to by the
+ _mp_d field contains `prec' + 1 limbs. */
+ int _mp_size; /* abs(_mp_size) is the number of limbs the
+ last field points to. If _mp_size is
+ negative this is a negative number. */
+ mp_exp_t _mp_exp; /* Exponent, in the base of `mp_limb_t'. */
+ mp_limb_t *_mp_d; /* Pointer to the limbs. */
+} __mpf_struct;
+
+/* typedef __mpf_struct MP_FLOAT; */
+typedef __mpf_struct mpf_t[1];
+
+/* Available random number generation algorithms. */
+typedef enum
+{
+ GMP_RAND_ALG_DEFAULT = 0,
+ GMP_RAND_ALG_LC = GMP_RAND_ALG_DEFAULT /* Linear congruential. */
+} gmp_randalg_t;
+
+/* Linear congruential data struct. */
+typedef struct {
+ mpz_t a; /* Multiplier. */
+ unsigned long int c; /* Adder. */
+ mpz_t m; /* Modulus (valid only if m2exp == 0). */
+ unsigned long int m2exp; /* If != 0, modulus is 2 ^ m2exp. */
+} __gmp_randata_lc;
+
+/* Random state struct. */
+typedef struct
+{
+ mpz_t seed; /* Current seed. */
+ gmp_randalg_t alg; /* Algorithm used. */
+ union { /* Algorithm specific data. */
+ __gmp_randata_lc *lc; /* Linear congruential. */
+ } algdata;
+} __gmp_randstate_struct;
+typedef __gmp_randstate_struct gmp_randstate_t[1];
+
+/* Types for function declarations in gmp files. */
+/* ??? Should not pollute user name space with these ??? */
+typedef __gmp_const __mpz_struct *mpz_srcptr;
+typedef __mpz_struct *mpz_ptr;
+typedef __gmp_const __mpf_struct *mpf_srcptr;
+typedef __mpf_struct *mpf_ptr;
+typedef __gmp_const __mpq_struct *mpq_srcptr;
+typedef __mpq_struct *mpq_ptr;
+
+#ifndef _PROTO
+#if (__STDC__-0) || defined (__cplusplus)
+#define _PROTO(x) x
+#else
+#define _PROTO(x) ()
+#endif
+#endif
+
+#ifndef __MPN
+/* Really use `defined (__STDC__)' here; we want it to be true for Sun C */
+#if defined (__STDC__) || defined (__cplusplus)
+#define __MPN(x) __gmpn_##x
+#else
+#define __MPN(x) __gmpn_/**/x
+#endif
+#endif
+
+#if defined (FILE) || defined (H_STDIO) || defined (_H_STDIO) \
+ || defined (_STDIO_H) || defined (_STDIO_H_) || defined (__STDIO_H__) \
+ || defined (_STDIO_INCLUDED) || defined (__dj_include_stdio_h_)
+#define _GMP_H_HAVE_FILE 1
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#define mp_set_memory_functions __gmp_set_memory_functions
+DLL_IMPORT void mp_set_memory_functions _PROTO ((void *(*) (size_t),
+ void *(*) (void *, size_t, size_t),
+ void (*) (void *, size_t)));
+
+#define mp_bits_per_limb __gmp_bits_per_limb
+DLL_IMPORT extern __gmp_const int mp_bits_per_limb;
+
+#if defined (__cplusplus)
+}
+#endif
+
+
+/**************** Random number routines. ****************/
+
+#define _gmp_rand __gmp_rand
+#define gmp_randinit __gmp_randinit
+#define gmp_randinit_lc __gmp_randinit_lc
+#define gmp_randinit_lc_2exp __gmp_randinit_lc_2exp
+#define gmp_randseed __gmp_randseed
+#define gmp_randseed_ui __gmp_randseed_ui
+#define gmp_randclear __gmp_randclear
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+DLL_IMPORT void _gmp_rand _PROTO ((mp_ptr, gmp_randstate_t, unsigned long int));
+DLL_IMPORT void gmp_randinit _PROTO ((gmp_randstate_t, gmp_randalg_t, ...));
+DLL_IMPORT void gmp_randinit_lc _PROTO ((gmp_randstate_t, mpz_t, unsigned long int,
+ mpz_t));
+DLL_IMPORT void gmp_randinit_lc_2exp _PROTO ((gmp_randstate_t, mpz_t, unsigned long int,
+ unsigned long int));
+DLL_IMPORT void gmp_randseed _PROTO ((gmp_randstate_t, mpz_t));
+DLL_IMPORT void gmp_randseed_ui _PROTO ((gmp_randstate_t, unsigned long int));
+DLL_IMPORT void gmp_randclear _PROTO ((gmp_randstate_t));
+
+#if defined (__cplusplus)
+}
+#endif
+
+/**************** Integer (i.e. Z) routines. ****************/
+
+#define _mpz_realloc __gmpz_realloc
+#define mpz_realloc __gmpz_realloc
+#define mpz_abs __gmpz_abs
+#define mpz_add __gmpz_add
+#define mpz_add_ui __gmpz_add_ui
+#define mpz_addmul_ui __gmpz_addmul_ui
+#define mpz_and __gmpz_and
+#define mpz_array_init __gmpz_array_init
+#define mpz_bin_ui __gmpz_bin_ui
+#define mpz_bin_uiui __gmpz_bin_uiui
+#define mpz_cdiv_q __gmpz_cdiv_q
+#define mpz_cdiv_q_ui __gmpz_cdiv_q_ui
+#define mpz_cdiv_qr __gmpz_cdiv_qr
+#define mpz_cdiv_qr_ui __gmpz_cdiv_qr_ui
+#define mpz_cdiv_r __gmpz_cdiv_r
+#define mpz_cdiv_r_ui __gmpz_cdiv_r_ui
+#define mpz_cdiv_ui __gmpz_cdiv_ui
+#define mpz_clear __gmpz_clear
+#define mpz_clrbit __gmpz_clrbit
+#define mpz_cmp __gmpz_cmp
+#define _mpz_cmp_si __gmpz_cmp_si
+#define _mpz_cmp_ui __gmpz_cmp_ui
+#define mpz_cmpabs __gmpz_cmpabs
+#define mpz_cmpabs_ui __gmpz_cmpabs_ui
+#define mpz_com __gmpz_com
+#define mpz_divexact __gmpz_divexact
+#define mpz_dump __gmpz_dump
+#define mpz_fac_ui __gmpz_fac_ui
+#define mpz_fdiv_q __gmpz_fdiv_q
+#define mpz_fdiv_q_2exp __gmpz_fdiv_q_2exp
+#define mpz_fdiv_q_ui __gmpz_fdiv_q_ui
+#define mpz_fdiv_qr __gmpz_fdiv_qr
+#define mpz_fdiv_qr_ui __gmpz_fdiv_qr_ui
+#define mpz_fdiv_r __gmpz_fdiv_r
+#define mpz_fdiv_r_2exp __gmpz_fdiv_r_2exp
+#define mpz_fdiv_r_ui __gmpz_fdiv_r_ui
+#define mpz_fdiv_ui __gmpz_fdiv_ui
+#define mpz_fib_ui __gmpz_fib_ui
+#define mpz_fits_sint_p __gmpz_fits_sint_p
+#define mpz_fits_slong_p __gmpz_fits_slong_p
+#define mpz_fits_sshort_p __gmpz_fits_sshort_p
+#define mpz_fits_uint_p __gmpz_fits_uint_p
+#define mpz_fits_ulong_p __gmpz_fits_ulong_p
+#define mpz_fits_ushort_p __gmpz_fits_ushort_p
+#define mpz_gcd __gmpz_gcd
+#define mpz_gcd_ui __gmpz_gcd_ui
+#define mpz_gcdext __gmpz_gcdext
+#define mpz_get_d __gmpz_get_d
+#define mpz_get_si __gmpz_get_si
+#define mpz_get_str __gmpz_get_str
+#define mpz_get_ui __gmpz_get_ui
+#define mpz_getlimbn __gmpz_getlimbn
+#define mpz_hamdist __gmpz_hamdist
+#define mpz_init __gmpz_init
+#define mpz_inp_binary __gmpz_inp_binary
+#define mpz_inp_raw __gmpz_inp_raw
+#define mpz_inp_str __gmpz_inp_str
+#define mpz_init_set __gmpz_init_set
+#define mpz_init_set_d __gmpz_init_set_d
+#define mpz_init_set_si __gmpz_init_set_si
+#define mpz_init_set_str __gmpz_init_set_str
+#define mpz_init_set_ui __gmpz_init_set_ui
+#define mpz_invert __gmpz_invert
+#define mpz_ior __gmpz_ior
+#define mpz_jacobi __gmpz_jacobi
+#define mpz_lcm __gmpz_lcm
+#define mpz_legendre __gmpz_legendre
+#define mpz_mod __gmpz_mod
+#define mpz_mul __gmpz_mul
+#define mpz_mul_2exp __gmpz_mul_2exp
+#define mpz_neg __gmpz_neg
+#define mpz_nextprime __gmpz_nextprime
+#define mpz_out_binary __gmpz_out_binary
+#define mpz_out_raw __gmpz_out_raw
+#define mpz_out_str __gmpz_out_str
+#define mpz_perfect_power_p __gmpz_perfect_power_p
+#define mpz_perfect_square_p __gmpz_perfect_square_p
+#define mpz_popcount __gmpz_popcount
+#define mpz_pow_ui __gmpz_pow_ui
+#define mpz_powm __gmpz_powm
+#define mpz_powm_ui __gmpz_powm_ui
+#define mpz_probab_prime_p __gmpz_probab_prime_p
+#define mpz_random __gmpz_random
+#define mpz_random2 __gmpz_random2
+#define mpz_remove __gmpz_remove
+#define mpz_root __gmpz_root
+#define mpz_rrandomb __gmpz_rrandomb
+#define mpz_scan0 __gmpz_scan0
+#define mpz_scan1 __gmpz_scan1
+#define mpz_set __gmpz_set
+#define mpz_set_d __gmpz_set_d
+#define mpz_set_f __gmpz_set_f
+#define mpz_set_q __gmpz_set_q
+#define mpz_set_si __gmpz_set_si
+#define mpz_set_str __gmpz_set_str
+#define mpz_set_ui __gmpz_set_ui
+#define mpz_setbit __gmpz_setbit
+#define mpz_size __gmpz_size
+#define mpz_sizeinbase __gmpz_sizeinbase
+#define mpz_sqrt __gmpz_sqrt
+#define mpz_sqrtrem __gmpz_sqrtrem
+#define mpz_sub __gmpz_sub
+#define mpz_sub_ui __gmpz_sub_ui
+#define mpz_swap __gmpz_swap
+#define mpz_tdiv_ui __gmpz_tdiv_ui
+#define mpz_tdiv_q __gmpz_tdiv_q
+#define mpz_tdiv_q_2exp __gmpz_tdiv_q_2exp
+#define mpz_tdiv_q_ui __gmpz_tdiv_q_ui
+#define mpz_tdiv_qr __gmpz_tdiv_qr
+#define mpz_tdiv_qr_ui __gmpz_tdiv_qr_ui
+#define mpz_tdiv_r __gmpz_tdiv_r
+#define mpz_tdiv_r_2exp __gmpz_tdiv_r_2exp
+#define mpz_tdiv_r_ui __gmpz_tdiv_r_ui
+#define mpz_tstbit __gmpz_tstbit
+#define mpz_ui_pow_ui __gmpz_ui_pow_ui
+#define mpz_urandomb __gmpz_urandomb
+#define mpz_urandomm __gmpz_urandomm
+#define mpz_xor __gmpz_xor
+#define mpz_eor __gmpz_xor
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+DLL_IMPORT void *_mpz_realloc _PROTO ((mpz_ptr, mp_size_t));
+
+DLL_IMPORT void mpz_abs _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_add _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_add_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_addmul_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_and _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_array_init _PROTO ((mpz_ptr, mp_size_t, mp_size_t));
+DLL_IMPORT void mpz_bin_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_bin_uiui _PROTO ((mpz_ptr, unsigned long int, unsigned long int));
+DLL_IMPORT void mpz_cdiv_q _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_cdiv_q_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_cdiv_qr _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_cdiv_qr_ui _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_cdiv_r _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_cdiv_r_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_cdiv_ui _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_clear _PROTO ((mpz_ptr));
+DLL_IMPORT void mpz_clrbit _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT int mpz_cmp _PROTO ((mpz_srcptr, mpz_srcptr));
+DLL_IMPORT int _mpz_cmp_si _PROTO ((mpz_srcptr, signed long int));
+DLL_IMPORT int _mpz_cmp_ui _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT int mpz_cmpabs _PROTO ((mpz_srcptr, mpz_srcptr));
+DLL_IMPORT int mpz_cmpabs_ui _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_com _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_divexact _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_dump _PROTO ((mpz_srcptr));
+DLL_IMPORT void mpz_fac_ui _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT void mpz_fdiv_q _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_fdiv_q_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_fdiv_q_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_fdiv_qr _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_fdiv_qr_ui _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_fdiv_r _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_fdiv_r_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_fdiv_r_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_fdiv_ui _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_fib_ui _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT int mpz_fits_sint_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_fits_slong_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_fits_sshort_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_fits_uint_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_fits_ulong_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_fits_ushort_p _PROTO ((mpz_srcptr));
+DLL_IMPORT void mpz_gcd _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_gcd_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_gcdext _PROTO ((mpz_ptr, mpz_ptr, mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT double mpz_get_d _PROTO ((mpz_srcptr));
+/* signed */ long int mpz_get_si _PROTO ((mpz_srcptr));
+DLL_IMPORT char *mpz_get_str _PROTO ((char *, int, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_get_ui _PROTO ((mpz_srcptr));
+DLL_IMPORT mp_limb_t mpz_getlimbn _PROTO ((mpz_srcptr, mp_size_t));
+DLL_IMPORT unsigned long int mpz_hamdist _PROTO ((mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_init _PROTO ((mpz_ptr));
+#ifdef _GMP_H_HAVE_FILE
+DLL_IMPORT size_t mpz_inp_binary _PROTO ((mpz_ptr, FILE *));
+DLL_IMPORT size_t mpz_inp_raw _PROTO ((mpz_ptr, FILE *));
+DLL_IMPORT size_t mpz_inp_str _PROTO ((mpz_ptr, FILE *, int));
+#endif
+DLL_IMPORT void mpz_init_set _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_init_set_d _PROTO ((mpz_ptr, double));
+DLL_IMPORT void mpz_init_set_si _PROTO ((mpz_ptr, signed long int));
+DLL_IMPORT int mpz_init_set_str _PROTO ((mpz_ptr, __gmp_const char *, int));
+DLL_IMPORT void mpz_init_set_ui _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT int mpz_invert _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_ior _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT int mpz_jacobi _PROTO ((mpz_srcptr, mpz_srcptr));
+
+#define mpz_kronecker_si __gmpz_kronecker_si
+DLL_IMPORT int mpz_kronecker_si _PROTO ((mpz_srcptr, long));
+
+#define mpz_kronecker_ui __gmpz_kronecker_ui
+DLL_IMPORT int mpz_kronecker_ui _PROTO ((mpz_srcptr, unsigned long));
+
+#define mpz_si_kronecker __gmpz_si_kronecker
+DLL_IMPORT int mpz_si_kronecker _PROTO ((long, mpz_srcptr));
+
+#define mpz_ui_kronecker __gmpz_ui_kronecker
+DLL_IMPORT int mpz_ui_kronecker _PROTO ((unsigned long, mpz_srcptr));
+
+DLL_IMPORT void mpz_lcm _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT int mpz_legendre _PROTO ((mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_mod _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_mul _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_mul_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+
+#define mpz_mul_si __gmpz_mul_si
+DLL_IMPORT void mpz_mul_si _PROTO ((mpz_ptr, mpz_srcptr, long int));
+
+#define mpz_mul_ui __gmpz_mul_ui
+DLL_IMPORT void mpz_mul_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+
+DLL_IMPORT void mpz_neg _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_nextprime _PROTO ((mpz_ptr, mpz_srcptr));
+#ifdef _GMP_H_HAVE_FILE
+DLL_IMPORT size_t mpz_out_binary _PROTO ((FILE *, mpz_srcptr));
+DLL_IMPORT size_t mpz_out_raw _PROTO ((FILE *, mpz_srcptr));
+DLL_IMPORT size_t mpz_out_str _PROTO ((FILE *, int, mpz_srcptr));
+#endif
+DLL_IMPORT int mpz_perfect_power_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_perfect_square_p _PROTO ((mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_popcount _PROTO ((mpz_srcptr));
+DLL_IMPORT void mpz_pow_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_powm _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_powm_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int, mpz_srcptr));
+DLL_IMPORT int mpz_probab_prime_p _PROTO ((mpz_srcptr, int));
+DLL_IMPORT void mpz_random _PROTO ((mpz_ptr, mp_size_t));
+DLL_IMPORT void mpz_random2 _PROTO ((mpz_ptr, mp_size_t));
+DLL_IMPORT unsigned long int mpz_remove _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT int mpz_root _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_rrandomb _PROTO ((mpz_ptr, gmp_randstate_t, unsigned long int));
+DLL_IMPORT unsigned long int mpz_scan0 _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_scan1 _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_set _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_set_d _PROTO ((mpz_ptr, double));
+DLL_IMPORT void mpz_set_f _PROTO ((mpz_ptr, mpf_srcptr));
+DLL_IMPORT void mpz_set_q _PROTO ((mpz_ptr, mpq_srcptr));
+DLL_IMPORT void mpz_set_si _PROTO ((mpz_ptr, signed long int));
+DLL_IMPORT int mpz_set_str _PROTO ((mpz_ptr, __gmp_const char *, int));
+DLL_IMPORT void mpz_set_ui _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT void mpz_setbit _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT size_t mpz_size _PROTO ((mpz_srcptr));
+DLL_IMPORT size_t mpz_sizeinbase _PROTO ((mpz_srcptr, int));
+DLL_IMPORT void mpz_sqrt _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_sqrtrem _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_sub _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_sub_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_swap _PROTO ((mpz_ptr, mpz_ptr));
+DLL_IMPORT void mpz_tdiv_q _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_tdiv_q_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_tdiv_ui _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_tdiv_q_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_tdiv_qr _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_tdiv_qr_ui _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_tdiv_r _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_tdiv_r_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_tdiv_r_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT int mpz_tstbit _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_ui_pow_ui _PROTO ((mpz_ptr, unsigned long int, unsigned long int));
+DLL_IMPORT void mpz_urandomb _PROTO ((mpz_t, gmp_randstate_t, unsigned long int));
+DLL_IMPORT void mpz_urandomm _PROTO ((mpz_t, gmp_randstate_t, mpz_t));
+DLL_IMPORT void mpz_xor _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+#if defined (__cplusplus)
+}
+#endif
+
+/**************** Rational (i.e. Q) routines. ****************/
+
+#define mpq_init __gmpq_init
+#define mpq_clear __gmpq_clear
+#define mpq_set __gmpq_set
+#define mpq_set_ui __gmpq_set_ui
+#define mpq_set_si __gmpq_set_si
+#define mpq_set_z __gmpq_set_z
+#define mpq_add __gmpq_add
+#define mpq_sub __gmpq_sub
+#define mpq_mul __gmpq_mul
+#define mpq_div __gmpq_div
+#define mpq_neg __gmpq_neg
+#define mpq_cmp __gmpq_cmp
+#define _mpq_cmp_ui __gmpq_cmp_ui
+#define mpq_equal __gmpq_equal
+#define mpq_inv __gmpq_inv
+#define mpq_set_num __gmpq_set_num
+#define mpq_set_den __gmpq_set_den
+#define mpq_get_num __gmpq_get_num
+#define mpq_get_den __gmpq_get_den
+#define mpq_get_d __gmpq_get_d
+#define mpq_set_d __gmpq_set_d
+#define mpq_canonicalize __gmpq_canonicalize
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+DLL_IMPORT void mpq_init _PROTO ((mpq_ptr));
+DLL_IMPORT void mpq_clear _PROTO ((mpq_ptr));
+DLL_IMPORT void mpq_set _PROTO ((mpq_ptr, mpq_srcptr));
+DLL_IMPORT void mpq_set_ui _PROTO ((mpq_ptr, unsigned long int, unsigned long int));
+DLL_IMPORT void mpq_set_si _PROTO ((mpq_ptr, signed long int, unsigned long int));
+DLL_IMPORT void mpq_set_z _PROTO ((mpq_ptr, mpz_srcptr));
+DLL_IMPORT void mpq_add _PROTO ((mpq_ptr, mpq_srcptr, mpq_srcptr));
+DLL_IMPORT void mpq_sub _PROTO ((mpq_ptr, mpq_srcptr, mpq_srcptr));
+DLL_IMPORT void mpq_mul _PROTO ((mpq_ptr, mpq_srcptr, mpq_srcptr));
+DLL_IMPORT void mpq_div _PROTO ((mpq_ptr, mpq_srcptr, mpq_srcptr));
+DLL_IMPORT void mpq_neg _PROTO ((mpq_ptr, mpq_srcptr));
+DLL_IMPORT int mpq_cmp _PROTO ((mpq_srcptr, mpq_srcptr));
+DLL_IMPORT int _mpq_cmp_ui _PROTO ((mpq_srcptr, unsigned long int, unsigned long int));
+DLL_IMPORT int mpq_equal _PROTO ((mpq_srcptr, mpq_srcptr));
+DLL_IMPORT void mpq_inv _PROTO ((mpq_ptr, mpq_srcptr));
+DLL_IMPORT void mpq_set_num _PROTO ((mpq_ptr, mpz_srcptr));
+DLL_IMPORT void mpq_set_den _PROTO ((mpq_ptr, mpz_srcptr));
+DLL_IMPORT void mpq_get_num _PROTO ((mpz_ptr, mpq_srcptr));
+DLL_IMPORT void mpq_get_den _PROTO ((mpz_ptr, mpq_srcptr));
+DLL_IMPORT double mpq_get_d _PROTO ((mpq_srcptr));
+DLL_IMPORT void mpq_set_d _PROTO ((mpq_ptr, double));
+DLL_IMPORT void mpq_canonicalize _PROTO ((mpq_ptr));
+
+#define mpq_swap __gmpq_swap
+DLL_IMPORT void mpq_swap _PROTO ((mpq_ptr, mpq_ptr));
+
+#ifdef _GMP_H_HAVE_FILE
+#define mpq_out_str __gmpq_out_str
+DLL_IMPORT size_t mpq_out_str _PROTO ((FILE *, int, mpq_srcptr));
+#endif
+
+#if defined (__cplusplus)
+}
+#endif
+
+/**************** Float (i.e. F) routines. ****************/
+
+#define mpf_abs __gmpf_abs
+#define mpf_add __gmpf_add
+#define mpf_add_ui __gmpf_add_ui
+#define mpf_ceil __gmpf_ceil
+#define mpf_clear __gmpf_clear
+#define mpf_cmp __gmpf_cmp
+#define mpf_cmp_si __gmpf_cmp_si
+#define mpf_cmp_ui __gmpf_cmp_ui
+#define mpf_div __gmpf_div
+#define mpf_div_2exp __gmpf_div_2exp
+#define mpf_div_ui __gmpf_div_ui
+#define mpf_dump __gmpf_dump
+#define mpf_floor __gmpf_floor
+#define mpf_eq __gmpf_eq
+#define mpf_get_d __gmpf_get_d
+#define mpf_get_prec __gmpf_get_prec
+#define mpf_get_str __gmpf_get_str
+#define mpf_init __gmpf_init
+#define mpf_init2 __gmpf_init2
+#define mpf_inp_str __gmpf_inp_str
+#define mpf_init_set __gmpf_init_set
+#define mpf_init_set_d __gmpf_init_set_d
+#define mpf_init_set_si __gmpf_init_set_si
+#define mpf_init_set_str __gmpf_init_set_str
+#define mpf_init_set_ui __gmpf_init_set_ui
+#define mpf_mul __gmpf_mul
+#define mpf_mul_2exp __gmpf_mul_2exp
+#define mpf_mul_ui __gmpf_mul_ui
+#define mpf_neg __gmpf_neg
+#define mpf_out_str __gmpf_out_str
+#define mpf_pow_ui __gmpf_pow_ui
+#define mpf_random2 __gmpf_random2
+#define mpf_reldiff __gmpf_reldiff
+#define mpf_set __gmpf_set
+#define mpf_set_d __gmpf_set_d
+#define mpf_set_default_prec __gmpf_set_default_prec
+#define mpf_set_prec __gmpf_set_prec
+#define mpf_set_prec_raw __gmpf_set_prec_raw
+#define mpf_set_q __gmpf_set_q
+#define mpf_set_si __gmpf_set_si
+#define mpf_set_str __gmpf_set_str
+#define mpf_set_ui __gmpf_set_ui
+#define mpf_set_z __gmpf_set_z
+#define mpf_size __gmpf_size
+#define mpf_sqrt __gmpf_sqrt
+#define mpf_sqrt_ui __gmpf_sqrt_ui
+#define mpf_sub __gmpf_sub
+#define mpf_sub_ui __gmpf_sub_ui
+#define mpf_trunc __gmpf_trunc
+#define mpf_ui_div __gmpf_ui_div
+#define mpf_ui_sub __gmpf_ui_sub
+#define mpf_urandomb __gmpf_urandomb
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+DLL_IMPORT void mpf_abs _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_add _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
+DLL_IMPORT void mpf_add_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_ceil _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_clear _PROTO ((mpf_ptr));
+DLL_IMPORT int mpf_cmp _PROTO ((mpf_srcptr, mpf_srcptr));
+DLL_IMPORT int mpf_cmp_si _PROTO ((mpf_srcptr, signed long int));
+DLL_IMPORT int mpf_cmp_ui _PROTO ((mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_div _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
+DLL_IMPORT void mpf_div_2exp _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_div_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_dump _PROTO ((mpf_srcptr));
+DLL_IMPORT int mpf_eq _PROTO ((mpf_srcptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_floor _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT double mpf_get_d _PROTO ((mpf_srcptr));
+DLL_IMPORT unsigned long int mpf_get_prec _PROTO ((mpf_srcptr));
+char *mpf_get_str _PROTO ((char *, mp_exp_t *, int, size_t, mpf_srcptr));
+DLL_IMPORT void mpf_init _PROTO ((mpf_ptr));
+DLL_IMPORT void mpf_init2 _PROTO ((mpf_ptr, unsigned long int));
+#ifdef _GMP_H_HAVE_FILE
+DLL_IMPORT size_t mpf_inp_str _PROTO ((mpf_ptr, FILE *, int));
+#endif
+DLL_IMPORT void mpf_init_set _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_init_set_d _PROTO ((mpf_ptr, double));
+DLL_IMPORT void mpf_init_set_si _PROTO ((mpf_ptr, signed long int));
+DLL_IMPORT int mpf_init_set_str _PROTO ((mpf_ptr, __gmp_const char *, int));
+DLL_IMPORT void mpf_init_set_ui _PROTO ((mpf_ptr, unsigned long int));
+DLL_IMPORT void mpf_mul _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
+DLL_IMPORT void mpf_mul_2exp _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_mul_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_neg _PROTO ((mpf_ptr, mpf_srcptr));
+#ifdef _GMP_H_HAVE_FILE
+DLL_IMPORT size_t mpf_out_str _PROTO ((FILE *, int, size_t, mpf_srcptr));
+#endif
+DLL_IMPORT void mpf_pow_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_random2 _PROTO ((mpf_ptr, mp_size_t, mp_exp_t));
+DLL_IMPORT void mpf_reldiff _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
+DLL_IMPORT void mpf_set _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_set_d _PROTO ((mpf_ptr, double));
+DLL_IMPORT void mpf_set_default_prec _PROTO ((unsigned long int));
+DLL_IMPORT void mpf_set_prec _PROTO ((mpf_ptr, unsigned long int));
+DLL_IMPORT void mpf_set_prec_raw _PROTO ((mpf_ptr, unsigned long int));
+DLL_IMPORT void mpf_set_q _PROTO ((mpf_ptr, mpq_srcptr));
+DLL_IMPORT void mpf_set_si _PROTO ((mpf_ptr, signed long int));
+DLL_IMPORT int mpf_set_str _PROTO ((mpf_ptr, __gmp_const char *, int));
+DLL_IMPORT void mpf_set_ui _PROTO ((mpf_ptr, unsigned long int));
+DLL_IMPORT void mpf_set_z _PROTO ((mpf_ptr, mpz_srcptr));
+DLL_IMPORT size_t mpf_size _PROTO ((mpf_srcptr));
+DLL_IMPORT void mpf_sqrt _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_sqrt_ui _PROTO ((mpf_ptr, unsigned long int));
+DLL_IMPORT void mpf_sub _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
+DLL_IMPORT void mpf_sub_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_trunc _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_ui_div _PROTO ((mpf_ptr, unsigned long int, mpf_srcptr));
+DLL_IMPORT void mpf_ui_sub _PROTO ((mpf_ptr, unsigned long int, mpf_srcptr));
+DLL_IMPORT void mpf_urandomb _PROTO ((mpf_t, gmp_randstate_t, unsigned long int));
+
+#define mpf_swap __gmpf_swap
+DLL_IMPORT void mpf_swap _PROTO ((mpf_ptr, mpf_ptr));
+
+#if defined (__cplusplus)
+}
+#endif
+/************ Low level positive-integer (i.e. N) routines. ************/
+
+/* This is ugly, but we need to make user calls reach the prefixed function. */
+#define mpn_add __MPN(add)
+#define mpn_add_1 __MPN(add_1)
+#define mpn_add_n __MPN(add_n)
+#define mpn_add_nc __MPN(add_nc)
+#define mpn_addmul_1 __MPN(addmul_1)
+#define mpn_addsub_n __MPN(addsub_n)
+#define mpn_addsub_nc __MPN(addsub_nc)
+/* #define mpn_and_n __MPN(and_n) */
+/* #define mpn_andn_n __MPN(andn_n) */
+#define mpn_bdivmod __MPN(bdivmod)
+#define mpn_cmp __MPN(cmp)
+/* #define mpn_com_n __MPN(com_n) */
+#define mpn_copyd __MPN(copyd)
+#define mpn_copyi __MPN(copyi)
+#define mpn_divrem __MPN(divrem)
+#define mpn_divrem_1 __MPN(divrem_1)
+#define mpn_divrem_2 __MPN(divrem_2)
+#define mpn_dump __MPN(dump)
+#define mpn_gcd __MPN(gcd)
+#define mpn_gcd_1 __MPN(gcd_1)
+#define mpn_gcdext __MPN(gcdext)
+#define mpn_get_str __MPN(get_str)
+#define mpn_hamdist __MPN(hamdist)
+#define mpn_invert_limb __MPN(invert_limb)
+/* #define mpn_ior_n __MPN(ior_n) */
+/* #define mpn_iorn_n __MPN(iorn_n) */
+/* #define mpn_kara_mul_n __MPN(kara_mul_n) internal */
+/* #define mpn_kara_sqr_n __MPN(kara_sqr_n) internal */
+#define mpn_lshift __MPN(lshift)
+#define mpn_lshiftc __MPN(lshiftc)
+#define mpn_mod_1 __MPN(mod_1)
+#define mpn_mul __MPN(mul)
+#define mpn_mul_1 __MPN(mul_1)
+#define mpn_mul_basecase __MPN(mul_basecase)
+#define mpn_mul_n __MPN(mul_n)
+#define mpn_perfect_square_p __MPN(perfect_square_p)
+#define mpn_popcount __MPN(popcount)
+#define mpn_preinv_mod_1 __MPN(preinv_mod_1)
+/* #define mpn_nand_n __MPN(nand_n) */
+/* #define mpn_nior_n __MPN(nior_n) */
+#define mpn_random __MPN(random)
+#define mpn_random2 __MPN(random2)
+#define mpn_rshift __MPN(rshift)
+#define mpn_rshiftc __MPN(rshiftc)
+#define mpn_scan0 __MPN(scan0)
+#define mpn_scan1 __MPN(scan1)
+#define mpn_set_str __MPN(set_str)
+#define mpn_sqr_basecase __MPN(sqr_basecase)
+#define mpn_sqr_n __MPN(sqr_n)
+#define mpn_sqrtrem __MPN(sqrtrem)
+#define mpn_sub __MPN(sub)
+#define mpn_sub_1 __MPN(sub_1)
+#define mpn_sub_n __MPN(sub_n)
+#define mpn_sub_nc __MPN(sub_nc)
+#define mpn_submul_1 __MPN(submul_1)
+/* #define mpn_toom3_mul_n __MPN(toom3_mul_n) internal */
+/* #define mpn_toom3_sqr_n __MPN(toom3_sqr_n) internal */
+/* #define mpn_xnor_n __MPN(xnor_n) */
+/* #define mpn_xor_n __MPN(xor_n) */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+DLL_IMPORT mp_limb_t mpn_add _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr,mp_size_t));
+DLL_IMPORT mp_limb_t mpn_add_1 _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+DLL_IMPORT mp_limb_t mpn_add_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_add_nc _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
+
+DLL_IMPORT mp_limb_t mpn_addmul_1 _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+
+#define mpn_addmul_1c __MPN(addmul_1c)
+DLL_IMPORT mp_limb_t mpn_addmul_1c _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
+
+DLL_IMPORT mp_limb_t mpn_addsub_n _PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_bdivmod _PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t, unsigned long int));
+DLL_IMPORT int mpn_cmp _PROTO ((mp_srcptr, mp_srcptr, mp_size_t));
+
+#define mpn_divexact_by3(dst, src, size) mpn_divexact_by3c (dst, src, size, 0)
+
+#define mpn_divexact_by3c __MPN(divexact_by3c)
+DLL_IMPORT mp_limb_t mpn_divexact_by3c _PROTO ((mp_ptr dst, mp_srcptr src,
+ mp_size_t size, mp_limb_t carry));
+
+#define mpn_divmod_1(qp,np,nsize,dlimb) mpn_divrem_1 (qp,0,np,nsize,dlimb)
+
+DLL_IMPORT mp_limb_t mpn_divrem _PROTO((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
+
+DLL_IMPORT mp_limb_t mpn_divrem_1 _PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
+
+#define mpn_divrem_1c __MPN(divrem_1c)
+DLL_IMPORT mp_limb_t mpn_divrem_1c _PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t,
+ mp_limb_t, mp_limb_t));
+
+DLL_IMPORT mp_limb_t mpn_divrem_2 _PROTO ((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr));
+DLL_IMPORT void mpn_dump _PROTO ((mp_srcptr, mp_size_t));
+mp_size_t mpn_gcd _PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_gcd_1 _PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
+mp_size_t mpn_gcdext _PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
+DLL_IMPORT size_t mpn_get_str _PROTO ((unsigned char *, int, mp_ptr, mp_size_t));
+DLL_IMPORT unsigned long int mpn_hamdist _PROTO ((mp_srcptr, mp_srcptr, mp_size_t));
+
+#define mpn_jacobi_base __MPN(jacobi_base)
+DLL_IMPORT int mpn_jacobi_base _PROTO ((mp_limb_t a, mp_limb_t b, int result_bit1));
+
+DLL_IMPORT mp_limb_t mpn_lshift _PROTO ((mp_ptr, mp_srcptr, mp_size_t, unsigned int));
+DLL_IMPORT mp_limb_t mpn_mod_1 _PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
+
+#define mpn_mod_1c __MPN(mod_1c)
+DLL_IMPORT mp_limb_t mpn_mod_1c _PROTO ((mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
+
+#define mpn_mod_1_rshift __MPN(mod_1_rshift)
+DLL_IMPORT mp_limb_t mpn_mod_1_rshift _PROTO ((mp_srcptr, mp_size_t, unsigned,mp_limb_t));
+
+DLL_IMPORT mp_limb_t mpn_mul _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_mul_1 _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+
+#define mpn_mul_1c __MPN(mul_1c)
+DLL_IMPORT mp_limb_t mpn_mul_1c _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
+
+DLL_IMPORT void mpn_mul_basecase _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
+DLL_IMPORT void mpn_mul_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+DLL_IMPORT int mpn_perfect_square_p _PROTO ((mp_srcptr, mp_size_t));
+DLL_IMPORT unsigned long int mpn_popcount _PROTO ((mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_preinv_mod_1 _PROTO ((mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
+DLL_IMPORT void mpn_random _PROTO ((mp_ptr, mp_size_t));
+DLL_IMPORT void mpn_random2 _PROTO ((mp_ptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_rshift _PROTO ((mp_ptr, mp_srcptr, mp_size_t, unsigned int));
+DLL_IMPORT unsigned long int mpn_scan0 _PROTO ((mp_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpn_scan1 _PROTO ((mp_srcptr, unsigned long int));
+mp_size_t mpn_set_str _PROTO ((mp_ptr, __gmp_const unsigned char *, size_t, int));
+DLL_IMPORT void mpn_sqr_n _PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+DLL_IMPORT void mpn_sqr_basecase _PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+mp_size_t mpn_sqrtrem _PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_sub _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr,mp_size_t));
+DLL_IMPORT mp_limb_t mpn_sub_1 _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+DLL_IMPORT mp_limb_t mpn_sub_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_sub_nc _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
+DLL_IMPORT mp_limb_t mpn_submul_1 _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+
+#define mpn_submul_1c __MPN(submul_1c)
+DLL_IMPORT mp_limb_t mpn_submul_1c _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
+
+#define mpn_tdiv_qr __MPN(tdiv_qr)
+DLL_IMPORT void mpn_tdiv_qr _PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
+
+#if defined (__cplusplus)
+}
+#endif
+
+/* Add the single-limb value incr to the multi-limb number at p, in place,
+   propagating any carry into higher limbs.  NOTE(review): there is no size
+   argument -- the caller must guarantee the number has enough limbs to
+   absorb the carry, otherwise the while loop walks past the end.  */
+#define mpn_incr_u(p,incr) \
+  do { mp_limb_t __x; mp_ptr __p = p;  \
+    __x = *__p + incr;                 \
+    *__p = __x;                        \
+    if (__x < incr)                    \
+      while (++(*(++__p)) == 0)        \
+        ;                              \
+  } while (0)
+
+/* Subtract the single-limb value incr from the multi-limb number at p, in
+   place, propagating any borrow into higher limbs.  Same caveat as
+   mpn_incr_u: the caller must ensure the borrow is eventually absorbed.  */
+#define mpn_decr_u(p,incr) \
+  do { mp_limb_t __x; mp_ptr __p = p;  \
+    __x = *__p;                        \
+    *__p = __x - incr;                 \
+    if (__x < incr)                    \
+      while ((*(++__p))-- == 0)        \
+        ;                              \
+  } while (0)
+
+#if defined (__GNUC__) || defined (_FORCE_INLINES)
+/* mpn_add_1 -- add the single limb s2_limb to the s1_size-limb number
+   {s1_ptr, s1_size}, storing the sum at res_ptr.  Returns the carry out
+   (0 or 1).  res_ptr may equal s1_ptr (in-place operation).
+   NOTE(review): assumes s1_size >= 1.  */
+_EXTERN_INLINE mp_limb_t
+#if (__STDC__-0) || defined (__cplusplus)
+mpn_add_1 (register mp_ptr res_ptr,
+	   register mp_srcptr s1_ptr,
+	   register mp_size_t s1_size,
+	   register mp_limb_t s2_limb)
+#else
+mpn_add_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+     register mp_ptr res_ptr;
+     register mp_srcptr s1_ptr;
+     register mp_size_t s1_size;
+     register mp_limb_t s2_limb;
+#endif
+{
+  register mp_limb_t x;
+
+  x = *s1_ptr++;
+  s2_limb = x + s2_limb;
+  *res_ptr++ = s2_limb;
+  /* Unsigned wraparound means the low-limb addition carried.  */
+  if (s2_limb < x)
+    {
+      /* Propagate the carry upward until some limb does not wrap.  */
+      while (--s1_size != 0)
+	{
+	  x = *s1_ptr++ + 1;
+	  *res_ptr++ = x;
+	  if (x != 0)
+	    goto fin;           /* carry absorbed */
+	}
+
+      /* Every remaining limb was all-ones: carry falls out the top.  */
+      return 1;
+    }
+
+ fin:
+  /* No further carry: the remaining high limbs are unchanged, but they
+     still have to be copied when the operation is not in-place.  */
+  if (res_ptr != s1_ptr)
+    {
+      mp_size_t i;
+      for (i = 0; i < s1_size - 1; i++)
+	res_ptr[i] = s1_ptr[i];
+    }
+  return 0;
+}
+
+/* mpn_add -- add {s1_ptr, s1_size} and {s2_ptr, s2_size}, storing the sum
+   at res_ptr.  Returns the carry out (0 or 1).
+   NOTE(review): the code only handles s1's limbs beyond s2_size, so the
+   caller must ensure s1_size >= s2_size (standard GMP mpn convention).  */
+_EXTERN_INLINE mp_limb_t
+#if (__STDC__-0) || defined (__cplusplus)
+mpn_add (register mp_ptr res_ptr,
+	 register mp_srcptr s1_ptr,
+	 register mp_size_t s1_size,
+	 register mp_srcptr s2_ptr,
+	 register mp_size_t s2_size)
+#else
+mpn_add (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size)
+     register mp_ptr res_ptr;
+     register mp_srcptr s1_ptr;
+     register mp_size_t s1_size;
+     register mp_srcptr s2_ptr;
+     register mp_size_t s2_size;
+#endif
+{
+  mp_limb_t cy_limb = 0;
+
+  /* Add the overlapping low s2_size limbs of both operands.  */
+  if (s2_size != 0)
+    cy_limb = mpn_add_n (res_ptr, s1_ptr, s2_ptr, s2_size);
+
+  /* Propagate the carry through s1's remaining high limbs.  */
+  if (s1_size - s2_size != 0)
+    cy_limb = mpn_add_1 (res_ptr + s2_size,
+			 s1_ptr + s2_size,
+			 s1_size - s2_size,
+			 cy_limb);
+  return cy_limb;
+}
+
+/* mpn_sub_1 -- subtract the single limb s2_limb from the s1_size-limb
+   number {s1_ptr, s1_size}, storing the difference at res_ptr.  Returns
+   the borrow out (0 or 1).  res_ptr may equal s1_ptr (in-place).
+   NOTE(review): assumes s1_size >= 1.  */
+_EXTERN_INLINE mp_limb_t
+#if (__STDC__-0) || defined (__cplusplus)
+mpn_sub_1 (register mp_ptr res_ptr,
+	   register mp_srcptr s1_ptr,
+	   register mp_size_t s1_size,
+	   register mp_limb_t s2_limb)
+#else
+mpn_sub_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+     register mp_ptr res_ptr;
+     register mp_srcptr s1_ptr;
+     register mp_size_t s1_size;
+     register mp_limb_t s2_limb;
+#endif
+{
+  register mp_limb_t x;
+
+  x = *s1_ptr++;
+  s2_limb = x - s2_limb;
+  *res_ptr++ = s2_limb;
+  /* A result larger than the minuend means the subtraction borrowed.  */
+  if (s2_limb > x)
+    {
+      /* Propagate the borrow: a source limb of 0 wraps to all-ones and
+	 keeps borrowing; any non-zero limb absorbs it.  */
+      while (--s1_size != 0)
+	{
+	  x = *s1_ptr++;
+	  *res_ptr++ = x - 1;
+	  if (x != 0)
+	    goto fin;           /* borrow absorbed */
+	}
+
+      /* Borrow ran off the top of the number.  */
+      return 1;
+    }
+
+ fin:
+  /* No further borrow: copy the untouched high limbs when the operation
+     is not in-place.  */
+  if (res_ptr != s1_ptr)
+    {
+      mp_size_t i;
+      for (i = 0; i < s1_size - 1; i++)
+	res_ptr[i] = s1_ptr[i];
+    }
+  return 0;
+}
+
+/* mpn_sub -- subtract {s2_ptr, s2_size} from {s1_ptr, s1_size}, storing
+   the difference at res_ptr.  Returns the borrow out (0 or 1).
+   NOTE(review): caller must ensure s1_size >= s2_size (and, for a
+   non-negative result, s1 >= s2) -- standard GMP mpn convention.  */
+_EXTERN_INLINE mp_limb_t
+#if (__STDC__-0) || defined (__cplusplus)
+mpn_sub (register mp_ptr res_ptr,
+	 register mp_srcptr s1_ptr,
+	 register mp_size_t s1_size,
+	 register mp_srcptr s2_ptr,
+	 register mp_size_t s2_size)
+#else
+mpn_sub (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size)
+     register mp_ptr res_ptr;
+     register mp_srcptr s1_ptr;
+     register mp_size_t s1_size;
+     register mp_srcptr s2_ptr;
+     register mp_size_t s2_size;
+#endif
+{
+  mp_limb_t cy_limb = 0;
+
+  /* Subtract the overlapping low s2_size limbs.  */
+  if (s2_size != 0)
+    cy_limb = mpn_sub_n (res_ptr, s1_ptr, s2_ptr, s2_size);
+
+  /* Propagate the borrow through s1's remaining high limbs.  */
+  if (s1_size - s2_size != 0)
+    cy_limb = mpn_sub_1 (res_ptr + s2_size,
+			 s1_ptr + s2_size,
+			 s1_size - s2_size,
+			 cy_limb);
+  return cy_limb;
+}
+#endif /* __GNUC__ */
+
+/* Allow faster testing for negative, zero, and positive. */
+#define mpz_sgn(Z) ((Z)->_mp_size < 0 ? -1 : (Z)->_mp_size > 0)
+#define mpf_sgn(F) ((F)->_mp_size < 0 ? -1 : (F)->_mp_size > 0)
+#define mpq_sgn(Q) ((Q)->_mp_num._mp_size < 0 ? -1 : (Q)->_mp_num._mp_size > 0)
+
+/* When using GCC, optimize certain common comparisons. */
+#if defined (__GNUC__)
+#define mpz_cmp_ui(Z,UI) \
+ (__builtin_constant_p (UI) && (UI) == 0 \
+ ? mpz_sgn (Z) : _mpz_cmp_ui (Z,UI))
+#define mpz_cmp_si(Z,SI) \
+ (__builtin_constant_p (SI) && (SI) == 0 ? mpz_sgn (Z) \
+ : __builtin_constant_p (SI) && (SI) > 0 \
+ ? _mpz_cmp_ui (Z, (unsigned long int) SI) \
+ : _mpz_cmp_si (Z,SI))
+#define mpq_cmp_ui(Q,NUI,DUI) \
+ (__builtin_constant_p (NUI) && (NUI) == 0 \
+ ? mpq_sgn (Q) : _mpq_cmp_ui (Q,NUI,DUI))
+#else
+#define mpz_cmp_ui(Z,UI) _mpz_cmp_ui (Z,UI)
+#define mpz_cmp_si(Z,UI) _mpz_cmp_si (Z,UI)
+#define mpq_cmp_ui(Q,NUI,DUI) _mpq_cmp_ui (Q,NUI,DUI)
+#endif
+
+
+/* Using "&" rather than "&&" means these can come out branch-free. Every
+ mpz_t has at least one limb allocated, so fetching the low limb is always
+ allowed. */
+#define mpz_odd_p(z) ((int) ((z)->_mp_size != 0) & (int) (z)->_mp_d[0])
+#define mpz_even_p(z) (! mpz_odd_p (z))
+
+
+/* Allow direct user access to numerator and denominator of a mpq_t object. */
+#define mpq_numref(Q) (&((Q)->_mp_num))
+#define mpq_denref(Q) (&((Q)->_mp_den))
+
+
+/* Compatibility with GMP 2 and earlier. */
+#define mpn_divmod(qp,np,nsize,dp,dsize) mpn_divrem (qp,0,np,nsize,dp,dsize)
+
+/* Compatibility with GMP 1. */
+#define mpz_mdiv mpz_fdiv_q
+#define mpz_mdivmod mpz_fdiv_qr
+#define mpz_mmod mpz_fdiv_r
+#define mpz_mdiv_ui mpz_fdiv_q_ui
+#define mpz_mdivmod_ui(q,r,n,d) \
+ ((r == 0) ? mpz_fdiv_q_ui (q,n,d) : mpz_fdiv_qr_ui (q,r,n,d))
+#define mpz_mmod_ui(r,n,d) \
+ ((r == 0) ? mpz_fdiv_ui (n,d) : mpz_fdiv_r_ui (r,n,d))
+
+/* Useful synonyms, but not quite compatible with GMP 1. */
+#define mpz_div mpz_fdiv_q
+#define mpz_divmod mpz_fdiv_qr
+#define mpz_div_ui mpz_fdiv_q_ui
+#define mpz_divmod_ui mpz_fdiv_qr_ui
+#define mpz_mod_ui mpz_fdiv_r_ui
+#define mpz_div_2exp mpz_fdiv_q_2exp
+#define mpz_mod_2exp mpz_fdiv_r_2exp
+
+#define gmp_errno __gmp_errno
+extern int gmp_errno;
+
+enum
+{
+ GMP_ERROR_NONE = 0,
+ GMP_ERROR_UNSUPPORTED_ARGUMENT = 1,
+ GMP_ERROR_DIVISION_BY_ZERO = 2,
+ GMP_ERROR_SQRT_OF_NEGATIVE = 4,
+ GMP_ERROR_INVALID_ARGUMENT = 8,
+ GMP_ERROR_ALLOCATE = 16,
+ GMP_ERROR_BAD_STRING = 32,
+ GMP_ERROR_UNUSED_ERROR
+};
+
+/* Note: major version number is in mp.h too */
+#define __GNU_MP_VERSION 3
+#define __GNU_MP_VERSION_MINOR 1
+#define __GNU_MP_VERSION_PATCHLEVEL 1
+
+#define gmp_version __gmp_version
+extern __gmp_const char *gmp_version;
+
+#define __GMP_H__
+#endif /* __GMP_H__ */
diff --git a/rts/gmp/insert-dbl.c b/rts/gmp/insert-dbl.c
new file mode 100644
index 0000000000..dc88a56f62
--- /dev/null
+++ b/rts/gmp/insert-dbl.c
@@ -0,0 +1,98 @@
+/* __gmp_scale2 -- multiply a double by a power of two (d * 2**exp).
+   (The original header said "__gmp_insert_double -- convert from array of
+   mp_limb_t to double", which does not match the function defined below.)
+
+Copyright (C) 1996, 1997, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifdef XDEBUG
+#undef _GMP_IEEE_FLOATS
+#endif
+
+#ifndef _GMP_IEEE_FLOATS
+#define _GMP_IEEE_FLOATS 0
+#endif
+
+/* __gmp_scale2 -- return d * 2**exp.
+
+   With IEEE doubles the scaling is done by adjusting the biased exponent
+   field directly: overflow is clamped to +/-infinity and underflow is
+   turned into a denormal by repeated halving.  The generic fallback
+   multiplies by 2**exp computed with binary exponentiation.  */
+double
+#if __STDC__
+__gmp_scale2 (double d, int exp)
+#else
+__gmp_scale2 (d, exp)
+     double d;
+     int exp;
+#endif
+{
+#if _GMP_IEEE_FLOATS
+  {
+#if defined (__alpha) && __GNUC__ == 2 && __GNUC_MINOR__ == 8
+    /* Work around alpha-specific bug in GCC 2.8.x.  */
+    volatile
+#endif
+    union ieee_double_extract x;
+    x.d = d;
+    /* New biased exponent; may be out of the field's range, which the two
+       branches below correct.  */
+    exp += x.s.exp;
+    x.s.exp = exp;
+    if (exp >= 2047)
+      {
+        /* Return +-infinity (exponent field all ones, mantissa zero;
+	   the sign bit is left as d's).  */
+        x.s.exp = 2047;
+        x.s.manl = x.s.manh = 0;
+      }
+    else if (exp < 1)
+      {
+        x.s.exp = 1;            /* smallest exponent (biased) */
+        /* Divide result by 2 until we have scaled it to the right IEEE
+           denormalized number, but stop if it becomes zero.  */
+        while (exp < 1 && x.d != 0)
+          {
+            x.d *= 0.5;
+            exp++;
+          }
+      }
+    return x.d;
+  }
+#else
+  {
+    double factor, r;
+
+    factor = 2.0;
+    if (exp < 0)
+      {
+        factor = 0.5;
+        exp = -exp;
+      }
+    /* Binary exponentiation: square factor for each bit of exp, and
+       multiply r in for each set bit.  */
+    r = d;
+    if (exp != 0)
+      {
+        if ((exp & 1) != 0)
+          r *= factor;
+        exp >>= 1;
+        while (exp != 0)
+          {
+            factor *= factor;
+            if ((exp & 1) != 0)
+              r *= factor;
+            exp >>= 1;
+          }
+      }
+    return r;
+  }
+#endif
+}
diff --git a/rts/gmp/install-sh b/rts/gmp/install-sh
new file mode 100644
index 0000000000..e9de23842d
--- /dev/null
+++ b/rts/gmp/install-sh
@@ -0,0 +1,251 @@
+#!/bin/sh
+#
+# install - install a program, script, or datafile
+# This comes from X11R5 (mit/util/scripts/install.sh).
+#
+# Copyright 1991 by the Massachusetts Institute of Technology
+#
+# Permission to use, copy, modify, distribute, and sell this software and its
+# documentation for any purpose is hereby granted without fee, provided that
+# the above copyright notice appear in all copies and that both that
+# copyright notice and this permission notice appear in supporting
+# documentation, and that the name of M.I.T. not be used in advertising or
+# publicity pertaining to distribution of the software without specific,
+# written prior permission.  M.I.T. makes no representations about the
+# suitability of this software for any purpose.  It is provided "as is"
+# without express or implied warranty.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.  It can only install one file at a time, a restriction
+# shared with many OS's install programs.
+
+
+# set DOITPROG to echo to test this script
+
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+doit="${DOITPROG-}"
+
+
+# put in absolute paths if you don't have them in your path; or use env. vars.
+
+mvprog="${MVPROG-mv}"
+cpprog="${CPPROG-cp}"
+chmodprog="${CHMODPROG-chmod}"
+chownprog="${CHOWNPROG-chown}"
+chgrpprog="${CHGRPPROG-chgrp}"
+stripprog="${STRIPPROG-strip}"
+rmprog="${RMPROG-rm}"
+mkdirprog="${MKDIRPROG-mkdir}"
+
+transformbasename=""
+# Fix: this used to initialize "transform_arg", a variable nothing reads;
+# the -t= option and the rename logic below use "transformarg".
+transformarg=""
+instcmd="$mvprog"
+chmodcmd="$chmodprog 0755"
+chowncmd=""
+chgrpcmd=""
+stripcmd=""
+rmcmd="$rmprog -f"
+mvcmd="$mvprog"
+src=""
+dst=""
+dir_arg=""
+
+# Parse the BSD-install-compatible options; the two non-option arguments
+# are the source and the destination, in that order.
+while [ x"$1" != x ]; do
+    case $1 in
+	-c) instcmd="$cpprog"
+	    shift
+	    continue;;
+
+	-d) dir_arg=true
+	    shift
+	    continue;;
+
+	-m) chmodcmd="$chmodprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-o) chowncmd="$chownprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-g) chgrpcmd="$chgrpprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-s) stripcmd="$stripprog"
+	    shift
+	    continue;;
+
+	-t=*) transformarg=`echo $1 | sed 's/-t=//'`
+	    shift
+	    continue;;
+
+	-b=*) transformbasename=`echo $1 | sed 's/-b=//'`
+	    shift
+	    continue;;
+
+	*)  if [ x"$src" = x ]
+	    then
+		src=$1
+	    else
+		# this colon is to work around a 386BSD /bin/sh bug
+		:
+		dst=$1
+	    fi
+	    shift
+	    continue;;
+    esac
+done
+
+if [ x"$src" = x ]
+then
+	echo "install:	no input file specified"
+	exit 1
+else
+	true
+fi
+
+if [ x"$dir_arg" != x ]; then
+	# -d mode: the single argument is a directory to create.
+	dst=$src
+	src=""
+	
+	if [ -d $dst ]; then
+		instcmd=:
+		chmodcmd=""
+	else
+		instcmd=mkdir
+	fi
+else
+
+# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
+# might cause directories to be created, which would be especially bad 
+# if $src (and thus $dsttmp) contains '*'.
+
+	if [ -f $src -o -d $src ]
+	then
+		true
+	else
+		echo "install:  $src does not exist"
+		exit 1
+	fi
+	
+	if [ x"$dst" = x ]
+	then
+		echo "install:	no destination specified"
+		exit 1
+	else
+		true
+	fi
+
+# If destination is a directory, append the input filename; if your system
+# does not like double slashes in filenames, you may need to add some logic
+
+	if [ -d $dst ]
+	then
+		dst="$dst"/`basename $src`
+	else
+		true
+	fi
+fi
+
+## this sed command emulates the dirname command
+dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
+
+# Make sure that the destination directory exists.
+#  this part is taken from Noah Friedman's mkinstalldirs script
+
+# Skip lots of stat calls in the usual case.
+if [ ! -d "$dstdir" ]; then
+defaultIFS='	
+'
+IFS="${IFS-${defaultIFS}}"
+
+oIFS="${IFS}"
+# Some sh's can't handle IFS=/ for some reason.
+# Split the path on '/' by first mapping it to '%' (restoring a leading
+# '/') and using '%' as the field separator.
+IFS='%'
+set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
+IFS="${oIFS}"
+
+pathcomp=''
+
+# Create each missing component of the destination directory in turn.
+while [ $# -ne 0 ] ; do
+	pathcomp="${pathcomp}${1}"
+	shift
+
+	if [ ! -d "${pathcomp}" ] ;
+        then
+		$mkdirprog "${pathcomp}"
+	else
+		true
+	fi
+
+	pathcomp="${pathcomp}/"
+done
+fi
+
+if [ x"$dir_arg" != x ]
+then
+	$doit $instcmd $dst &&
+
+	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
+	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
+	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
+	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
+else
+
+# If we're going to rename the final executable, determine the name now.
+
+	if [ x"$transformarg" = x ] 
+	then
+		dstfile=`basename $dst`
+	else
+		dstfile=`basename $dst $transformbasename | 
+			sed $transformarg`$transformbasename
+	fi
+
+# don't allow the sed command to completely eliminate the filename
+
+	if [ x"$dstfile" = x ] 
+	then
+		dstfile=`basename $dst`
+	else
+		true
+	fi
+
+# Make a temp file name in the proper directory.
+
+	dsttmp=$dstdir/#inst.$$#
+
+# Move or copy the file name to the temp name
+
+	$doit $instcmd $src $dsttmp &&
+
+	trap "rm -f ${dsttmp}" 0 &&
+
+# and set any options; do chmod last to preserve setuid bits
+
+# If any of these fail, we abort the whole thing.  If we want to
+# ignore errors from any of these, just make sure not to ignore
+# errors from the above "$doit $instcmd $src $dsttmp" command.
+
+	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
+	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
+	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
+	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
+
+# Now rename the file to the real destination.
+
+	$doit $rmcmd -f $dstdir/$dstfile &&
+	$doit $mvcmd $dsttmp $dstdir/$dstfile 
+
+fi &&
+
+
+exit 0
diff --git a/rts/gmp/longlong.h b/rts/gmp/longlong.h
new file mode 100644
index 0000000000..9a12755053
--- /dev/null
+++ b/rts/gmp/longlong.h
@@ -0,0 +1,1347 @@
+/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this file; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* You have to define the following before including this file:
+
+ UWtype -- An unsigned type, default type for operations (typically a "word")
+ UHWtype -- An unsigned type, at least half the size of UWtype.
+   UDWtype -- An unsigned type, at least twice as large as UWtype
+ W_TYPE_SIZE -- size in bits of UWtype
+
+ SItype, USItype -- Signed and unsigned 32 bit types.
+ DItype, UDItype -- Signed and unsigned 64 bit types.
+
+ On a 32 bit machine UWtype should typically be USItype;
+ on a 64 bit machine, UWtype should typically be UDItype.
+*/
+
+#define __BITS4 (W_TYPE_SIZE / 4)
+#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
+#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
+#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
+
+/* This is used to make sure no undesirable sharing between different libraries
+ that use this file takes place. */
+#ifndef __MPN
+#define __MPN(x) __##x
+#endif
+
+#ifndef _PROTO
+#if (__STDC__-0) || defined (__cplusplus)
+#define _PROTO(x) x
+#else
+#define _PROTO(x) ()
+#endif
+#endif
+
+/* Define auxiliary asm macros.
+
+   1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
+   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
+ word product in HIGH_PROD and LOW_PROD.
+
+ 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
+ UDWtype product. This is just a variant of umul_ppmm.
+
+ 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+ denominator) divides a UDWtype, composed by the UWtype integers
+ HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
+ in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
+ than DENOMINATOR for correct operation. If, in addition, the most
+ significant bit of DENOMINATOR must be 1, then the pre-processor symbol
+ UDIV_NEEDS_NORMALIZATION is defined to 1.
+
+ 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+ denominator). Like udiv_qrnnd but the numbers are signed. The quotient
+ is rounded towards 0.
+
+ 5) count_leading_zeros(count, x) counts the number of zero-bits from the
+ msb to the first non-zero bit in the UWtype X. This is the number of
+ steps X needs to be shifted left to set the msb. Undefined for X == 0,
+ unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
+
+ 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
+ from the least significant end.
+
+ 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
+ high_addend_2, low_addend_2) adds two UWtype integers, composed by
+ HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
+ respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
+ (i.e. carry out) is not stored anywhere, and is lost.
+
+ 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
+ high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
+   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
+   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
+ and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
+ and is lost.
+
+ If any of these macros are left undefined for a particular CPU,
+ C macros are used. */
+
+/* The CPUs come in alphabetical order below.
+
+ Please add support for more CPUs here, or improve the current support
+ for the CPUs below! */
+
+#if defined (__alpha) && W_TYPE_SIZE == 64
+#if defined (__GNUC__)
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ UDItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("umulh %r1,%2,%0" \
+ : "=r" (ph) \
+ : "%rJ" (m0), "rI" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#define UMUL_TIME 18
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { UDItype __di; \
+ __di = __MPN(invert_limb) (d); \
+ udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
+ } while (0)
+#define UDIV_NEEDS_NORMALIZATION 1
+#define UDIV_TIME 220
+long __MPN(count_leading_zeros) ();
+#define count_leading_zeros(count, x) \
+ ((count) = __MPN(count_leading_zeros) (x))
+#endif /* LONGLONG_STANDALONE */
+#else /* ! __GNUC__ */
+#include <machine/builtins.h>
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ UDItype __m0 = (m0), __m1 = (m1); \
+ (ph) = __UMULH (m0, m1); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#endif
+#endif /* __alpha */
+
+#if defined (__hppa) && W_TYPE_SIZE == 64
+/* We put the result pointer parameter last here, since it makes passing
+ of the other parameters more efficient. */
+#ifndef LONGLONG_STANDALONE
+#define umul_ppmm(wh, wl, u, v) \
+ do { \
+ UDItype __p0; \
+ (wh) = __MPN(umul_ppmm) (u, v, &__p0); \
+ (wl) = __p0; \
+ } while (0)
+extern UDItype __MPN(umul_ppmm) _PROTO ((UDItype, UDItype, UDItype *));
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { UDItype __r; \
+ (q) = __MPN(udiv_qrnnd) (n1, n0, d, &__r); \
+ (r) = __r; \
+ } while (0)
+extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDItype, UDItype, UDItype *));
+#define UMUL_TIME 8
+#define UDIV_TIME 60
+#endif /* LONGLONG_STANDALONE */
+#endif /* hppa */
+
+#if defined (__ia64) && W_TYPE_SIZE == 64
+#if defined (__GNUC__)
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ UDItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("xma.hu %0 = %1, %2, f0" \
+ : "=e" (ph) \
+ : "e" (m0), "e" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#endif
+#endif
+
+
+#if defined (__GNUC__) && !defined (NO_ASM)
+
+/* We sometimes need to clobber "cc" with gcc2, but that would not be
+ understood by gcc1. Use cpp to avoid major code duplication. */
+#if __GNUC__ < 2
+#define __CLOBBER_CC
+#define __AND_CLOBBER_CC
+#else /* __GNUC__ >= 2 */
+#define __CLOBBER_CC : "cc"
+#define __AND_CLOBBER_CC , "cc"
+#endif /* __GNUC__ < 2 */
+
+#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
+#define umul_ppmm(xh, xl, m0, m1) \
+ do { \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("multiplu %0,%1,%2" \
+ : "=r" (xl) \
+ : "r" (__m0), "r" (__m1)); \
+ __asm__ ("multmu %0,%1,%2" \
+ : "=r" (xh) \
+ : "r" (__m0), "r" (__m1)); \
+ } while (0)
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("dividu %0,%3,%4" \
+ : "=r" (q), "=q" (r) \
+ : "1" (n1), "r" (n0), "r" (d))
+#define count_leading_zeros(count, x) \
+ __asm__ ("clz %0,%1" \
+ : "=r" (count) \
+ : "r" (x))
+#define COUNT_LEADING_ZEROS_0 32
+#endif /* __a29k__ */
+
+#if defined (__arm__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
+#if 1 || defined (__arm_m__) /* `M' series has widening multiply support */
+#define umul_ppmm(xh, xl, a, b) \
+ __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
+#define smul_ppmm(xh, xl, a, b) \
+ __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
+#define UMUL_TIME 5
+#else
+#define umul_ppmm(xh, xl, a, b) \
+ __asm__ ("%@ Inlined umul_ppmm\n" \
+ "mov %|r0, %2, lsr #16\n" \
+ "mov %|r2, %3, lsr #16\n" \
+ "bic %|r1, %2, %|r0, lsl #16\n" \
+ "bic %|r2, %3, %|r2, lsl #16\n" \
+ "mul %1, %|r1, %|r2\n" \
+ "mul %|r2, %|r0, %|r2\n" \
+ "mul %|r1, %0, %|r1\n" \
+ "mul %0, %|r0, %0\n" \
+ "adds %|r1, %|r2, %|r1\n" \
+ "addcs %0, %0, #65536\n" \
+ "adds %1, %1, %|r1, lsl #16\n" \
+ "adc %0, %0, %|r1, lsr #16" \
+ : "=&r" (xh), "=r" (xl) \
+ : "r" (a), "r" (b) \
+ : "r0", "r1", "r2")
+#define UMUL_TIME 20
+#endif
+#define UDIV_TIME 100
+#endif /* __arm__ */
+
+#if defined (__clipper__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+ ({union {UDItype __ll; \
+ struct {USItype __l, __h;} __i; \
+ } __x; \
+ __asm__ ("mulwux %2,%0" \
+ : "=r" (__x.__ll) \
+ : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
+ (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define smul_ppmm(w1, w0, u, v) \
+ ({union {DItype __ll; \
+ struct {SItype __l, __h;} __i; \
+ } __x; \
+ __asm__ ("mulwx %2,%0" \
+ : "=r" (__x.__ll) \
+ : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
+ (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define __umulsidi3(u, v) \
+ ({UDItype __w; \
+ __asm__ ("mulwux %2,%0" \
+ : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
+ __w; })
+#endif /* __clipper__ */
+
+/* Fujitsu vector computers. */
+#if defined (__uxp__) && W_TYPE_SIZE == 32
+#define umul_ppmm(ph, pl, u, v) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
+ (ph) = __x.__i.__h; \
+ (pl) = __x.__i.__l; \
+ } while (0)
+#define smul_ppmm(ph, pl, u, v) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
+ (ph) = __x.__i.__h; \
+ (pl) = __x.__i.__l; \
+ } while (0)
+#endif
+
+#if defined (__gmicro__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add.w %5,%1\n\taddx %3,%0" \
+ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
+ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define umul_ppmm(ph, pl, m0, m1) \
+ __asm__ ("mulx %3,%0,%1" \
+ : "=g" ((USItype)(ph)), "=r" ((USItype)(pl)) \
+ : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
+#define udiv_qrnnd(q, r, nh, nl, d) \
+ __asm__ ("divx %4,%0,%1" \
+ : "=g" ((USItype)(q)), "=r" ((USItype)(r)) \
+ : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
+#define count_leading_zeros(count, x) \
+ __asm__ ("bsch/1 %1,%0" \
+ : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
+#endif
+
+#if defined (__hppa) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add %4,%5,%1\n\taddc %2,%3,%0" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%rM" (ah), "rM" (bh), "%rM" (al), "rM" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub %4,%5,%1\n\tsubb %2,%3,%0" \
+ : "=r" (sh), "=&r" (sl) \
+ : "rM" (ah), "rM" (bh), "rM" (al), "rM" (bl))
+#if defined (_PA_RISC1_1)
+#define umul_ppmm(wh, wl, u, v) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
+ (wh) = __x.__i.__h; \
+ (wl) = __x.__i.__l; \
+ } while (0)
+#define UMUL_TIME 8
+#define UDIV_TIME 60
+#else
+#define UMUL_TIME 40
+#define UDIV_TIME 80
+#endif
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { USItype __r; \
+ (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
+ (r) = __r; \
+ } while (0)
+extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, USItype, USItype, USItype));
+#endif /* LONGLONG_STANDALONE */
+#define count_leading_zeros(count, x) \
+ do { \
+ USItype __tmp; \
+ __asm__ ( \
+ "ldi 2,%0\n" \
+ "extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
+ "extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
+ "ldo 16(%0),%0 ; Yes. Perform add.\n" \
+ "extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
+ "extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
+ "ldo 8(%0),%0 ; Yes. Perform add.\n" \
+ "extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
+ "extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
+ "ldo 4(%0),%0 ; Yes. Perform add.\n" \
+ "extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
+ "extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
+ "ldo 2(%0),%0 ; Yes. Perform add.\n" \
+ "extru %1,30,1,%1 ; Extract bit 1.\n" \
+ "sub %0,%1,%0 ; Subtract it.\n" \
+ : "=r" (count), "=r" (__tmp) : "1" (x)); \
+ } while (0)
+#endif /* hppa */
+
+#if (defined (__i370__) || defined (__mvs__)) && W_TYPE_SIZE == 32
+#define smul_ppmm(xh, xl, m0, m1) \
+ do { \
+ union {DItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("mr %0,%3" \
+ : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
+ : "%1" (m0), "r" (m1)); \
+ (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
+ } while (0)
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+ do { \
+ union {DItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __x.__i.__h = n1; __x.__i.__l = n0; \
+ __asm__ ("dr %0,%2" \
+ : "=r" (__x.__ll) \
+ : "0" (__x.__ll), "r" (d)); \
+ (q) = __x.__i.__l; (r) = __x.__i.__h; \
+ } while (0)
+#endif
+
+#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addl %5,%1\n\tadcl %3,%0" \
+ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subl %5,%1\n\tsbbl %3,%0" \
+ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("mull %3" \
+ : "=a" (w0), "=d" (w1) \
+ : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("divl %4" \
+ : "=a" (q), "=d" (r) \
+ : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(d)))
+#define count_leading_zeros(count, x) \
+ do { \
+ USItype __cbtmp; \
+ __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
+ (count) = __cbtmp ^ 31; \
+ } while (0)
+#define count_trailing_zeros(count, x) \
+ __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x)))
+#ifndef UMUL_TIME
+#define UMUL_TIME 10
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 40
+#endif
+#endif /* 80x86 */
+
+#if defined (__i860__) && W_TYPE_SIZE == 32
+#define rshift_rhlc(r,h,l,c) \
+  __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
+	   : "=r" (r) : "r" (h), "r" (l), "rn" (c))
+#endif /* i860 */
+
+#if defined (__i960__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
+	   : "=r" (sh), "=&r" (sl) \
+	   : "%dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
+	   : "=r" (sh), "=&r" (sl) \
+	   : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
+#define umul_ppmm(w1, w0, u, v) \
+  ({union {UDItype __ll; \
+	   struct {USItype __l, __h;} __i; \
+	  } __x; \
+  __asm__ ("emul %2,%1,%0" \
+	   : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
+  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define __umulsidi3(u, v) \
+  ({UDItype __w; \
+    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
+    __w; })
+#define udiv_qrnnd(q, r, nh, nl, d) \
+  do { \
+    union {UDItype __ll; \
+	   struct {USItype __l, __h;} __i; \
+	  } __nn, __rq; \
+    __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
+    __asm__ ("ediv %d,%n,%0" \
+	   : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
+    (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
+  } while (0)
+#define count_leading_zeros(count, x) \
+  do { \
+    USItype __cbtmp; \
+    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
+    (count) = __cbtmp ^ 31; \
+  } while (0)
+#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
+#if defined (__i960mx)		/* what is the proper symbol to test??? */
+#define rshift_rhlc(r,h,l,c) \
+  do { \
+    union {UDItype __ll; \
+	   struct {USItype __l, __h;} __i; \
+	  } __nn; \
+    __nn.__i.__h = (h); __nn.__i.__l = (l); \
+    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
+  } while (0)
+#endif /* i960mx */
+#endif /* i960 */
+
+#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
+ || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
+ || defined (__mc5307__)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
+ : "=d" ((USItype)(sh)), "=&d" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), "d" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
+ : "=d" ((USItype)(sh)), "=&d" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
+ "1" ((USItype)(al)), "g" ((USItype)(bl)))
+/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
+#if defined (__mc68020__) || defined(mc68020) \
+ || defined (__mc68030__) || defined (mc68030) \
+ || defined (__mc68040__) || defined (mc68040) \
+ || defined (__mc68332__) || defined (mc68332) \
+ || defined (__NeXT__)
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("mulu%.l %3,%1:%0" \
+ : "=d" ((USItype)(w0)), "=d" ((USItype)(w1)) \
+ : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
+#define UMUL_TIME 45
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("divu%.l %4,%1:%0" \
+ : "=d" ((USItype)(q)), "=d" ((USItype)(r)) \
+ : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
+#define UDIV_TIME 90
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("divs%.l %4,%1:%0" \
+ : "=d" ((USItype)(q)), "=d" ((USItype)(r)) \
+ : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
+#else /* for other 68k family members use 16x16->32 multiplication */
+#define umul_ppmm(xh, xl, a, b) \
+ do { USItype __umul_tmp1, __umul_tmp2; \
+ __asm__ ("| Inlined umul_ppmm\n" \
+ "move%.l %5,%3\n" \
+ "move%.l %2,%0\n" \
+ "move%.w %3,%1\n" \
+ "swap %3\n" \
+ "swap %0\n" \
+ "mulu%.w %2,%1\n" \
+ "mulu%.w %3,%0\n" \
+ "mulu%.w %2,%3\n" \
+ "swap %2\n" \
+ "mulu%.w %5,%2\n" \
+ "add%.l %3,%2\n" \
+ "jcc 1f\n" \
+ "add%.l %#0x10000,%0\n" \
+"1: move%.l %2,%3\n" \
+ "clr%.w %2\n" \
+ "swap %2\n" \
+ "swap %3\n" \
+ "clr%.w %3\n" \
+ "add%.l %3,%1\n" \
+ "addx%.l %2,%0\n" \
+ "| End inlined umul_ppmm" \
+ : "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)), \
+ "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
+ : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
+ } while (0)
+#define UMUL_TIME 100
+#define UDIV_TIME 400
+#endif /* not mc68020 */
+/* The '020, '030, '040 and '060 have bitfield insns. */
+#if defined (__mc68020__) || defined (mc68020) \
+ || defined (__mc68030__) || defined (mc68030) \
+ || defined (__mc68040__) || defined (mc68040) \
+ || defined (__mc68060__) || defined (mc68060) \
+ || defined (__NeXT__)
+#define count_leading_zeros(count, x) \
+ __asm__ ("bfffo %1{%b2:%b2},%0" \
+ : "=d" ((USItype) (count)) \
+ : "od" ((USItype) (x)), "n" (0))
+#define COUNT_LEADING_ZEROS_0 32
+#endif
+#endif /* mc68000 */
+
+#if defined (__m88000__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
+#define count_leading_zeros(count, x) \
+ do { \
+ USItype __cbtmp; \
+ __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
+ (count) = __cbtmp ^ 31; \
+ } while (0)
+#define COUNT_LEADING_ZEROS_0 63 /* sic */
+#if defined (__m88110__)
+#define umul_ppmm(wh, wl, u, v) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
+ (wh) = __x.__i.__h; \
+ (wl) = __x.__i.__l; \
+ } while (0)
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ ({union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x, __q; \
+ __x.__i.__h = (n1); __x.__i.__l = (n0); \
+ __asm__ ("divu.d %0,%1,%2" \
+ : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
+ (r) = (n0) - __q.__l * (d); (q) = __q.__l; })
+#define UMUL_TIME 5
+#define UDIV_TIME 25
+#else
+#define UMUL_TIME 17
+#define UDIV_TIME 150
+#endif /* __m88110__ */
+#endif /* __m88000__ */
+
+#if defined (__mips) && W_TYPE_SIZE == 32
+#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
+#else
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
+ : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
+#endif
+#define UMUL_TIME 10
+#define UDIV_TIME 100
+#endif /* __mips */
+
+#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
+#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
+#else
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
+ : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
+#endif
+#define UMUL_TIME 20
+#define UDIV_TIME 140
+#endif /* __mips */
+
+#if defined (__ns32000__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+ ({union {UDItype __ll; \
+ struct {USItype __l, __h;} __i; \
+ } __x; \
+ __asm__ ("meid %2,%0" \
+ : "=g" (__x.__ll) \
+ : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
+ (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define __umulsidi3(u, v) \
+ ({UDItype __w; \
+ __asm__ ("meid %2,%0" \
+ : "=g" (__w) \
+ : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
+ __w; })
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ ({union {UDItype __ll; \
+ struct {USItype __l, __h;} __i; \
+ } __x; \
+ __x.__i.__h = (n1); __x.__i.__l = (n0); \
+ __asm__ ("deid %2,%0" \
+ : "=g" (__x.__ll) \
+ : "0" (__x.__ll), "g" ((USItype)(d))); \
+ (r) = __x.__i.__l; (q) = __x.__i.__h; })
+#define count_trailing_zeros(count,x) \
+ do { \
+ __asm__ ("ffsd %2,%0" \
+ : "=r" ((USItype) (count)) \
+ : "0" ((USItype) 0), "r" ((USItype) (x))); \
+ } while (0)
+#endif /* __ns32000__ */
+
+/* We should test _IBMR2 here when we add assembly support for the system
+ vendor compilers. */
+#if (defined (_ARCH_PPC) || defined (_ARCH_PWR) || defined (__powerpc__)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ do { \
+ if (__builtin_constant_p (bh) && (bh) == 0) \
+ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
+ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\
+ else \
+ __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
+ } while (0)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ do { \
+ if (__builtin_constant_p (ah) && (ah) == 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+ else \
+ __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
+ : "=r" (sh), "=&r" (sl) \
+ : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
+ } while (0)
+#define count_leading_zeros(count, x) \
+ __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
+#define COUNT_LEADING_ZEROS_0 32
+#if defined (_ARCH_PPC) || defined (__powerpc__)
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#define UMUL_TIME 15
+#define smul_ppmm(ph, pl, m0, m1) \
+ do { \
+ SItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#define SMUL_TIME 14
+#define UDIV_TIME 120
+#else
+#define UMUL_TIME 8
+#define smul_ppmm(xh, xl, m0, m1) \
+ __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
+#define SMUL_TIME 4
+#define sdiv_qrnnd(q, r, nh, nl, d) \
+ __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
+#define UDIV_TIME 100
+#endif
+#endif /* 32-bit POWER architecture variants. */
+
+/* We should test _IBMR2 here when we add assembly support for the system
+ vendor compilers. */
+#if (defined (_ARCH_PPC) || defined (__powerpc__)) && W_TYPE_SIZE == 64
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ do { \
+ if (__builtin_constant_p (bh) && (bh) == 0) \
+ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
+ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\
+ else \
+ __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
+ } while (0)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ do { \
+ if (__builtin_constant_p (ah) && (ah) == 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+ else \
+ __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
+ : "=r" (sh), "=&r" (sl) \
+ : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
+ } while (0)
+#define count_leading_zeros(count, x) \
+ __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
+#define COUNT_LEADING_ZEROS_0 64
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ UDItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#define UMUL_TIME 15
+#define smul_ppmm(ph, pl, m0, m1) \
+ do { \
+ DItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#define SMUL_TIME 14 /* ??? */
+#define UDIV_TIME 120 /* ??? */
+#endif /* 64-bit PowerPC. */
+
+#if defined (__pyr__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addw %5,%1\n\taddwc %3,%0" \
+ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
+ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), "g" ((USItype)(bl)))
+/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
+#define umul_ppmm(w1, w0, u, v) \
+ ({union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
+ : "=&r" (__x.__ll) \
+ : "g" ((USItype) (u)), "g" ((USItype)(v))); \
+ (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#endif /* __pyr__ */
+
+#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  __asm__ ("a %1,%5\n\tae %0,%3" \
+	   : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+	   : "%0" ((USItype)(ah)), "r" ((USItype)(bh)), \
+	     "%1" ((USItype)(al)), "r" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  __asm__ ("s %1,%5\n\tse %0,%3" \
+	   : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+	   : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
+	     "1" ((USItype)(al)), "r" ((USItype)(bl)))
+#define smul_ppmm(ph, pl, m0, m1) \
+  __asm__ ( \
+       "s	r2,r2\n" \
+       "mts	r10,%2\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "m	r2,%3\n" \
+       "cas	%0,r2,r0\n" \
+       "mfs	r10,%1" \
+	   : "=r" ((USItype)(ph)), "=r" ((USItype)(pl)) \
+	   : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
+	   : "r2")
+#define UMUL_TIME 20
+#define UDIV_TIME 200
+#define count_leading_zeros(count, x) \
+  do { \
+    if ((x) >= 0x10000) \
+      __asm__ ("clz	%0,%1" \
+	       : "=r" ((USItype)(count)) : "r" ((USItype)(x) >> 16)); \
+    else \
+      { \
+	__asm__ ("clz	%0,%1" \
+		 : "=r" ((USItype)(count)) : "r" ((USItype)(x))); \
+	(count) += 16; \
+      } \
+  } while (0)
+#endif /* RT/ROMP */
+
+#if defined (__sh2__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
+ : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
+#define UMUL_TIME 5
+#endif
+
+#if defined (__sparc__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
+ __CLOBBER_CC)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
+ : "=r" (sh), "=&r" (sl) \
+ : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
+ __CLOBBER_CC)
+#if defined (__sparc_v9__) || defined (__sparcv9)
+/* Perhaps we should use floating-point operations here? */
+#if 0
+/* Triggers a bug making mpz/tests/t-gcd.c fail.
+ Perhaps we simply need explicitly zero-extend the inputs? */
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
+ "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
+#else
+/* Use v8 umul until above bug is fixed. */
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
+#endif
+/* Use a plain v8 divide for v9. */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { \
+ USItype __q; \
+ __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
+ : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
+ (r) = (n0) - __q * (d); \
+ (q) = __q; \
+ } while (0)
+#else
+#if defined (__sparc_v8__)
+/* Don't match immediate range because, 1) it is not often useful,
+ 2) the 'I' flag thinks of the range as a 13 bit signed interval,
+ while we want to match a 13 bit interval, sign extended to 32 bits,
+ but INTERPRETED AS UNSIGNED. */
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
+#define UMUL_TIME 5
+#ifndef SUPERSPARC /* SuperSPARC's udiv only handles 53 bit dividends */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { \
+ USItype __q; \
+ __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
+ : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
+ (r) = (n0) - __q * (d); \
+ (q) = __q; \
+ } while (0)
+#define UDIV_TIME 25
+#else
+#define UDIV_TIME 60 /* SuperSPARC timing */
+#endif /* SUPERSPARC */
+#else /* ! __sparc_v8__ */
+#if defined (__sparclite__)
+/* This has hardware multiply but not divide. It also has two additional
+ instructions scan (ffs from high bit) and divscc. */
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
+#define UMUL_TIME 5
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("! Inlined udiv_qrnnd\n" \
+ "wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
+ "tst %%g0\n" \
+ "divscc %3,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%0\n" \
+ "rd %%y,%1\n" \
+ "bl,a 1f\n" \
+ "add %1,%4,%1\n" \
+"1: ! End of inline udiv_qrnnd" \
+ : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
+ : "%g1" __AND_CLOBBER_CC)
+#define UDIV_TIME 37
+#define count_leading_zeros(count, x) \
+ __asm__ ("scan %1,0,%0" : "=r" (x) : "r" (count))
+/* Early sparclites return 63 for an argument of 0, but they warn that future
+ implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0
+ undefined. */
+#endif /* __sparclite__ */
+#endif /* __sparc_v8__ */
+#endif /* __sparc_v9__ */
+/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
+#ifndef umul_ppmm
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("! Inlined umul_ppmm\n" \
+ "wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
+ "sra %3,31,%%g2 ! Don't move this insn\n" \
+ "and %2,%%g2,%%g2 ! Don't move this insn\n" \
+ "andcc %%g0,0,%%g1 ! Don't move this insn\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,0,%%g1\n" \
+ "add %%g1,%%g2,%0\n" \
+ "rd %%y,%1" \
+ : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
+ : "%g1", "%g2" __AND_CLOBBER_CC)
+#define UMUL_TIME 39 /* 39 instructions */
+#endif
+#ifndef udiv_qrnnd
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { USItype __r; \
+ (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
+ (r) = __r; \
+ } while (0)
+extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, USItype, USItype, USItype));
+#ifndef UDIV_TIME
+#define UDIV_TIME 140
+#endif
+#endif /* LONGLONG_STANDALONE */
+#endif /* udiv_qrnnd */
+#endif /* __sparc__ */
+
+#if defined (__vax__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
+ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
+ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define smul_ppmm(xh, xl, m0, m1) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __l, __h;} __i; \
+ } __x; \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("emul %1,%2,$0,%0" \
+ : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
+ (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
+ } while (0)
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+ do { \
+ union {DItype __ll; \
+ struct {SItype __l, __h;} __i; \
+ } __x; \
+ __x.__i.__h = n1; __x.__i.__l = n0; \
+ __asm__ ("ediv %3,%2,%0,%1" \
+ : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
+ } while (0)
+#endif /* __vax__ */
+
+#if defined (__z8000__) && W_TYPE_SIZE == 16
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
+ : "=r" ((unsigned int)(sh)), "=&r" ((unsigned int)(sl)) \
+ : "%0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
+ "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
+ : "=r" ((unsigned int)(sh)), "=&r" ((unsigned int)(sl)) \
+ : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
+ "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
+#define umul_ppmm(xh, xl, m0, m1) \
+ do { \
+ union {long int __ll; \
+ struct {unsigned int __h, __l;} __i; \
+ } __x; \
+ unsigned int __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mult %S0,%H3" \
+ : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
+ : "%1" (m0), "rQR" (m1)); \
+ (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
+ (xh) += ((((signed int) __m0 >> 15) & __m1) \
+ + (((signed int) __m1 >> 15) & __m0)); \
+ } while (0)
+#endif /* __z8000__ */
+
+#endif /* __GNUC__ */
+
+
+#if !defined (umul_ppmm) && defined (__umulsidi3)
+#define umul_ppmm(ph, pl, m0, m1) \
+ { \
+ UDWtype __ll = __umulsidi3 (m0, m1); \
+ ph = (UWtype) (__ll >> W_TYPE_SIZE); \
+ pl = (UWtype) __ll; \
+ }
+#endif
+
+#if !defined (__umulsidi3)
+#define __umulsidi3(u, v) \
+ ({UWtype __hi, __lo; \
+ umul_ppmm (__hi, __lo, u, v); \
+ ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
+#endif
+
+
+/* Note the prototypes are under !define(umul_ppmm) etc too, since the HPPA
+ versions above are different and we don't want to conflict. */
+
+#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm
+#define mpn_umul_ppmm __MPN(umul_ppmm)
+extern mp_limb_t mpn_umul_ppmm _PROTO ((mp_limb_t *, mp_limb_t, mp_limb_t));
+#define umul_ppmm(wh, wl, u, v) \
+ do { \
+ mp_limb_t __umul_ppmm__p0; \
+ (wh) = __MPN(umul_ppmm) (&__umul_ppmm__p0, \
+ (mp_limb_t) (u), (mp_limb_t) (v)); \
+ (wl) = __umul_ppmm__p0; \
+ } while (0)
+#endif
+
+#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd
+#define mpn_udiv_qrnnd __MPN(udiv_qrnnd)
+extern mp_limb_t mpn_udiv_qrnnd _PROTO ((mp_limb_t *,
+ mp_limb_t, mp_limb_t, mp_limb_t));
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { \
+ mp_limb_t __udiv_qrnnd__r; \
+ (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \
+ (mp_limb_t) (n1), (mp_limb_t) (n0), (mp_limb_t) d); \
+ (r) = __udiv_qrnnd__r; \
+ } while (0)
+#endif
+
+
+/* If this machine has no inline assembler, use C macros. */
+
+#if !defined (add_ssaaaa)
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ do { \
+ UWtype __x; \
+ __x = (al) + (bl); \
+ (sh) = (ah) + (bh) + (__x < (al)); \
+ (sl) = __x; \
+ } while (0)
+#endif
+
+#if !defined (sub_ddmmss)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ do { \
+ UWtype __x; \
+ __x = (al) - (bl); \
+ (sh) = (ah) - (bh) - (__x > (al)); \
+ (sl) = __x; \
+ } while (0)
+#endif
+
+/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
+ smul_ppmm. */
+#if !defined (umul_ppmm) && defined (smul_ppmm)
+#define umul_ppmm(w1, w0, u, v) \
+ do { \
+ UWtype __w1; \
+ UWtype __xm0 = (u), __xm1 = (v); \
+ smul_ppmm (__w1, w0, __xm0, __xm1); \
+ (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
+ + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
+ } while (0)
+#endif
+
+/* If we still don't have umul_ppmm, define it using plain C. */
+#if !defined (umul_ppmm)
+#define umul_ppmm(w1, w0, u, v) \
+ do { \
+ UWtype __x0, __x1, __x2, __x3; \
+ UHWtype __ul, __vl, __uh, __vh; \
+ UWtype __u = (u), __v = (v); \
+ \
+ __ul = __ll_lowpart (__u); \
+ __uh = __ll_highpart (__u); \
+ __vl = __ll_lowpart (__v); \
+ __vh = __ll_highpart (__v); \
+ \
+ __x0 = (UWtype) __ul * __vl; \
+ __x1 = (UWtype) __ul * __vh; \
+ __x2 = (UWtype) __uh * __vl; \
+ __x3 = (UWtype) __uh * __vh; \
+ \
+ __x1 += __ll_highpart (__x0);/* this can't give carry */ \
+ __x1 += __x2; /* but this indeed can */ \
+ if (__x1 < __x2) /* did we get it? */ \
+ __x3 += __ll_B; /* yes, add it in the proper pos. */ \
+ \
+ (w1) = __x3 + __ll_highpart (__x1); \
+ (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
+ } while (0)
+#endif
+
+/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
+ exist in one form or another. */
+#if !defined (smul_ppmm)
+#define smul_ppmm(w1, w0, u, v) \
+ do { \
+ UWtype __w1; \
+ UWtype __xm0 = (u), __xm1 = (v); \
+ umul_ppmm (__w1, w0, __xm0, __xm1); \
+ (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
+ - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
+ } while (0)
+#endif
+
+/* Define this unconditionally, so it can be used for debugging. */
+#define __udiv_qrnnd_c(q, r, n1, n0, d) \
+ do { \
+ UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
+ __d1 = __ll_highpart (d); \
+ __d0 = __ll_lowpart (d); \
+ \
+ __q1 = (n1) / __d1; \
+ __r1 = (n1) - __q1 * __d1; \
+ __m = (UWtype) __q1 * __d0; \
+ __r1 = __r1 * __ll_B | __ll_highpart (n0); \
+ if (__r1 < __m) \
+ { \
+ __q1--, __r1 += (d); \
+ if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
+ if (__r1 < __m) \
+ __q1--, __r1 += (d); \
+ } \
+ __r1 -= __m; \
+ \
+ __q0 = __r1 / __d1; \
+ __r0 = __r1 - __q0 * __d1; \
+ __m = (UWtype) __q0 * __d0; \
+ __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
+ if (__r0 < __m) \
+ { \
+ __q0--, __r0 += (d); \
+ if (__r0 >= (d)) \
+ if (__r0 < __m) \
+ __q0--, __r0 += (d); \
+ } \
+ __r0 -= __m; \
+ \
+ (q) = (UWtype) __q1 * __ll_B | __q0; \
+ (r) = __r0; \
+ } while (0)
+
+/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
+ __udiv_w_sdiv (defined in libgcc or elsewhere). */
+#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
+#define udiv_qrnnd(q, r, nh, nl, d) \
+ do { \
+ UWtype __r; \
+ (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
+ (r) = __r; \
+ } while (0)
+#endif
+
+/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
+#if !defined (udiv_qrnnd)
+#define UDIV_NEEDS_NORMALIZATION 1
+#define udiv_qrnnd __udiv_qrnnd_c
+#endif
+
+#if !defined (count_leading_zeros)
+extern
+#if __STDC__
+const
+#endif
+unsigned char __clz_tab[];
+#define count_leading_zeros(count, x) \
+ do { \
+ UWtype __xr = (x); \
+ UWtype __a; \
+ \
+ if (W_TYPE_SIZE <= 32) \
+ { \
+ __a = __xr < ((UWtype) 1 << 2*__BITS4) \
+ ? (__xr < ((UWtype) 1 << __BITS4) ? 0 : __BITS4) \
+ : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 : 3*__BITS4);\
+ } \
+ else \
+ { \
+ for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
+ if (((__xr >> __a) & 0xff) != 0) \
+ break; \
+ } \
+ \
+ (count) = W_TYPE_SIZE - (__clz_tab[__xr >> __a] + __a); \
+ } while (0)
+/* This version gives a well-defined value for zero. */
+#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+#endif
+
+#if !defined (count_trailing_zeros)
+/* Define count_trailing_zeros using count_leading_zeros. The latter might be
+ defined in asm, but if it is not, the C version above is good enough. */
+#define count_trailing_zeros(count, x) \
+ do { \
+ UWtype __ctz_x = (x); \
+ UWtype __ctz_c; \
+ count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
+ (count) = W_TYPE_SIZE - 1 - __ctz_c; \
+ } while (0)
+#endif
+
+#ifndef UDIV_NEEDS_NORMALIZATION
+#define UDIV_NEEDS_NORMALIZATION 0
+#endif
+
+/* Give defaults for UMUL_TIME and UDIV_TIME. */
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+/* count_trailing_zeros is often on the slow side, so make that the default */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 15 /* cycles */
+#endif
+
+
diff --git a/rts/gmp/ltconfig b/rts/gmp/ltconfig
new file mode 100644
index 0000000000..6d8cf33e8f
--- /dev/null
+++ b/rts/gmp/ltconfig
@@ -0,0 +1,3109 @@
+#! /bin/sh
+
+# ltconfig - Create a system-specific libtool.
+# Copyright (C) 1996-2000 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# A lot of this script is taken from autoconf-2.10.
+
+# Check that we are running under the correct shell.
+SHELL=${CONFIG_SHELL-/bin/sh}
+echo=echo
+if test "X$1" = X--no-reexec; then
+ # Discard the --no-reexec flag, and continue.
+ shift
+elif test "X$1" = X--fallback-echo; then
+ # Avoid inline document here, it may be left over
+ :
+elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then
+ # Yippee, $echo works!
+ :
+else
+ # Restart under the correct shell.
+ exec "$SHELL" "$0" --no-reexec ${1+"$@"}
+fi
+
+if test "X$1" = X--fallback-echo; then
+ # used as fallback echo
+ shift
+ cat <<EOF
+$*
+EOF
+ exit 0
+fi
+
+# Find the correct PATH separator. Usually this is `:', but
+# DJGPP uses `;' like DOS.
+if test "X${PATH_SEPARATOR+set}" != Xset; then
+ UNAME=${UNAME-`uname 2>/dev/null`}
+ case X$UNAME in
+ *-DOS) PATH_SEPARATOR=';' ;;
+ *) PATH_SEPARATOR=':' ;;
+ esac
+fi
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+if test "X${echo_test_string+set}" != Xset; then
+ # find a string as large as possible, as long as the shell can cope with it
+ for cmd in 'sed 50q "$0"' 'sed 20q "$0"' 'sed 10q "$0"' 'sed 2q "$0"' 'echo test'; do
+ # expected sizes: less than 2Kb, 1Kb, 512 bytes, 16 bytes, ...
+ if (echo_test_string="`eval $cmd`") 2>/dev/null &&
+ echo_test_string="`eval $cmd`" &&
+ (test "X$echo_test_string" = "X$echo_test_string") 2>/dev/null; then
+ break
+ fi
+ done
+fi
+
+if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ :
+else
+ # The Solaris, AIX, and Digital Unix default echo programs unquote
+ # backslashes. This makes it impossible to quote backslashes using
+ # echo "$something" | sed 's/\\/\\\\/g'
+ #
+ # So, first we look for a working echo in the user's PATH.
+
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR}"
+ for dir in $PATH /usr/ucb; do
+ if (test -f $dir/echo || test -f $dir/echo$ac_exeext) &&
+ test "X`($dir/echo '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`($dir/echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ echo="$dir/echo"
+ break
+ fi
+ done
+ IFS="$save_ifs"
+
+ if test "X$echo" = Xecho; then
+ # We didn't find a better echo, so look for alternatives.
+ if test "X`(print -r '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`(print -r "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ # This shell has a builtin print -r that does the trick.
+ echo='print -r'
+ elif (test -f /bin/ksh || test -f /bin/ksh$ac_exeext) &&
+ test "X$CONFIG_SHELL" != X/bin/ksh; then
+ # If we have ksh, try running ltconfig again with it.
+ ORIGINAL_CONFIG_SHELL="${CONFIG_SHELL-/bin/sh}"
+ export ORIGINAL_CONFIG_SHELL
+ CONFIG_SHELL=/bin/ksh
+ export CONFIG_SHELL
+ exec "$CONFIG_SHELL" "$0" --no-reexec ${1+"$@"}
+ else
+ # Try using printf.
+ echo='printf "%s\n"'
+ if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ # Cool, printf works
+ :
+ elif echo_testing_string=`("$ORIGINAL_CONFIG_SHELL" "$0" --fallback-echo '\t') 2>/dev/null` &&
+ test "X$echo_testing_string" = 'X\t' &&
+ echo_testing_string=`("$ORIGINAL_CONFIG_SHELL" "$0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ CONFIG_SHELL="$ORIGINAL_CONFIG_SHELL"
+ export CONFIG_SHELL
+ SHELL="$CONFIG_SHELL"
+ export SHELL
+ echo="$CONFIG_SHELL $0 --fallback-echo"
+ elif echo_testing_string=`("$CONFIG_SHELL" "$0" --fallback-echo '\t') 2>/dev/null` &&
+ test "X$echo_testing_string" = 'X\t' &&
+ echo_testing_string=`("$CONFIG_SHELL" "$0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ echo="$CONFIG_SHELL $0 --fallback-echo"
+ else
+ # maybe with a smaller string...
+ prev=:
+
+ for cmd in 'echo test' 'sed 2q "$0"' 'sed 10q "$0"' 'sed 20q "$0"' 'sed 50q "$0"'; do
+ if (test "X$echo_test_string" = "X`eval $cmd`") 2>/dev/null; then
+ break
+ fi
+ prev="$cmd"
+ done
+
+ if test "$prev" != 'sed 50q "$0"'; then
+ echo_test_string=`eval $prev`
+ export echo_test_string
+ exec "${ORIGINAL_CONFIG_SHELL}" "$0" ${1+"$@"}
+ else
+ # Oops. We lost completely, so just stick with echo.
+ echo=echo
+ fi
+ fi
+ fi
+ fi
+fi
+
+# Sed substitution that helps us do robust quoting. It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e s/^X//'
+sed_quote_subst='s/\([\\"\\`$\\\\]\)/\\\1/g'
+
+# Same as above, but do not quote variable references.
+double_quote_subst='s/\([\\"\\`\\\\]\)/\\\1/g'
+
+# Sed substitution to delay expansion of an escaped shell variable in a
+# double_quote_subst'ed string.
+delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g'
+
+# The name of this program.
+progname=`$echo "X$0" | $Xsed -e 's%^.*/%%'`
+
+# Constants:
+PROGRAM=ltconfig
+PACKAGE=libtool
+VERSION=1.3c
+TIMESTAMP=" (1.696 2000/03/14 20:22:42)"
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+rm="rm -f"
+
+help="Try \`$progname --help' for more information."
+
+# Global variables:
+default_ofile=libtool
+can_build_shared=yes
+enable_shared=yes
+# All known linkers require a `.a' archive for static linking (except M$VC,
+# which needs '.lib').
+enable_static=yes
+enable_fast_install=yes
+enable_dlopen=unknown
+enable_win32_dll=no
+pic_mode=default
+ltmain=
+silent=
+srcdir=
+ac_config_guess=
+ac_config_sub=
+host=
+build=NONE
+nonopt=NONE
+ofile="$default_ofile"
+verify_host=yes
+with_gcc=no
+with_gnu_ld=no
+need_locks=yes
+ac_ext=c
+libext=a
+cache_file=
+
+old_AR="$AR"
+old_CC="$CC"
+old_CFLAGS="$CFLAGS"
+old_CPPFLAGS="$CPPFLAGS"
+old_LDFLAGS="$LDFLAGS"
+old_LIBS="$LIBS"
+old_MAGIC="$MAGIC"
+old_LD="$LD"
+old_LN_S="$LN_S"
+old_NM="$NM"
+old_RANLIB="$RANLIB"
+old_STRIP="$STRIP"
+old_AS="$AS"
+old_DLLTOOL="$DLLTOOL"
+old_OBJDUMP="$OBJDUMP"
+old_OBJEXT="$OBJEXT"
+old_EXEEXT="$EXEEXT"
+old_reload_Flag="$reload_flag"
+old_deplibs_check_method="$deplibs_check_method"
+old_file_magic_cmd="$file_magic_cmd"
+
+# Parse the command line options.
+args=
+prev=
+for option
+do
+ case "$option" in
+ -*=*) optarg=`echo "$option" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
+ *) optarg= ;;
+ esac
+
+ # If the previous option needs an argument, assign it.
+ if test -n "$prev"; then
+ eval "$prev=\$option"
+ prev=
+ continue
+ fi
+
+ case "$option" in
+ --help) cat <<EOM
+Usage: $progname [OPTION]... LTMAIN [HOST]
+
+Generate a system-specific libtool script.
+
+ --build configure for building on BUILD [BUILD=HOST]
+ --debug enable verbose shell tracing
+ --disable-shared do not build shared libraries
+ --disable-static do not build static libraries
+ --disable-fast-install do not optimize for fast installation
+ --enable-dlopen enable dlopen support
+ --enable-win32-dll enable building dlls on win32 hosts
+ --help display this help and exit
+ --no-verify do not verify that HOST is a valid host type
+-o, --output=FILE specify the output file [default=$default_ofile]
+ --quiet same as \`--silent'
+ --silent do not print informational messages
+ --srcdir=DIR find \`config.guess' in DIR
+ --version output version information and exit
+ --with-gcc assume that the GNU C compiler will be used
+ --with-gnu-ld assume that the C compiler uses the GNU linker
+ --prefer-pic try to use only PIC objects
+ --prefer-non-pic try to use only non-PIC objects
+ --disable-lock disable file locking
+ --cache-file=FILE configure cache file
+
+LTMAIN is the \`ltmain.sh' shell script fragment or \`ltmain.c' program
+that provides basic libtool functionality.
+
+HOST is the canonical host system name [default=guessed].
+EOM
+ exit 0
+ ;;
+
+ --build) prev=build ;;
+ --build=*) build="$optarg" ;;
+
+ --debug)
+ echo "$progname: enabling shell trace mode"
+ set -x
+ ;;
+
+ --disable-shared) enable_shared=no ;;
+
+ --disable-static) enable_static=no ;;
+
+ --disable-fast-install) enable_fast_install=no ;;
+
+ --enable-dlopen) enable_dlopen=yes ;;
+
+ --enable-win32-dll) enable_win32_dll=yes ;;
+
+ --quiet | --silent) silent=yes ;;
+
+ --srcdir) prev=srcdir ;;
+ --srcdir=*) srcdir="$optarg" ;;
+
+ --no-verify) verify_host=no ;;
+
+ --output | -o) prev=ofile ;;
+ --output=*) ofile="$optarg" ;;
+
+ --version) echo "$PROGRAM (GNU $PACKAGE) $VERSION$TIMESTAMP"; exit 0 ;;
+
+ --with-gcc) with_gcc=yes ;;
+ --with-gnu-ld) with_gnu_ld=yes ;;
+
+ --prefer-pic) pic_mode=yes ;;
+ --prefer-non-pic) pic_mode=no ;;
+
+ --disable-lock) need_locks=no ;;
+
+ --cache-file=*) cache_file="$optarg" ;;
+
+ -*)
+ echo "$progname: unrecognized option \`$option'" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ ;;
+
+ *)
+ if test -z "$ltmain"; then
+ ltmain="$option"
+ elif test -z "$host"; then
+# This generates an unnecessary warning for sparc-sun-solaris4.1.3_U1
+# if test -n "`echo $option| sed 's/[-a-z0-9.]//g'`"; then
+# echo "$progname: warning \`$option' is not a valid host type" 1>&2
+# fi
+ host="$option"
+ else
+ echo "$progname: too many arguments" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi ;;
+ esac
+done
+
+if test -z "$ltmain"; then
+ echo "$progname: you must specify a LTMAIN file" 1>&2
+ echo "$help" 1>&2
+ exit 1
+fi
+
+if test ! -f "$ltmain"; then
+ echo "$progname: \`$ltmain' does not exist" 1>&2
+ echo "$help" 1>&2
+ exit 1
+fi
+
+# Quote any args containing shell metacharacters.
+ltconfig_args=
+for arg
+do
+ case "$arg" in
+ *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*)
+ ltconfig_args="$ltconfig_args '$arg'" ;;
+ *) ltconfig_args="$ltconfig_args $arg" ;;
+ esac
+done
+
+# A relevant subset of AC_INIT.
+
+# File descriptor usage:
+# 0 standard input
+# 1 file creation
+# 2 errors and warnings
+# 3 some systems may open it to /dev/tty
+# 4 used on the Kubota Titan
+# 5 compiler messages saved in config.log
+# 6 checking for... messages and results
+if test "$silent" = yes; then
+ exec 6>/dev/null
+else
+ exec 6>&1
+fi
+exec 5>>./config.log
+
+# NLS nuisances.
+# Only set LANG and LC_ALL to C if already set.
+# These must not be set unconditionally because not all systems understand
+# e.g. LANG=C (notably SCO).
+if test "X${LC_ALL+set}" = Xset; then LC_ALL=C; export LC_ALL; fi
+if test "X${LANG+set}" = Xset; then LANG=C; export LANG; fi
+
+if test -n "$cache_file" && test -r "$cache_file"; then
+ echo "loading cache $cache_file within ltconfig"
+ . $cache_file
+fi
+
+if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
+ # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu.
+ if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
+ ac_n= ac_c='
+' ac_t=' '
+ else
+ ac_n=-n ac_c= ac_t=
+ fi
+else
+ ac_n= ac_c='\c' ac_t=
+fi
+
+if test -z "$srcdir"; then
+ # Assume the source directory is the same one as the path to LTMAIN.
+ srcdir=`$echo "X$ltmain" | $Xsed -e 's%/[^/]*$%%'`
+ test "$srcdir" = "$ltmain" && srcdir=.
+fi
+
+trap "$rm conftest*; exit 1" 1 2 15
+if test "$verify_host" = yes; then
+ # Check for config.guess and config.sub.
+ ac_aux_dir=
+ for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do
+ if test -f $ac_dir/config.guess; then
+ ac_aux_dir=$ac_dir
+ break
+ fi
+ done
+ if test -z "$ac_aux_dir"; then
+ echo "$progname: cannot find config.guess in $srcdir $srcdir/.. $srcdir/../.." 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi
+ ac_config_guess=$ac_aux_dir/config.guess
+ ac_config_sub=$ac_aux_dir/config.sub
+
+ # Make sure we can run config.sub.
+ if $SHELL $ac_config_sub sun4 >/dev/null 2>&1; then :
+ else
+ echo "$progname: cannot run $ac_config_sub" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi
+
+ echo $ac_n "checking host system type""... $ac_c" 1>&6
+
+ host_alias=$host
+ case "$host_alias" in
+ "")
+ if host_alias=`$SHELL $ac_config_guess`; then :
+ else
+ echo "$progname: cannot guess host type; you must specify one" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi ;;
+ esac
+ host=`$SHELL $ac_config_sub $host_alias`
+ echo "$ac_t$host" 1>&6
+
+ # Make sure the host verified.
+ test -z "$host" && exit 1
+
+ # Check for the build system type
+ echo $ac_n "checking build system type... $ac_c" 1>&6
+
+ build_alias=$build
+ case "$build_alias" in
+ NONE)
+ case $nonopt in
+ NONE) build_alias=$host_alias ;;
+ *) build_alias=$nonopt ;;
+ esac ;;
+ esac
+
+ build=`$SHELL $ac_config_sub $build_alias`
+ build_cpu=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+ build_vendor=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+ build_os=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+ echo "$ac_t""$build" 1>&6
+
+elif test -z "$host"; then
+ echo "$progname: you must specify a host type if you use \`--no-verify'" 1>&2
+ echo "$help" 1>&2
+ exit 1
+else
+ host_alias=$host
+ build_alias=$host_alias
+ build=$host
+fi
+
+if test x"$host" != x"$build"; then
+ ac_tool_prefix=${host_alias}-
+else
+ ac_tool_prefix=
+fi
+
+host_cpu=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+host_vendor=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+host_os=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+
+# Transform linux* to *-*-linux-gnu*, to support old configure scripts.
+case "$host_os" in
+linux-gnu*) ;;
+linux*) host=`echo $host | sed 's/^\(.*-.*-linux\)\(.*\)$/\1-gnu\2/'`
+esac
+
+case "$host_os" in
+aix3*)
+ # AIX sometimes has problems with the GCC collect2 program. For some
+ # reason, if we set the COLLECT_NAMES environment variable, the problems
+ # vanish in a puff of smoke.
+ if test "X${COLLECT_NAMES+set}" != Xset; then
+ COLLECT_NAMES=
+ export COLLECT_NAMES
+ fi
+ ;;
+esac
+
+# Determine commands to create old-style static archives.
+old_archive_cmds='$AR cru $oldlib$oldobjs$old_deplibs'
+old_postinstall_cmds='chmod 644 $oldlib'
+old_postuninstall_cmds=
+
+# Set sane defaults for various variables
+test -z "$AR" && AR=ar
+test -z "$AS" && AS=as
+test -z "$CC" && CC=cc
+test -z "$DLLTOOL" && DLLTOOL=dlltool
+test -z "$MAGIC" && MAGIC=file
+test -z "$LD" && LD=ld
+test -z "$LN_S" && LN_S="ln -s"
+test -z "$NM" && NM=nm
+test -z "$OBJDUMP" && OBJDUMP=objdump
+test -z "$RANLIB" && RANLIB=:
+test -z "$STRIP" && STRIP=:
+test -z "$objext" && objext=o
+
+echo $ac_n "checking for objdir... $ac_c" 1>&6
+rm -f .libs 2>/dev/null
+mkdir .libs 2>/dev/null
+if test -d .libs; then
+ objdir=.libs
+else
+ # MS-DOS does not allow filenames that begin with a dot.
+ objdir=_libs
+fi
+rmdir .libs 2>/dev/null
+echo "$ac_t$objdir" 1>&6
+
+# Allow CC to be a program name with arguments.
+set dummy $CC
+compiler="$2"
+
+# We assume here that the value for ac_cv_prog_cc_pic will not be cached
+# in isolation, and that seeing it set (from the cache) indicates that
+# the associated values are set (in the cache) correctly too.
+echo $ac_n "checking for $compiler option to produce PIC... $ac_c" 1>&6
+echo "$progname:563:checking for $compiler option to produce PIC" 1>&5
+if test "X${ac_cv_prog_cc_pic+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_cv_prog_cc_pic=
+ ac_cv_prog_cc_shlib=
+ ac_cv_prog_cc_wl=
+ ac_cv_prog_cc_static=
+ ac_cv_prog_cc_no_builtin=
+ ac_cv_prog_cc_can_build_shared=$can_build_shared
+
+ if test "$with_gcc" = yes; then
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-static'
+
+ case "$host_os" in
+ beos* | irix5* | irix6* | osf3* | osf4* | osf5*)
+ # PIC is the default for these OSes.
+ ;;
+ aix*)
+ # Below there is a dirty hack to force normal static linking with -ldl
+ # The problem is because libdl dynamically linked with both libc and
+ # libC (AIX C++ library), which obviously doesn't included in libraries
+ # list by gcc. This cause undefined symbols with -static flags.
+ # This hack allows C programs to be linked with "-static -ldl", but
+ # we not sure about C++ programs.
+ ac_cv_prog_cc_static="$ac_cv_prog_cc_static ${ac_cv_prog_cc_wl}-lC"
+ ;;
+ cygwin* | mingw* | os2*)
+ # This hack is so that the source file can tell whether it is being
+ # built for inclusion in a dll (and should export symbols for example).
+ ac_cv_prog_cc_pic='-DDLL_EXPORT'
+ ;;
+ amigaos*)
+ # FIXME: we need at least 68020 code to build shared libraries, but
+ # adding the `-m68020' flag to GCC prevents building anything better,
+ # like `-m68040'.
+ ac_cv_prog_cc_pic='-m68020 -resident32 -malways-restore-a4'
+ ;;
+ sysv4*MP*)
+ if test -d /usr/nec; then
+ ac_cv_prog_cc_pic=-Kconform_pic
+ fi
+ ;;
+ *)
+ ac_cv_prog_cc_pic='-fPIC'
+ ;;
+ esac
+ else
+ # PORTME Check for PIC flags for the system compiler.
+ case "$host_os" in
+ aix3* | aix4*)
+ # All AIX code is PIC.
+ ac_cv_prog_cc_static='-bnso -bI:/lib/syscalls.exp'
+ ;;
+
+ hpux9* | hpux10* | hpux11*)
+ # Is there a better ac_cv_prog_cc_static that works with the bundled CC?
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static="${ac_cv_prog_cc_wl}-a ${ac_cv_prog_cc_wl}archive"
+ ac_cv_prog_cc_pic='+Z'
+ ;;
+
+ irix5* | irix6*)
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-non_shared'
+ # PIC (with -KPIC) is the default.
+ ;;
+
+ cygwin* | mingw* | os2*)
+ # This hack is so that the source file can tell whether it is being
+ # built for inclusion in a dll (and should export symbols for example).
+ ac_cv_prog_cc_pic='-DDLL_EXPORT'
+ ;;
+
+ osf3* | osf4* | osf5*)
+ # All OSF/1 code is PIC.
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-non_shared'
+ ;;
+
+ sco3.2v5*)
+ ac_cv_prog_cc_pic='-Kpic'
+ ac_cv_prog_cc_static='-dn'
+ ac_cv_prog_cc_shlib='-belf'
+ ;;
+
+ solaris*)
+ ac_cv_prog_cc_pic='-KPIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Wl,'
+ ;;
+
+ sunos4*)
+ ac_cv_prog_cc_pic='-PIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Qoption ld '
+ ;;
+
+ sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ ac_cv_prog_cc_pic='-KPIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Wl,'
+ ;;
+
+ uts4*)
+ ac_cv_prog_cc_pic='-pic'
+ ac_cv_prog_cc_static='-Bstatic'
+ ;;
+
+ sysv4*MP*)
+ if test -d /usr/nec ;then
+ ac_cv_prog_cc_pic='-Kconform_pic'
+ ac_cv_prog_cc_static='-Bstatic'
+ fi
+ ;;
+
+ *)
+ ac_cv_prog_cc_can_build_shared=no
+ ;;
+ esac
+ fi
+fi
+if test -z "$ac_cv_prog_cc_pic"; then
+ echo "$ac_t"none 1>&6
+else
+ echo "$ac_t""$ac_cv_prog_cc_pic" 1>&6
+
+ # Check to make sure the pic_flag actually works.
+ echo $ac_n "checking if $compiler PIC flag $ac_cv_prog_cc_pic works... $ac_c" 1>&6
+ echo "$progname:693:checking that $compiler PIC flag $ac_cv_prog_cc_pic works." 1>&5
+ if test "X${ac_cv_prog_cc_pic_works+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+ else
+ ac_cv_prog_cc_pic_works=yes
+ $rm conftest*
+ echo "int some_variable = 0;" > conftest.c
+ save_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS $ac_cv_prog_cc_pic -DPIC"
+ if { (eval echo $progname:702: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.$objext; then
+ # Append any warnings to the config.log.
+ cat conftest.err 1>&5
+
+ case "$host_os" in
+ hpux9* | hpux10* | hpux11*)
+ # On HP-UX, both CC and GCC only warn that PIC is supported... then
+ # they create non-PIC objects. So, if there were any warnings, we
+ # assume that PIC is not supported.
+ if test -s conftest.err; then
+ ac_cv_prog_cc_pic_works=no
+ ac_cv_prog_cc_can_build_shared=no
+ ac_cv_prog_cc_pic=
+ else
+ ac_cv_prog_cc_pic_works=yes
+ ac_cv_prog_cc_pic=" $ac_cv_prog_cc_pic"
+ fi
+ ;;
+ *)
+ ac_cv_prog_cc_pic_works=yes
+ ac_cv_prog_cc_pic=" $ac_cv_prog_cc_pic"
+ ;;
+ esac
+ else
+ # Append any errors to the config.log.
+ cat conftest.err 1>&5
+ ac_cv_prog_cc_pic_works=no
+ ac_cv_prog_cc_can_build_shared=no
+ ac_cv_prog_cc_pic=
+ fi
+ CFLAGS="$save_CFLAGS"
+ $rm conftest*
+ fi
+ # Belt *and* braces to stop my trousers falling down:
+ if test "X$ac_cv_prog_cc_pic_works" = Xno; then
+ ac_cv_prog_cc_pic=
+ ac_cv_prog_cc_can_build_shared=no
+ fi
+ echo "$ac_t""$ac_cv_prog_cc_pic_works" 1>&6
+fi
+
+# Check for any special shared library compilation flags.
+if test -n "$ac_cv_prog_cc_shlib"; then
+ echo "$progname: warning: \`$CC' requires \`$ac_cv_prog_cc_shlib' to build shared libraries" 1>&2
+ if echo "$old_CC $old_CFLAGS " | egrep -e "[ ]$ac_cv_prog_cc_shlib[ ]" >/dev/null; then :
+ else
+ echo "$progname: add \`$ac_cv_prog_cc_shlib' to the CC or CFLAGS env variable and reconfigure" 1>&2
+ ac_cv_prog_cc_can_build_shared=no
+ fi
+fi
+
+echo $ac_n "checking if $compiler static flag $ac_cv_prog_cc_static works... $ac_c" 1>&6
+echo "$progname:754: checking if $compiler static flag $ac_cv_prog_cc_static works" >&5
+if test "X${ac_cv_prog_cc_static_works+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ $rm conftest*
+ echo 'main(){return(0);}' > conftest.c
+ save_LDFLAGS="$LDFLAGS"
+ LDFLAGS="$LDFLAGS $ac_cv_prog_cc_static"
+ if { (eval echo $progname:762: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+ ac_cv_prog_cc_static_works=yes
+ else
+ ac_cv_prog_cc_static_works=no
+ ac_cv_prog_cc_static=
+ fi
+ LDFLAGS="$save_LDFLAGS"
+ $rm conftest*
+fi
+# Belt *and* braces to stop my trousers falling down:
+if test "X$ac_cv_prog_cc_static_works" = Xno; then
+ ac_cv_prog_cc_static=
+fi
+echo "$ac_t""$ac_cv_prog_cc_static_works" 1>&6
+pic_flag="$ac_cv_prog_cc_pic"
+special_shlib_compile_flags="$ac_cv_prog_cc_shlib"
+wl="$ac_cv_prog_cc_wl"
+link_static_flag="$ac_cv_prog_cc_static"
+no_builtin_flag="$ac_cv_prog_cc_no_builtin"
+can_build_shared="$ac_cv_prog_cc_can_build_shared"
+
+# Check to see if options -o and -c are simultaneously supported by compiler
+echo $ac_n "checking if $compiler supports -c -o file.o... $ac_c" 1>&6
+$rm -r conftest 2>/dev/null
+mkdir conftest
+cd conftest
+$rm conftest*
+echo "int some_variable = 0;" > conftest.c
+mkdir out
+# According to Tom Tromey, Ian Lance Taylor reported there are C compilers
+# that will create temporary files in the current directory regardless of
+# the output directory. Thus, making CWD read-only will cause this test
+# to fail, enabling locking or at least warning the user not to do parallel
+# builds.
+chmod -w .
+save_CFLAGS="$CFLAGS"
+CFLAGS="$CFLAGS -o out/conftest2.o"
+echo "$progname:799: checking if $compiler supports -c -o file.o" >&5
+if { (eval echo $progname:800: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.o; then
+
+ # The compiler can only warn and ignore the option if not recognized
+ # So say no if there are warnings
+ if test -s out/conftest.err; then
+ echo "$ac_t"no 1>&6
+ compiler_c_o=no
+ else
+ echo "$ac_t"yes 1>&6
+ compiler_c_o=yes
+ fi
+else
+ # Append any errors to the config.log.
+ cat out/conftest.err 1>&5
+ compiler_c_o=no
+ echo "$ac_t"no 1>&6
+fi
+CFLAGS="$save_CFLAGS"
+chmod u+w .
+$rm conftest* out/*
+rmdir out
+cd ..
+rmdir conftest
+$rm -r conftest 2>/dev/null
+
+if test x"$compiler_c_o" = x"yes"; then
+ # Check to see if we can write to a .lo
+ echo $ac_n "checking if $compiler supports -c -o file.lo... $ac_c" 1>&6
+ $rm conftest*
+ echo "int some_variable = 0;" > conftest.c
+ save_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -c -o conftest.lo"
+ echo "$progname:832: checking if $compiler supports -c -o file.lo" >&5
+if { (eval echo $progname:833: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.lo; then
+
+ # The compiler can only warn and ignore the option if not recognized
+ # So say no if there are warnings
+ if test -s conftest.err; then
+ echo "$ac_t"no 1>&6
+ compiler_o_lo=no
+ else
+ echo "$ac_t"yes 1>&6
+ compiler_o_lo=yes
+ fi
+ else
+ # Append any errors to the config.log.
+ cat conftest.err 1>&5
+ compiler_o_lo=no
+ echo "$ac_t"no 1>&6
+ fi
+ CFLAGS="$save_CFLAGS"
+ $rm conftest*
+else
+ compiler_o_lo=no
+fi
+
+# Check to see if we can do hard links to lock some files if needed
+hard_links="nottested"
+if test "$compiler_c_o" = no && test "$need_locks" != no; then
+ # do not overwrite the value of need_locks provided by the user
+ echo $ac_n "checking if we can lock with hard links... $ac_c" 1>&6
+ hard_links=yes
+ $rm conftest*
+ ln conftest.a conftest.b 2>/dev/null && hard_links=no
+ touch conftest.a
+ ln conftest.a conftest.b 2>&5 || hard_links=no
+ ln conftest.a conftest.b 2>/dev/null && hard_links=no
+ echo "$ac_t$hard_links" 1>&6
+ $rm conftest*
+ if test "$hard_links" = no; then
+ echo "*** WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&2
+ need_locks=warn
+ fi
+else
+ need_locks=no
+fi
+
+if test "$with_gcc" = yes; then
+ # Check to see if options -fno-rtti -fno-exceptions are supported by compiler
+ echo $ac_n "checking if $compiler supports -fno-rtti -fno-exceptions ... $ac_c" 1>&6
+ $rm conftest*
+ echo "int some_variable = 0;" > conftest.c
+ save_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -fno-rtti -fno-exceptions -c conftest.c"
+ echo "$progname:884: checking if $compiler supports -fno-rtti -fno-exceptions" >&5
+ if { (eval echo $progname:885: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.o; then
+
+ # The compiler can only warn and ignore the option if not recognized
+ # So say no if there are warnings
+ if test -s conftest.err; then
+ echo "$ac_t"no 1>&6
+ compiler_rtti_exceptions=no
+ else
+ echo "$ac_t"yes 1>&6
+ compiler_rtti_exceptions=yes
+ fi
+ else
+ # Append any errors to the config.log.
+ cat conftest.err 1>&5
+ compiler_rtti_exceptions=no
+ echo "$ac_t"no 1>&6
+ fi
+ CFLAGS="$save_CFLAGS"
+ $rm conftest*
+
+ if test "$compiler_rtti_exceptions" = "yes"; then
+ no_builtin_flag=' -fno-builtin -fno-rtti -fno-exceptions'
+ else
+ no_builtin_flag=' -fno-builtin'
+ fi
+
+fi
+
+# See if the linker supports building shared libraries.
+echo $ac_n "checking whether the linker ($LD) supports shared libraries... $ac_c" 1>&6
+
+allow_undefined_flag=
+no_undefined_flag=
+need_lib_prefix=unknown
+need_version=unknown
+# when you set need_version to no, make sure it does not cause -set_version
+# flags to be left without arguments
+archive_cmds=
+archive_expsym_cmds=
+old_archive_from_new_cmds=
+old_archive_from_expsyms_cmds=
+striplib=
+old_striplib=
+export_dynamic_flag_spec=
+whole_archive_flag_spec=
+thread_safe_flag_spec=
+hardcode_into_libs=no
+hardcode_libdir_flag_spec=
+hardcode_libdir_separator=
+hardcode_direct=no
+hardcode_minus_L=no
+hardcode_shlibpath_var=unsupported
+runpath_var=
+link_all_deplibs=unknown
+always_export_symbols=no
+export_symbols_cmds='$NM $libobjs $convenience | $global_symbol_pipe | sed '\''s/.* //'\'' | sort | uniq > $export_symbols'
+# include_expsyms should be a list of space-separated symbols to be *always*
+# included in the symbol list
+include_expsyms=
+# exclude_expsyms can be an egrep regular expression of symbols to exclude
+# it will be wrapped by ` (' and `)$', so one must not match beginning or
+# end of line. Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc',
+# as well as any symbol that contains `d'.
+exclude_expsyms="_GLOBAL_OFFSET_TABLE_"
+# Although _GLOBAL_OFFSET_TABLE_ is a valid symbol C name, most a.out
+# platforms (ab)use it in PIC code, but their linkers get confused if
+# the symbol is explicitly referenced. Since portable code cannot
+# rely on this symbol name, it's probably fine to never include it in
+# preloaded symbol tables.
+extract_expsyms_cmds=
+
+case "$host_os" in
+cygwin* | mingw*)
+ # FIXME: the MSVC++ port hasn't been tested in a loooong time
+ # When not using gcc, we currently assume that we are using
+ # Microsoft Visual C++.
+ if test "$with_gcc" != yes; then
+ with_gnu_ld=no
+ fi
+ ;;
+
+esac
+
+ld_shlibs=yes
+if test "$with_gnu_ld" = yes; then
+ # If archive_cmds runs LD, not CC, wlarc should be empty
+ wlarc='${wl}'
+
+ # See if GNU ld supports shared libraries.
+ case "$host_os" in
+ aix3* | aix4*)
+ # On AIX, the GNU linker is very broken
+ ld_shlibs=no
+ cat <<EOF 1>&2
+
+*** Warning: the GNU linker, at least up to release 2.9.1, is reported
+*** to be unable to reliably create shared libraries on AIX.
+*** Therefore, libtool is disabling shared libraries support. If you
+*** really care for shared libraries, you may want to modify your PATH
+*** so that a non-GNU linker is found, and then restart.
+
+EOF
+ ;;
+
+ amigaos*)
+ archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR cru $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+
+ # Samuel A. Falvo II <kc5tja@dolphin.openprojects.net> reports
+ # that the semantics of dynamic libraries on AmigaOS, at least up
+ # to version 4, is to share data among multiple programs linked
+ # with the same dynamic library. Since this doesn't match the
+ # behavior of shared libraries on other platforms, we can use
+ # them.
+ ld_shlibs=no
+ ;;
+
+ beos*)
+ if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ allow_undefined_flag=unsupported
+ # Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+ # support --undefined. This deserves some investigation. FIXME
+ archive_cmds='$CC -nostart $libobjs $deplibs $linker_flags ${wl}-soname $wl$soname -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+
+ cygwin* | mingw*)
+ # hardcode_libdir_flag_spec is actually meaningless, as there is
+ # no search path for DLLs.
+ hardcode_libdir_flag_spec='-L$libdir'
+ allow_undefined_flag=unsupported
+ always_export_symbols=yes
+
+ extract_expsyms_cmds='test -f $output_objdir/impgen.c || \
+ sed -e "/^# \/\* impgen\.c starts here \*\//,/^# \/\* impgen.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/impgen.c~
+ test -f $output_objdir/impgen.exe || (cd $output_objdir && \
+ if test "x$HOST_CC" != "x" ; then $HOST_CC -o impgen impgen.c ; \
+ else $CC -o impgen impgen.c ; fi)~
+ $output_objdir/impgen $dir/$soname > $output_objdir/$soname-def'
+
+ old_archive_from_expsyms_cmds='$DLLTOOL --as=$AS --dllname $soname --def $output_objdir/$soname-def --output-lib $output_objdir/$newlib'
+
+ # cygwin and mingw dlls have different entry points and sets of symbols
+ # to exclude.
+ # FIXME: what about values for MSVC?
+ dll_entry=__cygwin_dll_entry@12
+ dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12~
+ case "$host_os" in
+ mingw*)
+ # mingw values
+ dll_entry=_DllMainCRTStartup@12
+ dll_exclude_symbols=DllMain@12,DllMainCRTStartup@12,DllEntryPoint@12~
+ ;;
+ esac
+
+ # mingw and cygwin differ, and it's simplest to just exclude the union
+ # of the two symbol sets.
+ dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12,DllMainCRTStartup@12,DllEntryPoint@12
+
+ # recent cygwin and mingw systems supply a stub DllMain which the user
+ # can override, but on older systems we have to supply one (in ltdll.c)
+ if test "x$lt_cv_need_dllmain" = "xyes"; then
+ ltdll_obj='$output_objdir/$soname-ltdll.'"$objext "
+ ltdll_cmds='test -f $output_objdir/$soname-ltdll.c || sed -e "/^# \/\* ltdll\.c starts here \*\//,/^# \/\* ltdll.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/$soname-ltdll.c~
+ test -f $output_objdir/$soname-ltdll.$objext || (cd $output_objdir && $CC -c $soname-ltdll.c)~'
+ else
+ ltdll_obj=
+ ltdll_cmds=
+ fi
+
+ # Extract the symbol export list from an `--export-all' def file,
+ # then regenerate the def file from the symbol export list, so that
+ # the compiled dll only exports the symbol export list.
+ # Be careful not to strip the DATA tag left be newer dlltools.
+ export_symbols_cmds="$ltdll_cmds"'
+ $DLLTOOL --export-all --exclude-symbols '$dll_exclude_symbols' --output-def $output_objdir/$soname-def '$ltdll_obj'$libobjs $convenience~
+ sed -e "1,/EXPORTS/d" -e "s/ @ [0-9]*//" -e "s/ *;.*$//" < $output_objdir/$soname-def > $export_symbols'
+
+ # If DATA tags from a recent dlltool are present, honour them!
+ archive_expsym_cmds='echo EXPORTS > $output_objdir/$soname-def~
+ _lt_hint=1;
+ cat $export_symbols | while read symbol; do
+ set dummy \$symbol;
+ case \$# in
+ 2) echo " \$2 @ \$_lt_hint ; " >> $output_objdir/$soname-def;;
+ *) echo " \$2 @ \$_lt_hint \$3 ; " >> $output_objdir/$soname-def;;
+ esac;
+ _lt_hint=`expr 1 + \$_lt_hint`;
+ done~
+ '"$ltdll_cmds"'
+ $CC -Wl,--base-file,$output_objdir/$soname-base '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $lib '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+ $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~
+ $CC -Wl,--base-file,$output_objdir/$soname-base $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $lib '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+ $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~
+ $CC $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $lib '$ltdll_obj'$libobjs $deplibs $compiler_flags'
+ ;;
+
+ netbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ else
+ archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
+ fi
+ ;;
+
+ solaris* | sysv5*)
+ if $LD -v 2>&1 | egrep 'BFD 2\.8' > /dev/null; then
+ ld_shlibs=no
+ cat <<EOF 1>&2
+
+*** Warning: The releases 2.8.* of the GNU linker cannot reliably
+*** create shared libraries on Solaris systems. Therefore, libtool
+*** is disabling shared libraries support. We urge you to upgrade GNU
+*** binutils to release 2.9.1 or newer. Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+EOF
+ elif $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ archive_cmds='$CC -shared $libobjs $deplibs $linker_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared $libobjs $deplibs $linker_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+
+ sunos4*)
+ archive_cmds='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ wlarc=
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ *)
+ if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+ esac
+
+ if test "$ld_shlibs" = yes; then
+ runpath_var=LD_RUN_PATH
+ hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir'
+ export_dynamic_flag_spec='${wl}--export-dynamic'
+ case $host_os in
+ cygwin* | mingw*)
+ # dlltool doesn't understand --whole-archive et. al.
+ whole_archive_flag_spec=
+ ;;
+ *)
+ # ancient GNU ld didn't support --whole-archive et. al.
+ if $LD --help 2>&1 | egrep 'no-whole-archive' > /dev/null; then
+ whole_archive_flag_spec="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+ else
+ whole_archive_flag_spec=
+ fi
+ ;;
+ esac
+ fi
+else
+ # PORTME fill in a description of your system's linker (not GNU ld)
+ case "$host_os" in
+ aix3*)
+ allow_undefined_flag=unsupported
+ always_export_symbols=yes
+ archive_expsym_cmds='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR cru $lib $output_objdir/$soname'
+ # Note: this linker hardcodes the directories in LIBPATH if there
+ # are no directories specified by -L.
+ hardcode_minus_L=yes
+ if test "$with_gcc" = yes && test -z "$link_static_flag"; then
+ # Neither direct hardcoding nor static linking is supported with a
+ # broken collect2.
+ hardcode_direct=unsupported
+ fi
+ ;;
+
+ aix4*)
+ hardcode_libdir_flag_spec='${wl}-b ${wl}nolibpath ${wl}-b ${wl}libpath:$libdir:/usr/lib:/lib'
+ hardcode_libdir_separator=':'
+ if test "$with_gcc" = yes; then
+ collect2name=`${CC} -print-prog-name=collect2`
+ if test -f "$collect2name" && \
+ strings "$collect2name" | grep resolve_lib_name >/dev/null
+ then
+ # We have reworked collect2
+ hardcode_direct=yes
+ else
+ # We have old collect2
+ hardcode_direct=unsupported
+ # It fails to find uninstalled libraries when the uninstalled
+ # path is not listed in the libpath. Setting hardcode_minus_L
+ # to unsupported forces relinking
+ hardcode_minus_L=yes
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_libdir_separator=
+ fi
+ shared_flag='-shared'
+ else
+ shared_flag='${wl}-bM:SRE'
+ hardcode_direct=yes
+ fi
+ allow_undefined_flag=' ${wl}-berok'
+ archive_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}-bexpall ${wl}-bnoentry${allow_undefined_flag}'
+ archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}-bE:$export_symbols ${wl}-bnoentry${allow_undefined_flag}'
+ case "$host_os" in aix4.[01]|aix4.[01].*)
+ # According to Greg Wooledge, -bexpall is only supported from AIX 4.2 on
+ always_export_symbols=yes ;;
+ esac
+ ;;
+
+ amigaos*)
+ archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR cru $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+ # see comment about different semantics on the GNU ld section
+ ld_shlibs=no
+ ;;
+
+ cygwin* | mingw*)
+ # When not using gcc, we currently assume that we are using
+ # Microsoft Visual C++.
+ # hardcode_libdir_flag_spec is actually meaningless, as there is
+ # no search path for DLLs.
+ hardcode_libdir_flag_spec=' '
+ allow_undefined_flag=unsupported
+ # Tell ltmain to make .lib files, not .a files.
+ libext=lib
+ # FIXME: Setting linknames here is a bad hack.
+ archive_cmds='$CC -o $lib $libobjs $compiler_flags `echo "$deplibs" | sed -e '\''s/ -lc$//'\''` -link -dll~linknames='
+ # The linker will automatically build a .lib file if we build a DLL.
+ old_archive_from_new_cmds='true'
+ # FIXME: Should let the user specify the lib program.
+ old_archive_cmds='lib /OUT:$oldlib$oldobjs$old_deplibs'
+ fix_srcfile_path='`cygpath -w $srcfile`'
+ ;;
+
+ freebsd1*)
+ ld_shlibs=no
+ ;;
+
+ # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
+ # support. Future versions do this automatically, but an explicit c++rt0.o
+ # does not break anything, and helps significantly (at the cost of a little
+ # extra space).
+ freebsd2.2*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ # Unfortunately, older versions of FreeBSD 2 do not have this feature.
+ freebsd2*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_direct=yes
+ hardcode_minus_L=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
+ freebsd*)
+ archive_cmds='$CC -shared -o $lib $libobjs $deplibs $compiler_flags'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ hpux9* | hpux10* | hpux11*)
+ case "$host_os" in
+ hpux9*) archive_cmds='$rm $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib' ;;
+ *) archive_cmds='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags' ;;
+ esac
+ hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
+ hardcode_libdir_separator=:
+ hardcode_direct=yes
+ hardcode_minus_L=yes # Not in the search PATH, but as the default
+ # location of the library.
+ export_dynamic_flag_spec='${wl}-E'
+ ;;
+
+ irix5* | irix6*)
+ if test "$with_gcc" = yes; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+ else
+ archive_cmds='$LD -shared $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+ fi
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ link_all_deplibs=yes
+ ;;
+
+ netbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' # a.out
+ else
+ archive_cmds='$LD -shared -o $lib $libobjs $deplibs $linker_flags' # ELF
+ fi
+ hardcode_libdir_flag_spec='${wl}-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ openbsd*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ os2*)
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+ allow_undefined_flag=unsupported
+ archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$echo "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~$echo DATA >> $output_objdir/$libname.def~$echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~$echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
+ old_archive_from_new_cmds='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+ ;;
+
+ osf3*)
+ if test "$with_gcc" = yes; then
+ allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+ archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+ else
+ allow_undefined_flag=' -expect_unresolved \*'
+ archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+ fi
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ ;;
+
+ osf4* | osf5*) # as osf3* with the addition of -msym flag
+ if test "$with_gcc" = yes; then
+ allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+ archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+ else
+ allow_undefined_flag=' -expect_unresolved \*'
+ archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -msym -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+ fi
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ ;;
+
+ sco3.2v5*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ runpath_var=LD_RUN_PATH
+ hardcode_runpath_var=yes
+ ;;
+
+ solaris*)
+ no_undefined_flag=' -z text'
+ # $CC -shared without GNU ld will not create a library from C++
+ # object files and a static libstdc++, better avoid it by now
+ archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+ $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_shlibpath_var=no
+ case "$host_os" in
+ solaris2.[0-5] | solaris2.[0-5].*) ;;
+ *) # Supported since Solaris 2.6 (maybe 2.5.1?)
+ whole_archive_flag_spec='-z allextract$convenience -z defaultextract' ;;
+ esac
+ link_all_deplibs=yes
+ ;;
+
+ sunos4*)
+ archive_cmds='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_direct=yes
+ hardcode_minus_L=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ sysv4)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ runpath_var='LD_RUN_PATH'
+ hardcode_shlibpath_var=no
+ hardcode_direct=no #Motorola manual says yes, but my tests say they lie
+ ;;
+
+ sysv4.3*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ export_dynamic_flag_spec='-Bexport'
+ ;;
+
+ sysv5*)
+ no_undefined_flag=' -z text'
+ # $CC -shared without GNU ld will not create a library from C++
+ # object files and a static libstdc++, better avoid it by now
+ archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+ $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+ hardcode_libdir_flag_spec=
+ hardcode_shlibpath_var=no
+ runpath_var='LD_RUN_PATH'
+ ;;
+
+ uts4*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_shlibpath_var=no
+ ;;
+
+ dgux*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_shlibpath_var=no
+ ;;
+
+ sysv4*MP*)
+ if test -d /usr/nec; then
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ runpath_var=LD_RUN_PATH
+ hardcode_runpath_var=yes
+ ld_shlibs=yes
+ fi
+ ;;
+
+ sysv4.2uw2*)
+ archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_direct=yes
+ hardcode_minus_L=no
+ hardcode_shlibpath_var=no
+ hardcode_runpath_var=yes
+ runpath_var=LD_RUN_PATH
+ ;;
+
+ unixware7*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ runpath_var='LD_RUN_PATH'
+ hardcode_shlibpath_var=no
+ ;;
+
+ *)
+ ld_shlibs=no
+ ;;
+ esac
+fi
+echo "$ac_t$ld_shlibs" 1>&6
+test "$ld_shlibs" = no && can_build_shared=no
+
+# Check hardcoding attributes.
+echo $ac_n "checking how to hardcode library paths into programs... $ac_c" 1>&6
+hardcode_action=
+if test -n "$hardcode_libdir_flag_spec" || \
+ test -n "$runpath_var"; then
+
+ # We can hardcode non-existant directories.
+ if test "$hardcode_direct" != no &&
+ # If the only mechanism to avoid hardcoding is shlibpath_var, we
+ # have to relink, otherwise we might link with an installed library
+ # when we should be linking with a yet-to-be-installed one
+ ## test "$hardcode_shlibpath_var" != no &&
+ test "$hardcode_minus_L" != no; then
+ # Linking always hardcodes the temporary library directory.
+ hardcode_action=relink
+ else
+ # We can link without hardcoding, and we can hardcode nonexisting dirs.
+ hardcode_action=immediate
+ fi
+else
+ # We cannot hardcode anything, or else we can only hardcode existing
+ # directories.
+ hardcode_action=unsupported
+fi
+echo "$ac_t$hardcode_action" 1>&6
+
+echo $ac_n "checking whether stripping libraries is possible... $ac_c" 1>&6
+if test -n "$STRIP" && $STRIP -V 2>&1 | grep "GNU strip" >/dev/null; then
+ test -z "$old_striplib" && old_striplib="$STRIP --strip-debug"
+ test -z "$striplib" && striplib="$STRIP --strip-unneeded"
+ echo "${ac_t}yes" 1>&6
+else
+ echo "${ac_t}no" 1>&6
+fi
+
+reload_cmds='$LD$reload_flag -o $output$reload_objs'
+test -z "$deplibs_check_method" && deplibs_check_method=unknown
+
+# PORTME Fill in your ld.so characteristics
+library_names_spec=
+libname_spec='lib$name'
+soname_spec=
+postinstall_cmds=
+postuninstall_cmds=
+finish_cmds=
+finish_eval=
+shlibpath_var=
+shlibpath_overrides_runpath=unknown
+version_type=none
+dynamic_linker="$host_os ld.so"
+sys_lib_dlsearch_path_spec="/lib /usr/lib"
+sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib"
+
+echo $ac_n "checking dynamic linker characteristics... $ac_c" 1>&6
+case "$host_os" in
+aix3*)
+ version_type=linux
+ library_names_spec='${libname}${release}.so$versuffix $libname.a'
+ shlibpath_var=LIBPATH
+
+ # AIX has no versioning support, so we append a major version to the name.
+ soname_spec='${libname}${release}.so$major'
+ ;;
+
+aix4*)
+ version_type=linux
+ # AIX has no versioning support, so currently we can not hardcode correct
+ # soname into executable. Probably we can add versioning support to
+ # collect2, so additional links can be useful in future.
+ # We preserve .a as extension for shared libraries though AIX4.2
+ # and later linker supports .so
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.a'
+ shlibpath_var=LIBPATH
+ ;;
+
+amigaos*)
+ library_names_spec='$libname.ixlibrary $libname.a'
+ # Create ${libname}_ixlibrary.a entries in /sys/libs.
+ finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`$echo "X$lib" | $Xsed -e '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $rm /sys/libs/${libname}_ixlibrary.a; $show "(cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a)"; (cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a) || exit 1; done'
+ ;;
+
+beos*)
+ library_names_spec='${libname}.so'
+ dynamic_linker="$host_os ld.so"
+ shlibpath_var=LIBRARY_PATH
+ lt_cv_dlopen="load_add_on"
+ lt_cv_dlopen_libs=
+ lt_cv_dlopen_self=yes
+ ;;
+
+bsdi4*)
+ version_type=linux
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
+ sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
+ export_dynamic_flag_spec=-rdynamic
+ # the default ld.so.conf also contains /usr/contrib/lib and
+ # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
+ # libtool to hard-code these into programs
+ ;;
+
+cygwin* | mingw*)
+ version_type=windows
+ need_version=no
+ need_lib_prefix=no
+ if test "$with_gcc" = yes; then
+ library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll'
+ else
+ library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll $libname.lib'
+ fi
+ dynamic_linker='Win32 ld.exe'
+ # FIXME: first we should search . and the directory the executable is in
+ shlibpath_var=PATH
+ lt_cv_dlopen="LoadLibrary"
+ lt_cv_dlopen_libs=
+ ;;
+
+freebsd1*)
+ dynamic_linker=no
+ ;;
+
+freebsd*)
+ objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout`
+ version_type=freebsd-$objformat
+ case "$version_type" in
+ freebsd-elf*)
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so'
+ need_version=no
+ need_lib_prefix=no
+ ;;
+ freebsd-*)
+ library_names_spec='${libname}${release}.so$versuffix $libname.so$versuffix'
+ need_version=yes
+ ;;
+ esac
+ shlibpath_var=LD_LIBRARY_PATH
+ case "$host_os" in
+ freebsd2*)
+ shlibpath_overrides_runpath=yes
+ ;;
+ freebsd3.[01]* | freebsdelf3.[01]*)
+ shlibpath_overrides_runpath=yes
+ hardcode_into_libs=yes
+ ;;
+ *) # from 3.2 on
+ shlibpath_overrides_runpath=no
+ hardcode_into_libs=yes
+ ;;
+ esac
+ ;;
+
+gnu*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so${major} ${libname}.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ hardcode_into_libs=yes
+ ;;
+
+hpux9* | hpux10* | hpux11*)
+ # Give a soname corresponding to the major version so that dld.sl refuses to
+ # link against other versions.
+ dynamic_linker="$host_os dld.sl"
+ version_type=sunos
+ need_lib_prefix=no
+ need_version=no
+ shlibpath_var=SHLIB_PATH
+ shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
+ library_names_spec='${libname}${release}.sl$versuffix ${libname}${release}.sl$major $libname.sl'
+ soname_spec='${libname}${release}.sl$major'
+ # HP-UX runs *really* slowly unless shared libraries are mode 555.
+ postinstall_cmds='chmod 555 $lib'
+ ;;
+
+irix5* | irix6*)
+ version_type=irix
+ need_lib_prefix=no
+ need_version=no
+ soname_spec='${libname}${release}.so.$major'
+ library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major ${libname}${release}.so $libname.so'
+ case "$host_os" in
+ irix5*)
+ libsuff= shlibsuff=
+ ;;
+ *)
+ case "$LD" in # libtool.m4 will add one of these switches to LD
+ *-32|*"-32 ") libsuff= shlibsuff= libmagic=32-bit;;
+ *-n32|*"-n32 ") libsuff=32 shlibsuff=N32 libmagic=N32;;
+ *-64|*"-64 ") libsuff=64 shlibsuff=64 libmagic=64-bit;;
+ *) libsuff= shlibsuff= libmagic=never-match;;
+ esac
+ ;;
+ esac
+ shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
+ shlibpath_overrides_runpath=no
+ sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
+ sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
+ ;;
+
+# No shared lib support for Linux oldld, aout, or coff.
+linux-gnuoldld* | linux-gnuaout* | linux-gnucoff*)
+ dynamic_linker=no
+ ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=no
+ # This implies no fast_install, which is unacceptable.
+ # Some rework will be needed to allow for fast_install
+ # before this can be enabled.
+ hardcode_into_libs=yes
+
+ if test -f /lib/ld.so.1; then
+ dynamic_linker='GNU ld.so'
+ else
+ # Only the GNU ld.so supports shared libraries on MkLinux.
+ case "$host_cpu" in
+ powerpc*) dynamic_linker=no ;;
+ *) dynamic_linker='Linux ld.so' ;;
+ esac
+ fi
+ ;;
+
+netbsd*)
+ version_type=sunos
+ if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+ library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+ dynamic_linker='NetBSD (a.out) ld.so'
+ else
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major ${libname}${release}.so ${libname}.so'
+ soname_spec='${libname}${release}.so$major'
+ dynamic_linker='NetBSD ld.elf_so'
+ fi
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+openbsd*)
+ version_type=sunos
+ if test "$with_gnu_ld" = yes; then
+ need_lib_prefix=no
+ need_version=no
+ fi
+ library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+os2*)
+ libname_spec='$name'
+ need_lib_prefix=no
+ library_names_spec='$libname.dll $libname.a'
+ dynamic_linker='OS/2 ld.exe'
+ shlibpath_var=LIBPATH
+ ;;
+
+osf3* | osf4* | osf5*)
+ version_type=osf
+ need_version=no
+ soname_spec='${libname}${release}.so'
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so'
+ shlibpath_var=LD_LIBRARY_PATH
+ sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
+ sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
+ ;;
+
+sco3.2v5*)
+ version_type=osf
+ soname_spec='${libname}${release}.so$major'
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+solaris*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=yes
+ hardcode_into_libs=yes
+ # ldd complains unless libraries are executable
+ postinstall_cmds='chmod +x $lib'
+ ;;
+
+sunos4*)
+ version_type=sunos
+ library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+ finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=yes
+ if test "$with_gnu_ld" = yes; then
+ need_lib_prefix=no
+ fi
+ need_version=yes
+ ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ version_type=linux
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ case "$host_vendor" in
+ motorola)
+ need_lib_prefix=no
+ need_version=no
+ shlibpath_overrides_runpath=no
+ sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
+ ;;
+ esac
+ ;;
+
+uts4*)
+ version_type=linux
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+dgux*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+sysv4*MP*)
+ if test -d /usr/nec ;then
+ version_type=linux
+ library_names_spec='$libname.so.$versuffix $libname.so.$major $libname.so'
+ soname_spec='$libname.so.$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ fi
+ ;;
+
+*)
+ dynamic_linker=no
+ ;;
+esac
+echo "$ac_t$dynamic_linker" 1>&6
+test "$dynamic_linker" = no && can_build_shared=no
+
+# Check for command to grab the raw symbol name followed by C symbol from nm.
+echo $ac_n "checking command to parse $NM output... $ac_c" 1>&6
+
+# These are sane defaults that work on at least a few old systems.
+# [They come from Ultrix. What could be older than Ultrix?!! ;)]
+
+# Character class describing NM global symbol codes.
+symcode='[BCDEGRST]'
+
+# Regexp to match symbols that can be accessed directly from C.
+sympat='\([_A-Za-z][_A-Za-z0-9]*\)'
+
+# Transform the above into a raw symbol and a C symbol.
+symxfrm='\1 \2\3 \3'
+
+# Transform an extracted symbol line into a proper C declaration
+global_symbol_to_cdecl="sed -n -e 's/^. .* \(.*\)$/extern char \1;/p'"
+
+# Define system-specific variables.
+case "$host_os" in
+aix*)
+ symcode='[BCDT]'
+ ;;
+cygwin* | mingw*)
+ symcode='[ABCDGISTW]'
+ ;;
+hpux*) # Its linker distinguishes data from code symbols
+ global_symbol_to_cdecl="sed -n -e 's/^T .* \(.*\)$/extern char \1();/p' -e 's/^. .* \(.*\)$/extern char \1;/p'"
+ ;;
+irix*)
+ symcode='[BCDEGRST]'
+ ;;
+solaris* | sysv5*)
+ symcode='[BDT]'
+ ;;
+sysv4)
+ symcode='[DFNSTU]'
+ ;;
+esac
+
+# Handle CRLF in mingw too chain
+opt_cr=
+case "$host_os" in
+mingw*)
+ opt_cr=`echo 'x\{0,1\}' | tr x '\015'` # option cr in regexp
+ ;;
+esac
+
+# If we're using GNU nm, then use its standard symbol codes.
+if $NM -V 2>&1 | egrep '(GNU|with BFD)' > /dev/null; then
+ symcode='[ABCDGISTW]'
+fi
+
+# Try without a prefix undercore, then with it.
+for ac_symprfx in "" "_"; do
+
+ # Write the raw and C identifiers.
+global_symbol_pipe="sed -n -e 's/^.*[ ]\($symcode\)[ ][ ]*\($ac_symprfx\)$sympat$opt_cr$/$symxfrm/p'"
+
+ # Check to see that the pipe works correctly.
+ pipe_works=no
+ $rm conftest*
+ cat > conftest.c <<EOF
+#ifdef __cplusplus
+extern "C" {
+#endif
+char nm_test_var;
+void nm_test_func(){}
+#ifdef __cplusplus
+}
+#endif
+main(){nm_test_var='a';nm_test_func();return(0);}
+EOF
+
+ echo "$progname:1867: checking if global_symbol_pipe works" >&5
+ if { (eval echo $progname:1868: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; } && test -s conftest.$objext; then
+ # Now try to grab the symbols.
+ nlist=conftest.nm
+ if { echo "$progname:1871: eval \"$NM conftest.$objext | $global_symbol_pipe > $nlist\"" >&5; eval "$NM conftest.$objext | $global_symbol_pipe > $nlist 2>&5"; } && test -s "$nlist"; then
+
+ # Try sorting and uniquifying the output.
+ if sort "$nlist" | uniq > "$nlist"T; then
+ mv -f "$nlist"T "$nlist"
+ else
+ rm -f "$nlist"T
+ fi
+
+ # Make sure that we snagged all the symbols we need.
+ if egrep ' nm_test_var$' "$nlist" >/dev/null; then
+ if egrep ' nm_test_func$' "$nlist" >/dev/null; then
+ cat <<EOF > conftest.c
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+EOF
+ # Now generate the symbol file.
+ eval "$global_symbol_to_cdecl"' < "$nlist" >> conftest.c'
+
+ cat <<EOF >> conftest.c
+#if defined (__STDC__) && __STDC__
+# define lt_ptr_t void *
+#else
+# define lt_ptr_t char *
+# define const
+#endif
+
+/* The mapping between symbol names and symbols. */
+const struct {
+ const char *name;
+ lt_ptr_t address;
+}
+lt_preloaded_symbols[] =
+{
+EOF
+ sed 's/^. \(.*\) \(.*\)$/ {"\2", (lt_ptr_t) \&\2},/' < "$nlist" >> conftest.c
+ cat <<\EOF >> conftest.c
+ {0, (lt_ptr_t) 0}
+};
+
+#ifdef __cplusplus
+}
+#endif
+EOF
+ # Now try linking the two files.
+ mv conftest.$objext conftstm.$objext
+ save_LIBS="$LIBS"
+ save_CFLAGS="$CFLAGS"
+ LIBS="conftstm.$objext"
+ CFLAGS="$CFLAGS$no_builtin_flag"
+ if { (eval echo $progname:1923: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+ pipe_works=yes
+ else
+ echo "$progname: failed program was:" >&5
+ cat conftest.c >&5
+ fi
+ LIBS="$save_LIBS"
+ else
+ echo "cannot find nm_test_func in $nlist" >&5
+ fi
+ else
+ echo "cannot find nm_test_var in $nlist" >&5
+ fi
+ else
+ echo "cannot run $global_symbol_pipe" >&5
+ fi
+ else
+ echo "$progname: failed program was:" >&5
+ cat conftest.c >&5
+ fi
+ $rm conftest* conftst*
+
+ # Do not use the global_symbol_pipe unless it works.
+ if test "$pipe_works" = yes; then
+ break
+ else
+ global_symbol_pipe=
+ fi
+done
+if test "$pipe_works" = yes; then
+ echo "${ac_t}ok" 1>&6
+else
+ echo "${ac_t}failed" 1>&6
+fi
+
+if test -z "$global_symbol_pipe"; then
+ global_symbol_to_cdecl=
+fi
+
+# Report the final consequences.
+echo "checking if libtool supports shared libraries... $can_build_shared" 1>&6
+
+# Only try to build win32 dlls if AC_LIBTOOL_WIN32_DLL was used in
+# configure.in, otherwise build static only libraries.
+case "$host_os" in
+cygwin* | mingw* | os2*)
+ if test x$can_build_shared = xyes; then
+ test x$enable_win32_dll = xno && can_build_shared=no
+ echo "checking if package supports dlls... $can_build_shared" 1>&6
+ fi
+;;
+esac
+
+echo $ac_n "checking whether to build shared libraries... $ac_c" 1>&6
+test "$can_build_shared" = "no" && enable_shared=no
+
+# On AIX, shared libraries and static libraries use the same namespace, and
+# are all built from PIC.
+case "$host_os" in
+aix3*)
+ test "$enable_shared" = yes && enable_static=no
+ if test -n "$RANLIB"; then
+ archive_cmds="$archive_cmds~\$RANLIB \$lib"
+ postinstall_cmds='$RANLIB $lib'
+ fi
+ ;;
+
+aix4*)
+ test "$enable_shared" = yes && enable_static=no
+ ;;
+esac
+
+echo "$ac_t$enable_shared" 1>&6
+
+# Make sure either enable_shared or enable_static is yes.
+test "$enable_shared" = yes || enable_static=yes
+
+echo "checking whether to build static libraries... $enable_static" 1>&6
+
+if test "$hardcode_action" = relink || test "$hardcode_into_libs" = all; then
+ # Fast installation is not supported
+ enable_fast_install=no
+elif test "$shlibpath_overrides_runpath" = yes ||
+ test "$enable_shared" = no; then
+ # Fast installation is not necessary
+ enable_fast_install=needless
+fi
+
+# Check whether we must set pic_mode to default
+test -z "$pic_flag" && pic_mode=default
+# On Cygwin there's no "real" PIC flag so we must build both object types
+case "$host_os" in
+cygwin* | mingw* | os2*)
+ pic_mode=default
+ ;;
+esac
+if test $pic_mode = no && test "$deplibs_check_method" != pass_all; then
+ # non-PIC code in shared libraries is not supported
+ pic_mode=default
+fi
+
+if test "x$enable_dlopen" != xyes; then
+ enable_dlopen=unknown
+ enable_dlopen_self=unknown
+ enable_dlopen_self_static=unknown
+else
+if test "X${lt_cv_dlopen+set}" != Xset; then
+ lt_cv_dlopen=no lt_cv_dlopen_libs=
+echo $ac_n "checking for dlopen in -ldl""... $ac_c" 1>&6
+echo "$progname:2032: checking for dlopen in -ldl" >&5
+if test "X${ac_cv_lib_dl_dlopen+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_save_LIBS="$LIBS"
+LIBS="-ldl $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 2039 "ltconfig"
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen();
+
+int main() {
+dlopen()
+; return 0; }
+EOF
+if { (eval echo $progname:2052: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_lib_dl_dlopen=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_lib_dl_dlopen=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dl_dlopen" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dlopen""... $ac_c" 1>&6
+echo "$progname:2071: checking for dlopen" >&5
+if test "X${ac_cv_func_dlopen+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 2076 "ltconfig"
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char dlopen(); below. */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen();
+
+int main() {
+
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+#if defined (__stub_dlopen) || defined (__stub___dlopen)
+choke me
+#else
+dlopen();
+#endif
+
+; return 0; }
+EOF
+if { (eval echo $progname:2101: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_func_dlopen=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_func_dlopen=no
+fi
+rm -f conftest*
+fi
+if test "X$ac_cv_func_dlopen" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="dlopen"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dld_link in -ldld""... $ac_c" 1>&6
+echo "$progname:2118: checking for dld_link in -ldld" >&5
+if test "X${ac_cv_lib_dld_dld_link+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_save_LIBS="$LIBS"
+LIBS="-ldld $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 2125 "ltconfig"
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dld_link();
+
+int main() {
+dld_link()
+; return 0; }
+EOF
+if { (eval echo $progname:2138: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_lib_dld_dld_link=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_lib_dld_dld_link=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dld_dld_link" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="dld_link" lt_cv_dlopen_libs="-ldld"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for shl_load""... $ac_c" 1>&6
+echo "$progname:2157: checking for shl_load" >&5
+if test "X${ac_cv_func_shl_load+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 2162 "ltconfig"
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char shl_load(); below. */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char shl_load();
+
+int main() {
+
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+#if defined (__stub_shl_load) || defined (__stub___shl_load)
+choke me
+#else
+shl_load();
+#endif
+
+; return 0; }
+EOF
+if { (eval echo $progname:2187: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_func_shl_load=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_func_shl_load=no
+fi
+rm -f conftest*
+fi
+
+if test "X$ac_cv_func_shl_load" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="shl_load"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for shl_load in -ldld""... $ac_c" 1>&6
+echo "$progname:2205: checking for shl_load in -ldld" >&5
+if test "X${ac_cv_lib_dld_shl_load+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_save_LIBS="$LIBS"
+LIBS="-ldld $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 2212 "ltconfig"
+#include "confdefs.h"
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char shl_load();
+
+int main() {
+shl_load()
+; return 0; }
+EOF
+if { (eval echo $progname:2226: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_lib_dld_shl_load=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_lib_dld_shl_load=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dld_shl_load" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="shl_load" lt_cv_dlopen_libs="-ldld"
+else
+ echo "$ac_t""no" 1>&6
+fi
+
+
+fi
+
+
+fi
+
+
+fi
+
+
+fi
+
+fi
+
+ if test "x$lt_cv_dlopen" != xno; then
+ enable_dlopen=yes
+ fi
+
+ case "$lt_cv_dlopen" in
+ dlopen)
+for ac_hdr in dlfcn.h; do
+ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
+echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
+echo "$progname:2269: checking for $ac_hdr" >&5
+if eval "test \"`echo 'X$''{'ac_cv_header_$ac_safe'+set}'`\" = Xset"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 2274 "ltconfig"
+#include <$ac_hdr>
+int fnord = 0;
+int main () { }
+EOF
+ac_try="$ac_compile >/dev/null 2>conftest.out"
+{ (eval echo $progname:2280: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ eval "ac_cv_header_$ac_safe=yes"
+else
+ echo "$ac_err" >&5
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ eval "ac_cv_header_$ac_safe=no"
+fi
+rm -f conftest*
+fi
+if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
+ echo "$ac_t""yes" 1>&6
+else
+ echo "$ac_t""no" 1>&6
+fi
+done
+
+ if test "x$ac_cv_header_dlfcn_h" = xyes; then
+ CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"
+ fi
+ eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\"
+ LIBS="$lt_cv_dlopen_libs $LIBS"
+
+ echo $ac_n "checking whether a program can dlopen itself""... $ac_c" 1>&6
+echo "$progname:2308: checking whether a program can dlopen itself" >&5
+if test "X${lt_cv_dlopen_self+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test "$cross_compiling" = yes; then
+ lt_cv_dlopen_self=cross
+ else
+ cat > conftest.c <<EOF
+#line 2316 "ltconfig"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+# define LTDL_GLOBAL RTLD_GLOBAL
+#else
+# ifdef DL_GLOBAL
+# define LTDL_GLOBAL DL_GLOBAL
+# else
+# define LTDL_GLOBAL 0
+# endif
+#endif
+
+/* We may have to define LTDL_LAZY_OR_NOW in the command line if we
+ find out it does not work in some platform. */
+#ifndef LTDL_LAZY_OR_NOW
+# ifdef RTLD_LAZY
+# define LTDL_LAZY_OR_NOW RTLD_LAZY
+# else
+# ifdef DL_LAZY
+# define LTDL_LAZY_OR_NOW DL_LAZY
+# else
+# ifdef RTLD_NOW
+# define LTDL_LAZY_OR_NOW RTLD_NOW
+# else
+# ifdef DL_NOW
+# define LTDL_LAZY_OR_NOW DL_NOW
+# else
+# define LTDL_LAZY_OR_NOW 0
+# endif
+# endif
+# endif
+# endif
+#endif
+
+fnord() { int i=42;}
+main() { void *self, *ptr1, *ptr2; self=dlopen(0,LTDL_GLOBAL|LTDL_LAZY_OR_NOW);
+ if(self) { ptr1=dlsym(self,"fnord"); ptr2=dlsym(self,"_fnord");
+ if(ptr1 || ptr2) { dlclose(self); exit(0); } } exit(1); }
+
+EOF
+if { (eval echo $progname:2362: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
+then
+ lt_cv_dlopen_self=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -fr conftest*
+ lt_cv_dlopen_self=no
+fi
+rm -fr conftest*
+fi
+
+fi
+
+echo "$ac_t""$lt_cv_dlopen_self" 1>&6
+
+ if test "$lt_cv_dlopen_self" = yes; then
+ LDFLAGS="$LDFLAGS $link_static_flag"
+ echo $ac_n "checking whether a statically linked program can dlopen itself""... $ac_c" 1>&6
+echo "$progname:2381: checking whether a statically linked program can dlopen itself" >&5
+if test "X${lt_cv_dlopen_self_static+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test "$cross_compiling" = yes; then
+ lt_cv_dlopen_self_static=cross
+ else
+ cat > conftest.c <<EOF
+#line 2389 "ltconfig"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+# define LTDL_GLOBAL RTLD_GLOBAL
+#else
+# ifdef DL_GLOBAL
+# define LTDL_GLOBAL DL_GLOBAL
+# else
+# define LTDL_GLOBAL 0
+# endif
+#endif
+
+/* We may have to define LTDL_LAZY_OR_NOW in the command line if we
+ find out it does not work in some platform. */
+#ifndef LTDL_LAZY_OR_NOW
+# ifdef RTLD_LAZY
+# define LTDL_LAZY_OR_NOW RTLD_LAZY
+# else
+# ifdef DL_LAZY
+# define LTDL_LAZY_OR_NOW DL_LAZY
+# else
+# ifdef RTLD_NOW
+# define LTDL_LAZY_OR_NOW RTLD_NOW
+# else
+# ifdef DL_NOW
+# define LTDL_LAZY_OR_NOW DL_NOW
+# else
+# define LTDL_LAZY_OR_NOW 0
+# endif
+# endif
+# endif
+# endif
+#endif
+
+fnord() { int i=42;}
+main() { void *self, *ptr1, *ptr2; self=dlopen(0,LTDL_GLOBAL|LTDL_LAZY_OR_NOW);
+ if(self) { ptr1=dlsym(self,"fnord"); ptr2=dlsym(self,"_fnord");
+ if(ptr1 || ptr2) { dlclose(self); exit(0); } } exit(1); }
+
+EOF
+if { (eval echo $progname:2435: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
+then
+ lt_cv_dlopen_self_static=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -fr conftest*
+ lt_cv_dlopen_self_static=no
+fi
+rm -fr conftest*
+fi
+
+fi
+
+echo "$ac_t""$lt_cv_dlopen_self_static" 1>&6
+fi
+ ;;
+ esac
+
+ case "$lt_cv_dlopen_self" in
+ yes|no) enable_dlopen_self=$lt_cv_dlopen_self ;;
+ *) enable_dlopen_self=unknown ;;
+ esac
+
+ case "$lt_cv_dlopen_self_static" in
+ yes|no) enable_dlopen_self_static=$lt_cv_dlopen_self_static ;;
+ *) enable_dlopen_self_static=unknown ;;
+ esac
+fi
+
+# Copy echo and quote the copy, instead of the original, because it is
+# used later.
+ltecho="$echo"
+if test "X$ltecho" = "X$CONFIG_SHELL $0 --fallback-echo"; then
+ ltecho="$CONFIG_SHELL \$0 --fallback-echo"
+fi
+LTSHELL="$SHELL"
+
+LTCONFIG_VERSION="$VERSION"
+
+# Only quote variables if we're using ltmain.sh.
+case "$ltmain" in
+*.sh)
+ # Now quote all the things that may contain metacharacters.
+ for var in ltecho old_AR old_CC old_CFLAGS old_CPPFLAGS \
+ old_MAGIC old_LD old_LDFLAGS old_LIBS \
+ old_LN_S old_NM old_RANLIB old_STRIP \
+ old_AS old_DLLTOOL old_OBJDUMP \
+ old_OBJEXT old_EXEEXT old_reload_flag \
+ old_deplibs_check_method old_file_magic_cmd \
+ AR CC LD LN_S NM LTSHELL LTCONFIG_VERSION \
+ reload_flag reload_cmds wl \
+ pic_flag link_static_flag no_builtin_flag export_dynamic_flag_spec \
+ thread_safe_flag_spec whole_archive_flag_spec libname_spec \
+ library_names_spec soname_spec \
+ RANLIB old_archive_cmds old_archive_from_new_cmds old_postinstall_cmds \
+ old_postuninstall_cmds archive_cmds archive_expsym_cmds postinstall_cmds \
+ postuninstall_cmds extract_expsyms_cmds old_archive_from_expsyms_cmds \
+ old_striplib striplib file_magic_cmd export_symbols_cmds \
+ deplibs_check_method allow_undefined_flag no_undefined_flag \
+ finish_cmds finish_eval global_symbol_pipe global_symbol_to_cdecl \
+ hardcode_libdir_flag_spec hardcode_libdir_separator \
+ sys_lib_search_path_spec sys_lib_dlsearch_path_spec \
+ compiler_c_o compiler_o_lo need_locks exclude_expsyms include_expsyms; do
+
+ case "$var" in
+ reload_cmds | old_archive_cmds | old_archive_from_new_cmds | \
+ old_postinstall_cmds | old_postuninstall_cmds | \
+ export_symbols_cmds | archive_cmds | archive_expsym_cmds | \
+ extract_expsyms_cmds | old_archive_from_expsyms_cmds | \
+ postinstall_cmds | postuninstall_cmds | \
+ finish_cmds | sys_lib_search_path_spec | sys_lib_dlsearch_path_spec)
+ # Double-quote double-evaled strings.
+ eval "$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$double_quote_subst\" -e \"\$sed_quote_subst\" -e \"\$delay_variable_subst\"\`\\\"" ### testsuite: skip nested quoting test
+ ;;
+ *)
+ eval "$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$sed_quote_subst\"\`\\\"" ### testsuite: skip nested quoting test
+ ;;
+ esac
+ done
+
+ case "$ltecho" in
+ *'\$0 --fallback-echo"')
+ ltecho=`$echo "X$ltecho" | $Xsed -e 's/\\\\\\\$0 --fallback-echo"$/$0 --fallback-echo"/'`
+ ;;
+ esac
+
+ trap "$rm \"$ofile\"; exit 1" 1 2 15
+ echo "creating $ofile"
+ $rm "$ofile"
+ cat <<EOF > "$ofile"
+#! $SHELL
+
+# `$echo "$ofile" | sed 's%^.*/%%'` - Provide generalized library-building support services.
+# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP)
+# NOTE: Changes made to this file will be lost: look at ltconfig or ltmain.sh.
+#
+# Copyright (C) 1996-2000 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Sed that helps us avoid accidentally triggering echo(1) options like -n.
+Xsed="sed -e s/^X//"
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X\${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+### BEGIN LIBTOOL CONFIG
+EOF
+ cfgfile="$ofile"
+ ;;
+
+*)
+ # Double-quote the variables that need it (for aesthetics).
+ for var in old_AR old_CC old_CFLAGS old_CPPFLAGS \
+ old_MAGIC old_LD old_LDFLAGS old_LIBS \
+ old_LN_S old_NM old_RANLIB old_STRIP \
+ old_AS old_DLLTOOL old_OBJDUMP \
+ old_OBJEXT old_EXEEXT old_reload_flag \
+ old_deplibs_check_method old_file_magic_cmd; do
+ eval "$var=\\\"\$var\\\""
+ done
+
+ # Just create a config file.
+ cfgfile="$ofile.cfg"
+ trap "$rm \"$cfgfile\"; exit 1" 1 2 15
+ echo "creating $cfgfile"
+ $rm "$cfgfile"
+ cat <<EOF > "$cfgfile"
+# `$echo "$cfgfile" | sed 's%^.*/%%'` - Libtool configuration file.
+# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP)
+EOF
+ ;;
+esac
+
+cat <<EOF >> "$cfgfile"
+# Libtool was configured as follows, on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
+#
+# AR=$old_AR CC=$old_CC CFLAGS=$old_CFLAGS CPPFLAGS=$old_CPPFLAGS \\
+# MAGIC=$old_MAGIC LD=$old_LD LDFLAGS=$old_LDFLAGS LIBS=$old_LIBS \\
+# LN_S=$old_LN_S NM=$old_NM RANLIB=$old_RANLIB STRIP=$old_STRIP \\
+# AS=$old_AS DLLTOOL=$old_DLLTOOL OBJDUMP=$old_OBJDUMP \\
+# objext=$old_OBJEXT exeext=$old_EXEEXT reload_flag=$old_reload_flag \\
+# deplibs_check_method=$old_deplibs_check_method file_magic_cmd=$old_file_magic_cmd \\
+# $0$ltconfig_args
+#
+# Compiler and other test output produced by $progname, useful for
+# debugging $progname, is in ./config.log if it exists.
+# The version of $progname that generated this script.
+LTCONFIG_VERSION=$LTCONFIG_VERSION
+
+# Shell to use when invoking shell scripts.
+SHELL=$LTSHELL
+
+# Whether or not to build shared libraries.
+build_libtool_libs=$enable_shared
+
+# Whether or not to build static libraries.
+build_old_libs=$enable_static
+
+# Whether or not to optimize for fast installation.
+fast_install=$enable_fast_install
+
+# The host system.
+host_alias=$host_alias
+host=$host
+
+# An echo program that does not interpret backslashes.
+echo=$ltecho
+
+# The archiver.
+AR=$AR
+
+# The default C compiler.
+CC=$CC
+
+# The linker used to build libraries.
+LD=$LD
+
+# Whether we need hard or soft links.
+LN_S=$LN_S
+
+# A BSD-compatible nm program.
+NM=$NM
+
+# A symbol stripping program
+STRIP=$STRIP
+
+# Used to examine libraries when file_magic_cmd begins "file"
+MAGIC=$MAGIC
+
+# Used on cygwin: DLL creation program.
+DLLTOOL="$DLLTOOL"
+
+# Used on cygwin: object dumper.
+OBJDUMP="$OBJDUMP"
+
+# Used on cygwin: assembler.
+AS="$AS"
+
+# The name of the directory that contains temporary libtool files.
+objdir=$objdir
+
+# How to create reloadable object files.
+reload_flag=$reload_flag
+reload_cmds=$reload_cmds
+
+# How to pass a linker flag through the compiler.
+wl=$wl
+
+# Object file suffix (normally "o").
+objext="$objext"
+
+# Old archive suffix (normally "a").
+libext="$libext"
+
+# Executable file suffix (normally "").
+exeext="$exeext"
+
+# Additional compiler flags for building library objects.
+pic_flag=$pic_flag
+pic_mode=$pic_mode
+
+# Does compiler simultaneously support -c and -o options?
+compiler_c_o=$compiler_c_o
+
+# Can we write directly to a .lo ?
+compiler_o_lo=$compiler_o_lo
+
+# Must we lock files when doing compilation ?
+need_locks=$need_locks
+
+# Do we need the lib prefix for modules?
+need_lib_prefix=$need_lib_prefix
+
+# Do we need a version for libraries?
+need_version=$need_version
+
+# Whether dlopen is supported.
+dlopen_support=$enable_dlopen
+
+# Whether dlopen of programs is supported.
+dlopen_self=$enable_dlopen_self
+
+# Whether dlopen of statically linked programs is supported.
+dlopen_self_static=$enable_dlopen_self_static
+
+# Compiler flag to prevent dynamic linking.
+link_static_flag=$link_static_flag
+
+# Compiler flag to turn off builtin functions.
+no_builtin_flag=$no_builtin_flag
+
+# Compiler flag to allow reflexive dlopens.
+export_dynamic_flag_spec=$export_dynamic_flag_spec
+
+# Compiler flag to generate shared objects directly from archives.
+whole_archive_flag_spec=$whole_archive_flag_spec
+
+# Compiler flag to generate thread-safe objects.
+thread_safe_flag_spec=$thread_safe_flag_spec
+
+# Library versioning type.
+version_type=$version_type
+
+# Format of library name prefix.
+libname_spec=$libname_spec
+
+# List of archive names. First name is the real one, the rest are links.
+# The last name is the one that the linker finds with -lNAME.
+library_names_spec=$library_names_spec
+
+# The coded name of the library, if different from the real name.
+soname_spec=$soname_spec
+
+# Commands used to build and install an old-style archive.
+RANLIB=$RANLIB
+old_archive_cmds=$old_archive_cmds
+old_postinstall_cmds=$old_postinstall_cmds
+old_postuninstall_cmds=$old_postuninstall_cmds
+
+# Create an old-style archive from a shared archive.
+old_archive_from_new_cmds=$old_archive_from_new_cmds
+
+# Create a temporary old-style archive to link instead of a shared archive.
+old_archive_from_expsyms_cmds=$old_archive_from_expsyms_cmds
+
+# Commands used to build and install a shared archive.
+archive_cmds=$archive_cmds
+archive_expsym_cmds=$archive_expsym_cmds
+postinstall_cmds=$postinstall_cmds
+postuninstall_cmds=$postuninstall_cmds
+
+# Commands to strip libraries.
+old_striplib=$old_striplib
+striplib=$striplib
+
+# Method to check whether dependent libraries are shared objects.
+deplibs_check_method=$deplibs_check_method
+
+# Command to use when deplibs_check_method == file_magic.
+file_magic_cmd=$file_magic_cmd
+
+# Flag that allows shared libraries with undefined symbols to be built.
+allow_undefined_flag=$allow_undefined_flag
+
+# Flag that forces no undefined symbols.
+no_undefined_flag=$no_undefined_flag
+
+# Commands used to finish a libtool library installation in a directory.
+finish_cmds=$finish_cmds
+
+# Same as above, but a single script fragment to be evaled but not shown.
+finish_eval=$finish_eval
+
+# Take the output of nm and produce a listing of raw symbols and C names.
+global_symbol_pipe=$global_symbol_pipe
+
+# Transform the output of nm in a proper C declaration
+global_symbol_to_cdecl=$global_symbol_to_cdecl
+
+# This is the shared library runtime path variable.
+runpath_var=$runpath_var
+
+# This is the shared library path variable.
+shlibpath_var=$shlibpath_var
+
+# Is shlibpath searched before the hard-coded library search path?
+shlibpath_overrides_runpath=$shlibpath_overrides_runpath
+
+# How to hardcode a shared library path into an executable.
+hardcode_action=$hardcode_action
+
+# Whether we should hardcode library paths into libraries.
+hardcode_into_libs=$hardcode_into_libs
+
+# Flag to hardcode \$libdir into a binary during linking.
+# This must work even if \$libdir does not exist.
+hardcode_libdir_flag_spec=$hardcode_libdir_flag_spec
+
+# Whether we need a single -rpath flag with a separated argument.
+hardcode_libdir_separator=$hardcode_libdir_separator
+
+# Set to yes if using DIR/libNAME.so during linking hardcodes DIR into the
+# resulting binary.
+hardcode_direct=$hardcode_direct
+
+# Set to yes if using the -LDIR flag during linking hardcodes DIR into the
+# resulting binary.
+hardcode_minus_L=$hardcode_minus_L
+
+# Set to yes if using SHLIBPATH_VAR=DIR during linking hardcodes DIR into
+# the resulting binary.
+hardcode_shlibpath_var=$hardcode_shlibpath_var
+
+# Whether libtool must link a program against all its dependency libraries.
+link_all_deplibs=$link_all_deplibs
+
+# Compile-time system search path for libraries
+sys_lib_search_path_spec=$sys_lib_search_path_spec
+
+# Run-time system search path for libraries
+sys_lib_dlsearch_path_spec=$sys_lib_dlsearch_path_spec
+
+# Fix the shell variable \$srcfile for the compiler.
+fix_srcfile_path="$fix_srcfile_path"
+
+# Set to yes if exported symbols are required.
+always_export_symbols=$always_export_symbols
+
+# The commands to list exported symbols.
+export_symbols_cmds=$export_symbols_cmds
+
+# The commands to extract the exported symbol list from a shared archive.
+extract_expsyms_cmds=$extract_expsyms_cmds
+
+# Symbols that should not be listed in the preloaded symbols.
+exclude_expsyms=$exclude_expsyms
+
+# Symbols that must always be exported.
+include_expsyms=$include_expsyms
+
+EOF
+
+case "$ltmain" in
+*.sh)
+ echo '### END LIBTOOL CONFIG' >> "$ofile"
+ echo >> "$ofile"
+ case "$host_os" in
+ aix3*)
+ cat <<\EOF >> "$ofile"
+
+# AIX sometimes has problems with the GCC collect2 program. For some
+# reason, if we set the COLLECT_NAMES environment variable, the problems
+# vanish in a puff of smoke.
+if test "X${COLLECT_NAMES+set}" != Xset; then
+ COLLECT_NAMES=
+ export COLLECT_NAMES
+fi
+EOF
+ ;;
+ esac
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2*)
+ cat <<'EOF' >> "$ofile"
+ # This is a source program that is used to create dlls on Windows
+ # Don't remove nor modify the starting and closing comments
+# /* ltdll.c starts here */
+# #define WIN32_LEAN_AND_MEAN
+# #include <windows.h>
+# #undef WIN32_LEAN_AND_MEAN
+# #include <stdio.h>
+#
+# #ifndef __CYGWIN__
+# # ifdef __CYGWIN32__
+# # define __CYGWIN__ __CYGWIN32__
+# # endif
+# #endif
+#
+# #ifdef __cplusplus
+# extern "C" {
+# #endif
+# BOOL APIENTRY DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved);
+# #ifdef __cplusplus
+# }
+# #endif
+#
+# #ifdef __CYGWIN__
+# #include <cygwin/cygwin_dll.h>
+# DECLARE_CYGWIN_DLL( DllMain );
+# #endif
+# HINSTANCE __hDllInstance_base;
+#
+# BOOL APIENTRY
+# DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved)
+# {
+# __hDllInstance_base = hInst;
+# return TRUE;
+# }
+# /* ltdll.c ends here */
+ # This is a source program that is used to create import libraries
+ # on Windows for dlls which lack them. Don't remove nor modify the
+ # starting and closing comments
+# /* impgen.c starts here */
+# /* Copyright (C) 1999-2000 Free Software Foundation, Inc.
+#
+# This file is part of GNU libtool.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# */
+#
+# #include <stdio.h> /* for printf() */
+# #include <unistd.h> /* for open(), lseek(), read() */
+# #include <fcntl.h> /* for O_RDONLY, O_BINARY */
+# #include <string.h> /* for strdup() */
+#
+# /* O_BINARY isn't required (or even defined sometimes) under Unix */
+# #ifndef O_BINARY
+# #define O_BINARY 0
+# #endif
+#
+# static unsigned int
+# pe_get16 (fd, offset)
+# int fd;
+# int offset;
+# {
+# unsigned char b[2];
+# lseek (fd, offset, SEEK_SET);
+# read (fd, b, 2);
+# return b[0] + (b[1]<<8);
+# }
+#
+# static unsigned int
+# pe_get32 (fd, offset)
+# int fd;
+# int offset;
+# {
+# unsigned char b[4];
+# lseek (fd, offset, SEEK_SET);
+# read (fd, b, 4);
+# return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+# }
+#
+# static unsigned int
+# pe_as32 (ptr)
+# void *ptr;
+# {
+# unsigned char *b = ptr;
+# return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+# }
+#
+# int
+# main (argc, argv)
+# int argc;
+# char *argv[];
+# {
+# int dll;
+# unsigned long pe_header_offset, opthdr_ofs, num_entries, i;
+# unsigned long export_rva, export_size, nsections, secptr, expptr;
+# unsigned long name_rvas, nexp;
+# unsigned char *expdata, *erva;
+# char *filename, *dll_name;
+#
+# filename = argv[1];
+#
+# dll = open(filename, O_RDONLY|O_BINARY);
+# if (!dll)
+# return 1;
+#
+# dll_name = filename;
+#
+# for (i=0; filename[i]; i++)
+# if (filename[i] == '/' || filename[i] == '\\' || filename[i] == ':')
+# dll_name = filename + i +1;
+#
+# pe_header_offset = pe_get32 (dll, 0x3c);
+# opthdr_ofs = pe_header_offset + 4 + 20;
+# num_entries = pe_get32 (dll, opthdr_ofs + 92);
+#
+# if (num_entries < 1) /* no exports */
+# return 1;
+#
+# export_rva = pe_get32 (dll, opthdr_ofs + 96);
+# export_size = pe_get32 (dll, opthdr_ofs + 100);
+# nsections = pe_get16 (dll, pe_header_offset + 4 +2);
+# secptr = (pe_header_offset + 4 + 20 +
+# pe_get16 (dll, pe_header_offset + 4 + 16));
+#
+# expptr = 0;
+# for (i = 0; i < nsections; i++)
+# {
+# char sname[8];
+# unsigned long secptr1 = secptr + 40 * i;
+# unsigned long vaddr = pe_get32 (dll, secptr1 + 12);
+# unsigned long vsize = pe_get32 (dll, secptr1 + 16);
+# unsigned long fptr = pe_get32 (dll, secptr1 + 20);
+# lseek(dll, secptr1, SEEK_SET);
+# read(dll, sname, 8);
+# if (vaddr <= export_rva && vaddr+vsize > export_rva)
+# {
+# expptr = fptr + (export_rva - vaddr);
+# if (export_rva + export_size > vaddr + vsize)
+# export_size = vsize - (export_rva - vaddr);
+# break;
+# }
+# }
+#
+# expdata = (unsigned char*)malloc(export_size);
+# lseek (dll, expptr, SEEK_SET);
+# read (dll, expdata, export_size);
+# erva = expdata - export_rva;
+#
+# nexp = pe_as32 (expdata+24);
+# name_rvas = pe_as32 (expdata+32);
+#
+# printf ("EXPORTS\n");
+# for (i = 0; i<nexp; i++)
+# {
+# unsigned long name_rva = pe_as32 (erva+name_rvas+i*4);
+# printf ("\t%s @ %ld ;\n", erva+name_rva, 1+ i);
+# }
+#
+# return 0;
+# }
+# /* impgen.c ends here */
+
+EOF
+ ;;
+ esac
+
+
+ # Append the ltmain.sh script.
+ sed '$q' "$ltmain" >> "$ofile" || (rm -f "$ofile"; exit 1)
+ # We use sed instead of cat because bash on DJGPP gets confused if
+ # if finds mixed CR/LF and LF-only lines. Since sed operates in
+ # text mode, it properly converts lines to CR/LF. This bash problem
+ # is reportedly fixed, but why not run on old versions too?
+
+ chmod +x "$ofile"
+ ;;
+
+*)
+ # Compile the libtool program.
+ echo "FIXME: would compile $ltmain"
+ ;;
+esac
+
+test -n "$cache_file" || exit 0
+
+# AC_CACHE_SAVE
+trap '' 1 2 15
+cat > confcache <<\EOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs. It is not useful on other systems.
+# If it contains results you don't want to keep, you may remove or edit it.
+#
+# By default, configure uses ./config.cache as the cache file,
+# creating it if it does not exist already. You can give configure
+# the --cache-file=FILE option to use a different cache file; that is
+# what configure does when it calls configure scripts in
+# subdirectories, so they share the cache.
+# Giving --cache-file=/dev/null disables caching, for debugging configure.
+# config.status only pays attention to the cache file if you give it the
+# --recheck option to rerun configure.
+#
+EOF
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(set) 2>&1 |
+ case `(ac_space=' '; set | grep ac_space) 2>&1` in
+ *ac_space=\ *)
+ # `set' does not quote correctly, so add quotes (double-quote substitution
+ # turns \\\\ into \\, and sed turns \\ into \).
+ sed -n \
+ -e "s/'/'\\\\''/g" \
+ -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
+ ;;
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
+ ;;
+ esac >> confcache
+if cmp -s $cache_file confcache; then
+ :
+else
+ if test -w $cache_file; then
+ echo "updating cache $cache_file"
+ cat confcache > $cache_file
+ else
+ echo "not updating unwritable cache $cache_file"
+ fi
+fi
+rm -f confcache
+
+exit 0
+
+# Local Variables:
+# mode:shell-script
+# sh-indentation:2
+# End:
diff --git a/rts/gmp/ltmain.sh b/rts/gmp/ltmain.sh
new file mode 100644
index 0000000000..d81d89f878
--- /dev/null
+++ b/rts/gmp/ltmain.sh
@@ -0,0 +1,4692 @@
+# ltmain.sh - Provide generalized library-building support services.
+# NOTE: Changing this file will not affect anything until you rerun ltconfig.
+#
+# Copyright (C) 1996-2000 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Check that we have a working $echo.
+if test "X$1" = X--no-reexec; then
+ # Discard the --no-reexec flag, and continue.
+ shift
+elif test "X$1" = X--fallback-echo; then
+ # Avoid inline document here, it may be left over
+ :
+elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then
+ # Yippee, $echo works!
+ :
+else
+ # Restart under the correct shell, and then maybe $echo will work.
+ exec $SHELL "$0" --no-reexec ${1+"$@"}
+fi
+
+if test "X$1" = X--fallback-echo; then
+ # used as fallback echo
+ shift
+ cat <<EOF
+$*
+EOF
+ exit 0
+fi
+
+# The name of this program.
+progname=`$echo "$0" | sed 's%^.*/%%'`
+modename="$progname"
+
+# Constants.
+PROGRAM=ltmain.sh
+PACKAGE=libtool
+VERSION=1.3c
+TIMESTAMP=" (1.696 2000/03/14 20:22:42)"
+
+default_mode=
+help="Try \`$progname --help' for more information."
+magic="%%%MAGIC variable%%%"
+mkdir="mkdir"
+mv="mv -f"
+rm="rm -f"
+
+# Sed substitution that helps us do robust quoting. It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e 1s/^X//'
+sed_quote_subst='s/\([\\`\\"$\\\\]\)/\\\1/g'
+SP2NL='tr \040 \012'
+NL2SP='tr \015\012 \040\040'
+
+# NLS nuisances.
+# Only set LANG and LC_ALL to C if already set.
+# These must not be set unconditionally because not all systems understand
+# e.g. LANG=C (notably SCO).
+# We save the old values to restore during execute mode.
+if test "${LC_ALL+set}" = set; then
+ save_LC_ALL="$LC_ALL"; LC_ALL=C; export LC_ALL
+fi
+if test "${LANG+set}" = set; then
+ save_LANG="$LANG"; LANG=C; export LANG
+fi
+
+if test "$LTCONFIG_VERSION" != "$VERSION"; then
+ echo "$modename: ltconfig version \`$LTCONFIG_VERSION' does not match $PROGRAM version \`$VERSION'" 1>&2
+ echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
+ exit 1
+fi
+
+if test "$build_libtool_libs" != yes && test "$build_old_libs" != yes; then
+ echo "$modename: not configured to build any kind of library" 1>&2
+ echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
+ exit 1
+fi
+
+# Global variables.
+mode=$default_mode
+nonopt=
+prev=
+prevopt=
+run=
+show="$echo"
+show_help=
+execute_dlfiles=
+lo2o="s/\\.lo\$/.${objext}/"
+o2lo="s/\\.${objext}\$/.lo/"
+
+# Parse our command line options once, thoroughly.
+while test $# -gt 0
+do
+ arg="$1"
+ shift
+
+ case "$arg" in
+ -*=*) optarg=`$echo "X$arg" | $Xsed -e 's/[-_a-zA-Z0-9]*=//'` ;;
+ *) optarg= ;;
+ esac
+
+ # If the previous option needs an argument, assign it.
+ if test -n "$prev"; then
+ case "$prev" in
+ execute_dlfiles)
+ eval "$prev=\"\$$prev \$arg\""
+ ;;
+ *)
+ eval "$prev=\$arg"
+ ;;
+ esac
+
+ prev=
+ prevopt=
+ continue
+ fi
+
+ # Have we seen a non-optional argument yet?
+ case "$arg" in
+ --help)
+ show_help=yes
+ ;;
+
+ --version)
+ echo "$PROGRAM (GNU $PACKAGE) $VERSION$TIMESTAMP"
+ exit 0
+ ;;
+
+ --config)
+ sed -e '1,/^### BEGIN LIBTOOL CONFIG/d' -e '/^### END LIBTOOL CONFIG/,$d' $0
+ exit 0
+ ;;
+
+ --debug)
+ echo "$progname: enabling shell trace mode"
+ set -x
+ ;;
+
+ --dry-run | -n)
+ run=:
+ ;;
+
+ --features)
+ echo "host: $host"
+ if test "$build_libtool_libs" = yes; then
+ echo "enable shared libraries"
+ else
+ echo "disable shared libraries"
+ fi
+ if test "$build_old_libs" = yes; then
+ echo "enable static libraries"
+ else
+ echo "disable static libraries"
+ fi
+ exit 0
+ ;;
+
+ --finish) mode="finish" ;;
+
+ --mode) prevopt="--mode" prev=mode ;;
+ --mode=*) mode="$optarg" ;;
+
+ --quiet | --silent)
+ show=:
+ ;;
+
+ -dlopen)
+ prevopt="-dlopen"
+ prev=execute_dlfiles
+ ;;
+
+ -*)
+ $echo "$modename: unrecognized option \`$arg'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+
+ *)
+ nonopt="$arg"
+ break
+ ;;
+ esac
+done
+
+if test -n "$prevopt"; then
+ $echo "$modename: option \`$prevopt' requires an argument" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+fi
+
+if test -z "$show_help"; then
+
+ # Infer the operation mode.
+ if test -z "$mode"; then
+ case "$nonopt" in
+ *cc | *++ | gcc* | *-gcc*)
+ mode=link
+ for arg
+ do
+ case "$arg" in
+ -c)
+ mode=compile
+ break
+ ;;
+ esac
+ done
+ ;;
+ *db | *dbx | *strace | *truss)
+ mode=execute
+ ;;
+ *install*|cp|mv)
+ mode=install
+ ;;
+ *rm)
+ mode=uninstall
+ ;;
+ *)
+ # If we have no mode, but dlfiles were specified, then do execute mode.
+ test -n "$execute_dlfiles" && mode=execute
+
+ # Just use the default operation mode.
+ if test -z "$mode"; then
+ if test -n "$nonopt"; then
+ $echo "$modename: warning: cannot infer operation mode from \`$nonopt'" 1>&2
+ else
+ $echo "$modename: warning: cannot infer operation mode without MODE-ARGS" 1>&2
+ fi
+ fi
+ ;;
+ esac
+ fi
+
+ # Only execute mode is allowed to have -dlopen flags.
+ if test -n "$execute_dlfiles" && test "$mode" != execute; then
+ $echo "$modename: unrecognized option \`-dlopen'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Change the help message to a mode-specific one.
+ generic_help="$help"
+ help="Try \`$modename --help --mode=$mode' for more information."
+
+ # These modes are in order of execution frequency so that they run quickly.
+ case "$mode" in
+ # libtool compile mode
+ compile)
+ modename="$modename: compile"
+ # Get the compilation command and the source file.
+ base_compile=
+ prev=
+ lastarg=
+ srcfile="$nonopt"
+ suppress_output=
+
+ user_target=no
+ for arg
+ do
+ case "$prev" in
+ "") ;;
+ xcompiler)
+ # Aesthetically quote the previous argument.
+ prev=
+ lastarg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+
+ case "$arg" in
+ # Double-quote args containing other shell metacharacters.
+ # Many Bourne shells cannot handle close brackets correctly
+ # in scan sets, so we specify it separately.
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ arg="\"$arg\""
+ ;;
+ esac
+
+ # Add the previous argument to base_compile.
+ if test -z "$base_compile"; then
+ base_compile="$lastarg"
+ else
+ base_compile="$base_compile $lastarg"
+ fi
+ continue
+ ;;
+ esac
+
+ # Accept any command-line options.
+ case "$arg" in
+ -o)
+ if test "$user_target" != "no"; then
+ $echo "$modename: you cannot specify \`-o' more than once" 1>&2
+ exit 1
+ fi
+ user_target=next
+ ;;
+
+ -static)
+ build_old_libs=yes
+ continue
+ ;;
+
+ -Xcompiler)
+ prev=xcompiler
+ continue
+ ;;
+
+ -Wc,*)
+ args=`$echo "X$arg" | $Xsed -e "s/^-Wc,//"`
+ lastarg=
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=','
+ for arg in $args; do
+ IFS="$save_ifs"
+
+ # Double-quote args containing other shell metacharacters.
+ # Many Bourne shells cannot handle close brackets correctly
+ # in scan sets, so we specify it separately.
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ arg="\"$arg\""
+ ;;
+ esac
+ lastarg="$lastarg $arg"
+ done
+ IFS="$save_ifs"
+ lastarg=`$echo "X$lastarg" | $Xsed -e "s/^ //"`
+
+ # Add the arguments to base_compile.
+ if test -z "$base_compile"; then
+ base_compile="$lastarg"
+ else
+ base_compile="$base_compile $lastarg"
+ fi
+ continue
+ ;;
+ esac
+
+ case "$user_target" in
+ next)
+ # The next one is the -o target name
+ user_target=yes
+ continue
+ ;;
+ yes)
+ # We got the output file
+ user_target=set
+ libobj="$arg"
+ continue
+ ;;
+ esac
+
+ # Accept the current argument as the source file.
+ lastarg="$srcfile"
+ srcfile="$arg"
+
+ # Aesthetically quote the previous argument.
+
+ # Backslashify any backslashes, double quotes, and dollar signs.
+ # These are the only characters that are still specially
+ # interpreted inside of double-quoted scrings.
+ lastarg=`$echo "X$lastarg" | $Xsed -e "$sed_quote_subst"`
+
+ # Double-quote args containing other shell metacharacters.
+ # Many Bourne shells cannot handle close brackets correctly
+ # in scan sets, so we specify it separately.
+ case "$lastarg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ lastarg="\"$lastarg\""
+ ;;
+ esac
+
+ # Add the previous argument to base_compile.
+ if test -z "$base_compile"; then
+ base_compile="$lastarg"
+ else
+ base_compile="$base_compile $lastarg"
+ fi
+ done
+
+ case "$user_target" in
+ set)
+ ;;
+ no)
+ # Get the name of the library object.
+ libobj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%'`
+ ;;
+ *)
+ $echo "$modename: you must specify a target with \`-o'" 1>&2
+ exit 1
+ ;;
+ esac
+
+ # Recognize several different file suffixes.
+ # If the user specifies -o file.o, it is replaced with file.lo
+ xform='[cCFSfmso]'
+ case "$libobj" in
+ *.ada) xform=ada ;;
+ *.adb) xform=adb ;;
+ *.ads) xform=ads ;;
+ *.asm) xform=asm ;;
+ *.c++) xform=c++ ;;
+ *.cc) xform=cc ;;
+ *.cpp) xform=cpp ;;
+ *.cxx) xform=cxx ;;
+ *.f90) xform=f90 ;;
+ *.for) xform=for ;;
+ esac
+
+ libobj=`$echo "X$libobj" | $Xsed -e "s/\.$xform$/.lo/"`
+
+ case "$libobj" in
+ *.lo) obj=`$echo "X$libobj" | $Xsed -e "$lo2o"` ;;
+ *)
+ $echo "$modename: cannot determine name of library object from \`$libobj'" 1>&2
+ exit 1
+ ;;
+ esac
+
+ if test -z "$base_compile"; then
+ $echo "$modename: you must specify a compilation command" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Delete any leftover library objects.
+ if test "$build_old_libs" = yes; then
+ removelist="$obj $libobj"
+ else
+ removelist="$libobj"
+ fi
+
+ $run $rm $removelist
+ trap "$run $rm $removelist; exit 1" 1 2 15
+
+ # Calculate the filename of the output object if compiler does
+ # not support -o with -c
+ if test "$compiler_c_o" = no; then
+ output_obj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%' -e 's%\..*$%%'`.${objext}
+ lockfile="$output_obj.lock"
+ removelist="$removelist $output_obj $lockfile"
+ trap "$run $rm $removelist; exit 1" 1 2 15
+ else
+ need_locks=no
+ lockfile=
+ fi
+
+ # Lock this critical section if it is needed
+ # We use this script file to make the link, it avoids creating a new file
+ if test "$need_locks" = yes; then
+ until ln "$0" "$lockfile" 2>/dev/null; do
+ $show "Waiting for $lockfile to be removed"
+ sleep 2
+ done
+ elif test "$need_locks" = warn; then
+ if test -f "$lockfile"; then
+ echo "\
+*** ERROR, $lockfile exists and contains:
+`cat $lockfile 2>/dev/null`
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together. If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+ $run $rm $removelist
+ exit 1
+ fi
+ echo $srcfile > "$lockfile"
+ fi
+
+ if test -n "$fix_srcfile_path"; then
+ eval srcfile=\"$fix_srcfile_path\"
+ fi
+
+ # Only build a PIC object if we are building libtool libraries.
+ if test "$build_libtool_libs" = yes; then
+ # Without this assignment, base_compile gets emptied.
+ fbsd_hideous_sh_bug=$base_compile
+
+ if test "$pic_mode" != no; then
+ # All platforms use -DPIC, to notify preprocessed assembler code.
+ command="$base_compile $srcfile $pic_flag -DPIC"
+ else
+ # Don't build PIC code
+ command="$base_compile $srcfile"
+ fi
+ if test "$build_old_libs" = yes; then
+ lo_libobj="$libobj"
+ dir=`$echo "X$libobj" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$dir" = "X$libobj"; then
+ dir="$objdir"
+ else
+ dir="$dir/$objdir"
+ fi
+ libobj="$dir/"`$echo "X$libobj" | $Xsed -e 's%^.*/%%'`
+
+ if test -d "$dir"; then
+ $show "$rm $libobj"
+ $run $rm $libobj
+ else
+ $show "$mkdir $dir"
+ $run $mkdir $dir
+ status=$?
+ if test $status -ne 0 && test ! -d $dir; then
+ exit $status
+ fi
+ fi
+ fi
+ if test "$compiler_o_lo" = yes; then
+ output_obj="$libobj"
+ command="$command -o $output_obj"
+ elif test "$compiler_c_o" = yes; then
+ output_obj="$obj"
+ command="$command -o $output_obj"
+ fi
+
+ $run $rm "$output_obj"
+ $show "$command"
+ if $run eval "$command"; then :
+ else
+ test -n "$output_obj" && $run $rm $removelist
+ exit 1
+ fi
+
+ if test "$need_locks" = warn &&
+ test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then
+ echo "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together. If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+ $run $rm $removelist
+ exit 1
+ fi
+
+ # Just move the object if needed, then go on to compile the next one
+ if test x"$output_obj" != x"$libobj"; then
+ $show "$mv $output_obj $libobj"
+ if $run $mv $output_obj $libobj; then :
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+ fi
+
+ # If we have no pic_flag, then copy the object into place and finish.
+ if (test -z "$pic_flag" || test "$pic_mode" != default) &&
+ test "$build_old_libs" = yes; then
+ # Rename the .lo from within objdir to obj
+ if test -f $obj; then
+ $show $rm $obj
+ $run $rm $obj
+ fi
+
+ $show "$mv $libobj $obj"
+ if $run $mv $libobj $obj; then :
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+
+ xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$obj"; then
+ xdir="."
+ else
+ xdir="$xdir"
+ fi
+ baseobj=`$echo "X$obj" | $Xsed -e "s%.*/%%"`
+ libobj=`$echo "X$baseobj" | $Xsed -e "$o2lo"`
+ # Now arrange that obj and lo_libobj become the same file
+ $show "(cd $xdir && $LN_S $baseobj $libobj)"
+ if $run eval '(cd $xdir && $LN_S $baseobj $libobj)'; then
+ exit 0
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+ fi
+
+ # Allow error messages only from the first compilation.
+ suppress_output=' >/dev/null 2>&1'
+ fi
+
+ # Only build a position-dependent object if we build old libraries.
+ if test "$build_old_libs" = yes; then
+ if test "$pic_mode" != yes; then
+ # Don't build PIC code
+ command="$base_compile $srcfile"
+ else
+ # All platforms use -DPIC, to notify preprocessed assembler code.
+ command="$base_compile $srcfile $pic_flag -DPIC"
+ fi
+ if test "$compiler_c_o" = yes; then
+ command="$command -o $obj"
+ output_obj="$obj"
+ fi
+
+ # Suppress compiler output if we already did a PIC compilation.
+ command="$command$suppress_output"
+ $run $rm "$output_obj"
+ $show "$command"
+ if $run eval "$command"; then :
+ else
+ $run $rm $removelist
+ exit 1
+ fi
+
+ if test "$need_locks" = warn &&
+ test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then
+ echo "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together. If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+ $run $rm $removelist
+ exit 1
+ fi
+
+ # Just move the object if needed
+ if test x"$output_obj" != x"$obj"; then
+ $show "$mv $output_obj $obj"
+ if $run $mv $output_obj $obj; then :
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+ fi
+
+ # Create an invalid libtool object if no PIC, so that we do not
+ # accidentally link it into a program.
+ if test "$build_libtool_libs" != yes; then
+ $show "echo timestamp > $libobj"
+ $run eval "echo timestamp > \$libobj" || exit $?
+ else
+ # Move the .lo from within objdir
+ $show "$mv $libobj $lo_libobj"
+ if $run $mv $libobj $lo_libobj; then :
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+ fi
+ fi
+
+ # Unlock the critical section if it was locked
+ if test "$need_locks" != no; then
+ $rm "$lockfile"
+ fi
+
+ exit 0
+ ;;
+
+ # libtool link mode
+ link | relink)
+ modename="$modename: link"
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2*)
+ # It is impossible to link a dll without this setting, and
+ # we shouldn't force the makefile maintainer to figure out
+ # which system we are compiling for in order to pass an extra
+ # flag for every libtool invokation.
+ # allow_undefined=no
+
+ # FIXME: Unfortunately, there are problems with the above when trying
+ # to make a dll which has undefined symbols, in which case not
+ # even a static library is built. For now, we need to specify
+ # -no-undefined on the libtool link line when we can be certain
+ # that all symbols are satisfied, otherwise we get a static library.
+ allow_undefined=yes
+ ;;
+ *)
+ allow_undefined=yes
+ ;;
+ esac
+ libtool_args="$nonopt"
+ compile_command="$nonopt"
+ finalize_command="$nonopt"
+
+ compile_rpath=
+ finalize_rpath=
+ compile_shlibpath=
+ finalize_shlibpath=
+ convenience=
+ old_convenience=
+ deplibs=
+ old_deplibs=
+ compiler_flags=
+ linker_flags=
+ dllsearchpath=
+ lib_search_path=`pwd`
+
+ avoid_version=no
+ dlfiles=
+ dlprefiles=
+ dlself=no
+ export_dynamic=no
+ export_symbols=
+ export_symbols_regex=
+ generated=
+ libobjs=
+ ltlibs=
+ module=no
+ no_install=no
+ objs=
+ prefer_static_libs=no
+ preload=no
+ prev=
+ prevarg=
+ release=
+ rpath=
+ xrpath=
+ perm_rpath=
+ temp_rpath=
+ thread_safe=no
+ vinfo=
+
+ # We need to know -static, to get the right output filenames.
+ for arg
+ do
+ case "$arg" in
+ -all-static | -static)
+ if test "X$arg" = "X-all-static"; then
+ if test "$build_libtool_libs" = yes && test -z "$link_static_flag"; then
+ $echo "$modename: warning: complete static linking is impossible in this configuration" 1>&2
+ fi
+ if test -n "$link_static_flag"; then
+ dlopen_self=$dlopen_self_static
+ fi
+ else
+ if test -z "$pic_flag" && test -n "$link_static_flag"; then
+ dlopen_self=$dlopen_self_static
+ fi
+ fi
+ build_libtool_libs=no
+ build_old_libs=yes
+ prefer_static_libs=yes
+ break
+ ;;
+ esac
+ done
+
+ # See if our shared archives depend on static archives.
+ test -n "$old_archive_from_new_cmds" && build_old_libs=yes
+
+ # Go through the arguments, transforming them on the way.
+ while test $# -gt 0; do
+ arg="$1"
+ shift
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ qarg=\"`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`\" ### testsuite: skip nested quoting test
+ ;;
+ *) qarg=$arg ;;
+ esac
+ libtool_args="$libtool_args $qarg"
+
+ # If the previous option needs an argument, assign it.
+ if test -n "$prev"; then
+ case "$prev" in
+ output)
+ compile_command="$compile_command @OUTPUT@"
+ finalize_command="$finalize_command @OUTPUT@"
+ ;;
+ esac
+
+ case "$prev" in
+ dlfiles|dlprefiles)
+ if test "$preload" = no; then
+ # Add the symbol object into the linking commands.
+ compile_command="$compile_command @SYMFILE@"
+ finalize_command="$finalize_command @SYMFILE@"
+ preload=yes
+ fi
+ case "$arg" in
+ *.la | *.lo) ;; # We handle these cases below.
+ force)
+ if test "$dlself" = no; then
+ dlself=needless
+ export_dynamic=yes
+ fi
+ prev=
+ continue
+ ;;
+ self)
+ if test "$prev" = dlprefiles; then
+ dlself=yes
+ elif test "$prev" = dlfiles && test "$dlopen_self" != yes; then
+ dlself=yes
+ else
+ dlself=needless
+ export_dynamic=yes
+ fi
+ prev=
+ continue
+ ;;
+ *)
+ if test "$prev" = dlfiles; then
+ dlfiles="$dlfiles $arg"
+ else
+ dlprefiles="$dlprefiles $arg"
+ fi
+ prev=
+ continue
+ ;;
+ esac
+ ;;
+ expsyms)
+ export_symbols="$arg"
+ if test ! -f "$arg"; then
+ $echo "$modename: symbol file \`$arg' does not exist"
+ exit 1
+ fi
+ prev=
+ continue
+ ;;
+ expsyms_regex)
+ export_symbols_regex="$arg"
+ prev=
+ continue
+ ;;
+ release)
+ release="-$arg"
+ prev=
+ continue
+ ;;
+ rpath | xrpath)
+ # We need an absolute path.
+ case "$arg" in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ $echo "$modename: only absolute run-paths are allowed" 1>&2
+ exit 1
+ ;;
+ esac
+ if test "$prev" = rpath; then
+ case "$rpath " in
+ *" $arg "*) ;;
+ *) rpath="$rpath $arg" ;;
+ esac
+ else
+ case "$xrpath " in
+ *" $arg "*) ;;
+ *) xrpath="$xrpath $arg" ;;
+ esac
+ fi
+ prev=
+ continue
+ ;;
+ xcompiler)
+ compiler_flags="$compiler_flags $qarg"
+ prev=
+ compile_command="$compile_command $qarg"
+ finalize_command="$finalize_command $qarg"
+ continue
+ ;;
+ xlinker)
+ linker_flags="$linker_flags $qarg"
+ compiler_flags="$compiler_flags $wl$qarg"
+ prev=
+ compile_command="$compile_command $wl$qarg"
+ finalize_command="$finalize_command $wl$qarg"
+ continue
+ ;;
+ *)
+ eval "$prev=\"\$arg\""
+ prev=
+ continue
+ ;;
+ esac
+ fi
+
+ prevarg="$arg"
+
+ case "$arg" in
+ -all-static)
+ if test -n "$link_static_flag"; then
+ compile_command="$compile_command $link_static_flag"
+ finalize_command="$finalize_command $link_static_flag"
+ fi
+ continue
+ ;;
+
+ -allow-undefined)
+ # FIXME: remove this flag sometime in the future.
+ $echo "$modename: \`-allow-undefined' is deprecated because it is the default" 1>&2
+ continue
+ ;;
+
+ -avoid-version)
+ avoid_version=yes
+ continue
+ ;;
+
+ -dlopen)
+ prev=dlfiles
+ continue
+ ;;
+
+ -dlpreopen)
+ prev=dlprefiles
+ continue
+ ;;
+
+ -export-dynamic)
+ export_dynamic=yes
+ continue
+ ;;
+
+ -export-symbols | -export-symbols-regex)
+ if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
+ $echo "$modename: not more than one -exported-symbols argument allowed"
+ exit 1
+ fi
+ if test "X$arg" = "X-export-symbols"; then
+ prev=expsyms
+ else
+ prev=expsyms_regex
+ fi
+ continue
+ ;;
+
+ -L*)
+ dir=`$echo "X$arg" | $Xsed -e 's/^-L//'`
+ # We need an absolute path.
+ case "$dir" in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ absdir=`cd "$dir" && pwd`
+ if test -z "$absdir"; then
+ $echo "$modename: cannot determine absolute directory name of \`$dir'" 1>&2
+ exit 1
+ fi
+ dir="$absdir"
+ ;;
+ esac
+ case "$deplibs " in
+ *" -L$dir "*) ;;
+ *)
+ deplibs="$deplibs -L$dir"
+ lib_search_path="$lib_search_path $dir"
+ ;;
+ esac
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2*)
+ case ":$dllsearchpath:" in
+ *":$dir:"*) ;;
+ *) dllsearchpath="$dllsearchpath:$dir";;
+ esac
+ ;;
+ esac
+ continue
+ ;;
+
+ -l*)
+ if test "$arg" = "-lc"; then
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2* | *-*-beos*)
+ # These systems don't actually have c library (as such)
+ continue
+ ;;
+ esac
+ elif test "$arg" = "-lm"; then
+ case "$host" in
+ *-*-cygwin* | *-*-beos*)
+ # These systems don't actually have math library (as such)
+ continue
+ ;;
+ esac
+ fi
+ deplibs="$deplibs $arg"
+ continue
+ ;;
+
+ -module)
+ module=yes
+ continue
+ ;;
+
+ -no-fast-install)
+ fast_install=no
+ continue
+ ;;
+
+ -no-install)
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2*)
+ # The PATH hackery in wrapper scripts is required on Windows
+ # in order for the loader to find any dlls it needs.
+ $echo "$modename: warning: \`-no-install' is ignored for $host" 1>&2
+ $echo "$modename: warning: assuming \`-no-fast-install' instead" 1>&2
+ fast_install=no
+ ;;
+ *)
+ no_install=yes
+ ;;
+ esac
+ continue
+ ;;
+
+ -no-undefined)
+ allow_undefined=no
+ continue
+ ;;
+
+ -o) prev=output ;;
+
+ -release)
+ prev=release
+ continue
+ ;;
+
+ -rpath)
+ prev=rpath
+ continue
+ ;;
+
+ -R)
+ prev=xrpath
+ continue
+ ;;
+
+ -R*)
+ dir=`$echo "X$arg" | $Xsed -e 's/^-R//'`
+ # We need an absolute path.
+ case "$dir" in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ $echo "$modename: only absolute run-paths are allowed" 1>&2
+ exit 1
+ ;;
+ esac
+ case "$xrpath " in
+ *" $dir "*) ;;
+ *) xrpath="$xrpath $dir" ;;
+ esac
+ continue
+ ;;
+
+ -static)
+ # If we have no pic_flag, then this is the same as -all-static.
+ if test -z "$pic_flag" && test -n "$link_static_flag"; then
+ compile_command="$compile_command $link_static_flag"
+ finalize_command="$finalize_command $link_static_flag"
+ fi
+ continue
+ ;;
+
+ -thread-safe)
+ thread_safe=yes
+ continue
+ ;;
+
+ -version-info)
+ prev=vinfo
+ continue
+ ;;
+
+ -Wc,*)
+ args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wc,//'`
+ arg=
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=','
+ for flag in $args; do
+ IFS="$save_ifs"
+ case "$flag" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ flag="\"$flag\""
+ ;;
+ esac
+ arg="$arg $wl$flag"
+ compiler_flags="$compiler_flags $flag"
+ done
+ IFS="$save_ifs"
+ arg=`$echo "X$arg" | $Xsed -e "s/^ //"`
+ ;;
+
+ -Wl,*)
+ args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wl,//'`
+ arg=
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=','
+ for flag in $args; do
+ IFS="$save_ifs"
+ case "$flag" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ flag="\"$flag\""
+ ;;
+ esac
+ arg="$arg $wl$flag"
+ compiler_flags="$compiler_flags $wl$flag"
+ linker_flags="$linker_flags $flag"
+ done
+ IFS="$save_ifs"
+ arg=`$echo "X$arg" | $Xsed -e "s/^ //"`
+ ;;
+
+ -Xcompiler)
+ prev=xcompiler
+ continue
+ ;;
+
+ -Xlinker)
+ prev=xlinker
+ continue
+ ;;
+
+ # Some other compiler flag.
+ -* | +*)
+ # Unknown arguments in both finalize_command and compile_command need
+ # to be aesthetically quoted because they are evaled later.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ arg="\"$arg\""
+ ;;
+ esac
+ ;;
+
+ *.$objext)
+ # A standard object.
+ objs="$objs $arg"
+ ;;
+
+ *.lo)
+ # A library object.
+ if test "$prev" = dlfiles; then
+ # This file was specified with -dlopen.
+ if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then
+ dlfiles="$dlfiles $arg"
+ prev=
+ continue
+ else
+ # If libtool objects are unsupported, then we need to preload.
+ prev=dlprefiles
+ fi
+ fi
+
+ if test "$prev" = dlprefiles; then
+ # Preload the old-style object.
+ dlprefiles="$dlprefiles "`$echo "X$arg" | $Xsed -e "$lo2o"`
+ prev=
+ else
+ libobjs="$libobjs $arg"
+ fi
+ ;;
+
+ *.$libext)
+ # An archive.
+ deplibs="$deplibs $arg"
+ old_deplibs="$old_deplibs $arg"
+ continue
+ ;;
+
+ *.la)
+ # A libtool-controlled library.
+
+ if test "$prev" = dlfiles; then
+ # This library was specified with -dlopen.
+ dlfiles="$dlfiles $arg"
+ prev=
+ elif test "$prev" = dlprefiles; then
+ # The library was specified with -dlpreopen.
+ dlprefiles="$dlprefiles $arg"
+ prev=
+ else
+ deplibs="$deplibs $arg"
+ fi
+ continue
+ ;;
+
+ # Some other compiler argument.
+ *)
+ # Unknown arguments in both finalize_command and compile_command need
+ # to be aesthetically quoted because they are evaled later.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ arg="\"$arg\""
+ ;;
+ esac
+ ;;
+ esac
+
+ # Now actually substitute the argument into the commands.
+ if test -n "$arg"; then
+ compile_command="$compile_command $arg"
+ finalize_command="$finalize_command $arg"
+ fi
+ done
+
+ if test -n "$prev"; then
+ $echo "$modename: the \`$prevarg' option requires an argument" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ if test "$export_dynamic" = yes && test -n "$export_dynamic_flag_spec"; then
+ eval arg=\"$export_dynamic_flag_spec\"
+ compile_command="$compile_command $arg"
+ finalize_command="$finalize_command $arg"
+ fi
+
+ oldlibs=
+ # calculate the name of the file, without its directory
+ outputname=`$echo "X$output" | $Xsed -e 's%^.*/%%'`
+ libobjs_save="$libobjs"
+
+ if test -n "$shlibpath_var"; then
+ # get the directories listed in $shlibpath_var
+ eval shlib_search_path=\`\$echo \"X \${$shlibpath_var}\" \| \$Xsed -e \'s/:/ /g\'\`
+ else
+ shlib_search_path=
+ fi
+ eval sys_lib_search_path=\"$sys_lib_search_path_spec\"
+ eval sys_lib_dlsearch_path=\"$sys_lib_dlsearch_path_spec\"
+ lib_search_path="$lib_search_path $sys_lib_search_path $shlib_search_path"
+
+ output_objdir=`$echo "X$output" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$output_objdir" = "X$output"; then
+ output_objdir="$objdir"
+ else
+ output_objdir="$output_objdir/$objdir"
+ fi
+ # Create the object directory.
+ if test ! -d $output_objdir; then
+ $show "$mkdir $output_objdir"
+ $run $mkdir $output_objdir
+ status=$?
+ if test $status -ne 0 && test ! -d $output_objdir; then
+ exit $status
+ fi
+ fi
+
+ case "$output" in
+ "")
+ $echo "$modename: you must specify an output file" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+ *.$libext)
+ linkmode=oldlib ;;
+ *.lo | *.$objext)
+ linkmode=obj ;;
+ *.la)
+ linkmode=lib ;;
+ *) # Anything else should be a program.
+ linkmode=prog ;;
+ esac
+
+ specialdeplibs=
+ libs=
+ # Find all interdependent deplibs that
+ # are linked more than once (e.g. -la -lb -la)
+ for deplib in $deplibs; do
+ case "$libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ libs="$libs $deplib"
+ done
+ deplibs=
+ newdependency_libs=
+ uninst_path= # paths that contain uninstalled libtool libraries
+ new_lib_search_path=
+ need_relink=no # whether we're linking any uninstalled libtool libraries
+ case $linkmode in
+ lib)
+ passes="link"
+ for file in $dlfiles $dlprefiles; do
+ case "$file" in
+ *.la) ;;
+ *)
+ $echo "$modename: libraries can \`-dlopen' only libtool libraries" 1>&2
+ exit 1
+ ;;
+ esac
+ done
+ ;;
+ prog)
+ compile_deplibs=
+ finalize_deplibs=
+ alldeplibs=no
+ newdlfiles=
+ newdlprefiles=
+ link_against_libtool_libs=
+ passes="scan dlopen dlpreopen link"
+ ;;
+ *) passes="link"
+ ;;
+ esac
+ for pass in $passes; do
+ if test $linkmode = prog; then
+ case $pass in
+ dlopen) libs="$dlfiles" ;;
+ dlpreopen) libs="$dlprefiles" ;;
+ link) libs="$deplibs %DEPLIBS% $dependency_libs" ;;
+ esac
+ fi
+ if test $pass = dlopen; then
+ # Collect dlpreopened libraries
+ save_deplibs="$deplibs"
+ deplibs=
+ fi
+ for deplib in $libs; do
+ lib=
+ found=no
+ case "$deplib" in
+ -l*)
+ if test $linkmode != lib && test $linkmode != prog; then
+ $echo "$modename: warning: \`-l' is ignored for archives/objects" 1>&2
+ continue
+ fi
+ name=`$echo "X$deplib" | $Xsed -e 's/^-l//'`
+ for searchdir in $lib_search_path; do
+ # Search the libtool library
+ lib="$searchdir/lib${name}.la"
+ if test -f "$lib"; then
+ found=yes
+ break
+ fi
+ done
+ if test "$found" != yes; then
+ if test "$linkmode,$pass" = "prog,link"; then
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ else
+ deplibs="$deplib $deplibs"
+ test $linkmode = lib && newdependency_libs="$deplib $newdependency_libs"
+ fi
+ continue
+ fi
+ ;;
+ -L*)
+ case $linkmode in
+ lib)
+ deplibs="$deplib $deplibs"
+ newdependency_libs="$deplib $newdependency_libs"
+ new_lib_search_path="$new_lib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`
+ ;;
+ prog)
+ if test $pass = scan; then
+ deplibs="$deplib $deplibs"
+ new_lib_search_path="$new_lib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`
+ else
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ fi
+ ;;
+ *)
+ $echo "$modename: warning: \`-L' is ignored for archives/objects" 1>&2
+ ;;
+ esac
+ continue
+ ;;
+ -R*)
+ if test "$linkmode,$pass" = "prog,link"; then
+ dir=`$echo "X$deplib" | $Xsed -e 's/^-R//'`
+ # Make sure the xrpath contains only unique directories.
+ case "$xrpath " in
+ *" $dir "*) ;;
+ *) xrpath="$xrpath $dir" ;;
+ esac
+ fi
+ continue
+ ;;
+ *.la) lib="$deplib" ;;
+ *.$libext)
+ case $linkmode in
+ lib)
+ if test "$deplibs_check_method" != pass_all; then
+ echo
+ echo "*** Warning: This library needs some functionality provided by $deplib."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ else
+ echo
+ echo "*** Warning: Linking the shared library $output against the"
+ echo "*** static library $deplib is not portable!"
+ deplibs="$deplib $deplibs"
+ fi
+ continue
+ ;;
+ prog)
+ if test $pass != link; then
+ deplibs="$deplib $deplibs"
+ else
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ fi
+ continue
+ ;;
+ esac
+ ;;
+ *.lo | *.$objext)
+ if test $linkmode = prog; then
+ if test $pass = dlpreopen || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+ # If there is no dlopen support or we're linking statically,
+ # we need to preload.
+ newdlprefiles="$newdlprefiles $deplib"
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ else
+ newdlfiles="$newdlfiles $deplib"
+ fi
+ fi
+ continue
+ ;;
+ %DEPLIBS%)
+ alldeplibs=yes
+ continue
+ ;;
+ esac
+ if test $found = yes || test -f "$lib"; then :
+ else
+ $echo "$modename: cannot find the library \`$lib'" 1>&2
+ exit 1
+ fi
+
+ # Check to see that this really is a libtool archive.
+ if (sed -e '2q' $lib | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+ else
+ $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+
+ ladir=`$echo "X$lib" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$ladir" = "X$lib" && ladir="."
+
+ dlname=
+ dlopen=
+ dlpreopen=
+ libdir=
+ library_names=
+ old_library=
+ # If the library was installed with an old release of libtool,
+ # it will not redefine variable installed.
+ installed=yes
+
+ # Read the .la file
+ case "$lib" in
+ */* | *\\*) . $lib ;;
+ *) . ./$lib ;;
+ esac
+
+ if test $linkmode = lib || test "$linkmode,$pass" = "prog,scan"; then
+ test -n "$dlopen" && dlfiles="$dlfiles $dlopen"
+ test -n "$dlpreopen" && dlprefiles="$dlprefiles $dlpreopen"
+ fi
+
+ if test $linkmode != lib && test $linkmode != prog; then
+ # only check for convenience libraries
+ if test -z "$old_library"; then
+ $echo "$modename: cannot find name of link library for \`$lib'" 1>&2
+ exit 1
+ fi
+ if test -n "$libdir"; then
+ $echo "$modename: \`$lib' is not a convenience library" 1>&2
+ exit 1
+ fi
+ # It is a libtool convenience library, so add in its objects.
+ convenience="$convenience $ladir/$objdir/$old_library"
+ old_convenience="$old_convenience $ladir/$objdir/$old_library"
+ continue
+ fi
+
+ # Get the name of the library we link against.
+ linklib=
+ for l in $old_library $library_names; do
+ linklib="$l"
+ done
+ if test -z "$linklib"; then
+ $echo "$modename: cannot find name of link library for \`$lib'" 1>&2
+ exit 1
+ fi
+
+ # This library was specified with -dlopen.
+ if test $pass = dlopen; then
+ if test -z "$dlname" || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+ # If there is no dlname, no dlopen support or we're linking statically,
+ # we need to preload.
+ dlprefiles="$dlprefiles $lib"
+ else
+ newdlfiles="$newdlfiles $lib"
+ fi
+ continue
+ fi
+
+ # We need an absolute path.
+ case "$ladir" in
+ [\\/]* | [A-Za-z]:[\\/]*) abs_ladir="$ladir" ;;
+ *)
+ abs_ladir=`cd "$ladir" && pwd`
+ if test -z "$abs_ladir"; then
+ $echo "$modename: warning: cannot determine absolute directory name of \`$ladir'" 1>&2
+ $echo "$modename: passing it literally to the linker, although it might fail" 1>&2
+ abs_ladir="$ladir"
+ fi
+ ;;
+ esac
+ laname=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+
+ # Find the relevant object directory and library name.
+ if test "X$installed" = Xyes; then
+ if test ! -f "$libdir/$linklib" && test -f "$abs_ladir/$linklib"; then
+ $echo "$modename: warning: library \`$lib' was moved." 1>&2
+ dir="$ladir"
+ absdir="$abs_ladir"
+ libdir="$abs_ladir"
+ else
+ dir="$libdir"
+ absdir="$libdir"
+ fi
+ else
+ dir="$ladir/$objdir"
+ absdir="$abs_ladir/$objdir"
+ # Remove this search path later
+ uninst_path="$uninst_path $abs_ladir"
+ fi
+ name=`$echo "X$laname" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
+
+ # This library was specified with -dlpreopen.
+ if test $pass = dlpreopen; then
+ # Prefer using a static library (so that no silly _DYNAMIC symbols
+ # are required to link).
+ if test -n "$old_library"; then
+ newdlprefiles="$newdlprefiles $dir/$old_library"
+ else
+ newdlprefiles="$newdlprefiles $dir/$linklib"
+ fi
+ fi
+
+ if test $linkmode = prog && test $pass != link; then
+ new_lib_search_path="$new_lib_search_path $ladir"
+ deplibs="$lib $deplibs"
+
+ linkalldeplibs=no
+ if test "$link_all_deplibs" != no || test "$fast_install" != no || \
+ test "$build_libtool_libs" = no || test -z "$library_names"; then
+ linkalldeplibs=yes
+ fi
+
+ tmp_libs=
+ for deplib in $dependency_libs; do
+ case "$deplib" in
+ -L*) new_lib_search_path="$new_lib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`;; ### testsuite: skip nested quoting test
+ esac
+ # Need to link against all dependency_libs?
+ if test $linkalldeplibs = yes; then
+ deplibs="$deplib $deplibs"
+ else
+ # Need to hardcode shared library paths
+ # or/and link against static libraries
+ newdependency_libs="$deplib $newdependency_libs"
+ fi
+ case "$tmp_libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ tmp_libs="$tmp_libs $deplib"
+ done
+ continue
+ fi
+
+ if test -z "$libdir"; then
+ # It is a libtool convenience library, so add in its objects.
+ convenience="$convenience $dir/$old_library"
+ old_convenience="$old_convenience $dir/$old_library"
+ if test $linkmode = lib; then
+ deplibs="$dir/$old_library $deplibs"
+ tmp_libs=
+ for deplib in $dependency_libs; do
+ newdependency_libs="$deplib $newdependency_libs"
+ case "$tmp_libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ tmp_libs="$tmp_libs $deplib"
+ done
+ elif test "$linkmode,$pass" = "prog,link"; then
+ compile_deplibs="$dir/$old_library $compile_deplibs"
+ finalize_deplibs="$dir/$old_library $finalize_deplibs"
+ fi
+ continue
+ fi
+
+ if test "$linkmode,$pass" = "prog,link"; then
+ if test -n "$library_names" &&
+ { test "$hardcode_into_libs" != all || test "$alldeplibs" != yes; } &&
+ { test "$prefer_static_libs" = no || test -z "$old_library"; }; then
+ # We need to hardcode the library path
+ if test -n "$shlibpath_var"; then
+ # Make sure the rpath contains only unique directories.
+ case "$temp_rpath " in
+ *" $dir "*) ;;
+ *" $absdir "*) ;;
+ *) temp_rpath="$temp_rpath $dir" ;;
+ esac
+ fi
+
+ # Hardcode the library path.
+ # Skip directories that are in the system default run-time
+ # search path.
+ case " $sys_lib_dlsearch_path " in
+ *" $absdir "*) ;;
+ *)
+ case "$compile_rpath " in
+ *" $absdir "*) ;;
+ *) compile_rpath="$compile_rpath $absdir"
+ esac
+ ;;
+ esac
+
+ case " $sys_lib_dlsearch_path " in
+ *" $libdir "*) ;;
+ *)
+ case "$finalize_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_rpath="$finalize_rpath $libdir"
+ esac
+ ;;
+ esac
+ fi
+
+ if test "$alldeplibs" = yes &&
+ { test "$deplibs_check_method" = pass_all ||
+ { test "$build_libtool_libs" = yes &&
+ test -n "$library_names"; }; }; then
+ # Do we only need to link against static libraries?
+ continue
+ fi
+ fi
+
+ link_static=no # Whether this library is linked statically
+ if test -n "$library_names" &&
+ { test "$prefer_static_libs" = no || test -z "$old_library"; }; then
+ link_against_libtool_libs="$link_against_libtool_libs $lib"
+ test "X$installed" = xno && need_relink=yes
+ # This is a shared library
+ if test $linkmode = lib && test "$hardcode_into_libs" = all; then
+ # Hardcode the library path.
+ # Skip directories that are in the system default run-time
+ # search path.
+ case " $sys_lib_dlsearch_path " in
+ *" $absdir "*) ;;
+ *)
+ case "$compile_rpath " in
+ *" $absdir "*) ;;
+ *) compile_rpath="$compile_rpath $absdir"
+ esac
+ ;;
+ esac
+ case " $sys_lib_dlsearch_path " in
+ *" $libdir "*) ;;
+ *)
+ case "$finalize_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_rpath="$finalize_rpath $libdir"
+ esac
+ ;;
+ esac
+ fi
+
+ if test -n "$old_archive_from_expsyms_cmds"; then
+ # figure out the soname
+ set dummy $library_names
+ realname="$2"
+ shift; shift
+ libname=`eval \\$echo \"$libname_spec\"`
+ if test -n "$soname_spec"; then
+ eval soname=\"$soname_spec\"
+ else
+ soname="$realname"
+ fi
+
+ # Make a new name for the extract_expsyms_cmds to use
+ newlib="libimp-`echo $soname | sed 's/^lib//;s/\.dll$//'`.a"
+
+ # If the library has no export list, then create one now
+ if test -f "$output_objdir/$soname-def"; then :
+ else
+ $show "extracting exported symbol list from \`$soname'"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ eval cmds=\"$extract_expsyms_cmds\"
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ fi
+
+ # Create $newlib
+ if test -f "$output_objdir/$newlib"; then :; else
+ $show "generating import library for \`$soname'"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ eval cmds=\"$old_archive_from_expsyms_cmds\"
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ fi
+ # make sure the library variables are pointing to the new library
+ dir=$output_objdir
+ linklib=$newlib
+ fi
+
+ if test $linkmode = prog || test "$mode" != relink; then
+ add_shlibpath=
+ add_dir=
+ add=
+ lib_linked=yes
+ case "$hardcode_action" in
+ immediate | unsupported)
+ if test "$hardcode_direct" = no; then
+ add="$dir/$linklib"
+ elif test "$hardcode_minus_L" = no; then
+ case "$host" in
+ *-*-sunos*) add_shlibpath="$dir" ;;
+ esac
+ add_dir="-L$dir"
+ add="-l$name"
+ elif test "$hardcode_shlibpath_var" = no; then
+ add_shlibpath="$dir"
+ add="-l$name"
+ else
+ lib_linked=no
+ fi
+ ;;
+ relink)
+ if test "$hardcode_direct" = yes; then
+ add="$dir/$linklib"
+ elif test "$hardcode_minus_L" = yes; then
+ add_dir="-L$dir"
+ add="-l$name"
+ elif test "$hardcode_shlibpath_var" = yes; then
+ add_shlibpath="$dir"
+ add="-l$name"
+ else
+ lib_linked=no
+ fi
+ ;;
+ *) lib_linked=no ;;
+ esac
+
+ if test "$lib_linked" != yes; then
+ $echo "$modename: configuration error: unsupported hardcode properties"
+ exit 1
+ fi
+
+ if test -n "$add_shlibpath"; then
+ case ":$compile_shlibpath:" in
+ *":$add_shlibpath:"*) ;;
+ *) compile_shlibpath="$compile_shlibpath$add_shlibpath:" ;;
+ esac
+ fi
+ if test $linkmode = prog; then
+ test -n "$add_dir" && compile_deplibs="$add_dir $compile_deplibs"
+ test -n "$add" && compile_deplibs="$add $compile_deplibs"
+ else
+ test -n "$add_dir" && deplibs="$add_dir $deplibs"
+ test -n "$add" && deplibs="$add $deplibs"
+ if test "$hardcode_direct" != yes && \
+ test "$hardcode_minus_L" != yes && \
+ test "$hardcode_shlibpath_var" = yes; then
+ case ":$finalize_shlibpath:" in
+ *":$libdir:"*) ;;
+ *) finalize_shlibpath="$finalize_shlibpath$libdir:" ;;
+ esac
+ fi
+ fi
+ fi
+
+ if test $linkmode = prog || test "$mode" = relink; then
+ add_shlibpath=
+ add_dir=
+ add=
+ # Finalize command for both is simple: just hardcode it.
+ if test "$hardcode_direct" = yes; then
+ add="$libdir/$linklib"
+ elif test "$hardcode_minus_L" = yes; then
+ add_dir="-L$libdir"
+ add="-l$name"
+ elif test "$hardcode_shlibpath_var" = yes; then
+ case ":$finalize_shlibpath:" in
+ *":$libdir:"*) ;;
+ *) finalize_shlibpath="$finalize_shlibpath$libdir:" ;;
+ esac
+ add="-l$name"
+ else
+ # We cannot seem to hardcode it, guess we'll fake it.
+ add_dir="-L$libdir"
+ add="-l$name"
+ fi
+
+ if test $linkmode = prog; then
+ test -n "$add_dir" && finalize_deplibs="$add_dir $finalize_deplibs"
+ test -n "$add" && finalize_deplibs="$add $finalize_deplibs"
+ else
+ test -n "$add_dir" && deplibs="$add_dir $deplibs"
+ test -n "$add" && deplibs="$add deplibs"
+ fi
+ fi
+ elif test $linkmode = prog; then
+ # Here we assume that one of hardcode_direct or hardcode_minus_L
+ # is not unsupported. This is valid on all known static and
+ # shared platforms.
+ if test "$hardcode_direct" != unsupported; then
+ test -n "$old_library" && linklib="$old_library"
+ compile_deplibs="$dir/$linklib $compile_deplibs"
+ finalize_deplibs="$dir/$linklib $finalize_deplibs"
+ else
+ compile_deplibs="-l$name -L$dir $compile_deplibs"
+ finalize_deplibs="-l$name -L$dir $finalize_deplibs"
+ fi
+ elif test "$build_libtool_libs" = yes; then
+ # Not a shared library
+ if test "$deplibs_check_method" != pass_all; then
+ # We're trying link a shared library against a static one
+ # but the system doesn't support it.
+ # Just print a warning and add the library to dependency_libs so
+ # that the program can be linked against the static library.
+ echo
+ echo "*** Warning: This library needs some functionality provided by $lib."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ else
+ convenience="$convenience $dir/$old_library"
+ old_convenience="$old_convenience $dir/$old_library"
+ deplibs="$dir/$old_library $deplibs"
+ link_static=yes
+ fi
+ fi
+
+ if test $linkmode = lib; then
+ if test -n "$dependency_libs" &&
+ { test "$hardcode_into_libs" = no || test $build_old_libs = yes ||
+ test $link_static = yes; }; then
+ # Extract -R from dependency_libs
+ temp_deplibs=
+ for libdir in $dependency_libs; do
+ case "$libdir" in
+ -R*) temp_xrpath=`$echo "X$libdir" | $Xsed -e 's/^-R//'`
+ case " $xrpath " in
+ *" $temp_xrpath "*) ;;
+ *) xrpath="$xrpath $temp_xrpath";;
+ esac;;
+ *) temp_deplibs="$temp_deplibs $libdir";;
+ esac
+ done
+ dependency_libs="$temp_deplibs"
+ fi
+
+ new_lib_search_path="$new_lib_search_path $absdir"
+ # Link against this library
+ test "$link_static" = no && newdependency_libs="$abs_ladir/$laname $newdependency_libs"
+ # ... and its dependency_libs
+ tmp_libs=
+ for deplib in $dependency_libs; do
+ newdependency_libs="$deplib $newdependency_libs"
+ case "$tmp_libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ tmp_libs="$tmp_libs $deplib"
+ done
+
+ if test $link_all_deplibs != no; then
+ # Add the search paths of all dependency libraries
+ for deplib in $dependency_libs; do
+ case "$deplib" in
+ -L*) path="$deplib" ;;
+ *.la)
+ dir=`$echo "X$deplib" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$dir" = "X$deplib" && dir="."
+ # We need an absolute path.
+ case "$dir" in
+ [\\/]* | [A-Za-z]:[\\/]*) absdir="$dir" ;;
+ *)
+ absdir=`cd "$dir" && pwd`
+ if test -z "$absdir"; then
+ $echo "$modename: warning: cannot determine absolute directory name of \`$dir'" 1>&2
+ absdir="$dir"
+ fi
+ ;;
+ esac
+ if grep "^installed=no" $deplib > /dev/null; then
+ path="-L$absdir/$objdir"
+ else
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ if test "$absdir" != "$libdir"; then
+ $echo "$modename: warning: \`$deplib' seems to be moved" 1>&2
+ fi
+ path="-L$absdir"
+ fi
+ ;;
+ *) continue ;;
+ esac
+ case " $deplibs " in
+ *" $path "*) ;;
+ *) deplibs="$deplibs $path" ;;
+ esac
+ done
+ fi
+ fi
+ done
+ dependency_libs="$newdependency_libs"
+ if test $pass = dlpreopen; then
+ # Link the dlpreopened libraries before other libraries
+ deplibs="$deplibs $save_deplibs"
+ elif test $pass != dlopen; then
+ # Make sure lib_search_path contains only unique directories.
+ lib_search_path=
+ for dir in $new_lib_search_path; do
+ case "$lib_search_path " in
+ *" $dir "*) ;;
+ *) lib_search_path="$lib_search_path $dir" ;;
+ esac
+ done
+ lib_search_path="$lib_search_path $sys_lib_search_path"
+
+ if test "$linkmode,$pass" != "prog,link"; then
+ vars="deplibs"
+ else
+ vars="compile_deplibs finalize_deplibs"
+ fi
+ for var in $vars dependency_libs; do
+ # Make sure that $var contains only unique libraries
+ # and add them in reverse order
+ eval tmp_libs=\"\$$var\"
+ new_libs=
+ for deplib in $tmp_libs; do
+ case "$deplib" in
+ -L*) new_libs="$deplib $new_libs" ;;
+ *)
+ case " $specialdeplibs " in
+ *" $deplib "*) new_libs="$deplib $new_libs" ;;
+ *)
+ case " $new_libs " in
+ *" $deplib "*) ;;
+ *) new_libs="$deplib $new_libs" ;;
+ esac
+ ;;
+ esac
+ ;;
+ esac
+ done
+ tmp_libs=
+ for deplib in $new_libs; do
+ case "$deplib" in
+ -L*)
+ case " $tmp_libs " in
+ *" $deplib "*) ;;
+ *) tmp_libs="$tmp_libs $deplib" ;;
+ esac
+ ;;
+ *) tmp_libs="$tmp_libs $deplib" ;;
+ esac
+ done
+ eval $var=\"$tmp_libs\"
+ done
+ fi
+ done
+ if test $linkmode = prog; then
+ dlfiles="$newdlfiles"
+ dlprefiles="$newdlprefiles"
+ fi
+
+ case $linkmode in
+ oldlib)
+ if test -n "$deplibs"; then
+ $echo "$modename: warning: \`-l' and \`-L' are ignored for archives" 1>&2
+ fi
+
+ if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+ $echo "$modename: warning: \`-dlopen' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$rpath"; then
+ $echo "$modename: warning: \`-rpath' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$xrpath"; then
+ $echo "$modename: warning: \`-R' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
+ $echo "$modename: warning: \`-export-symbols' is ignored for archives" 1>&2
+ fi
+
+ # Now set the variables for building old libraries.
+ build_libtool_libs=no
+ oldlibs="$output"
+ objs="$objs$old_deplibs"
+ ;;
+
+ lib)
+ # Make sure we only generate libraries of the form `libNAME.la'.
+ case "$outputname" in
+ lib*)
+ name=`$echo "X$outputname" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
+ eval libname=\"$libname_spec\"
+ ;;
+ *)
+ if test "$module" = no; then
+ $echo "$modename: libtool library \`$output' must begin with \`lib'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+ if test "$need_lib_prefix" != no; then
+ # Add the "lib" prefix for modules if required
+ name=`$echo "X$outputname" | $Xsed -e 's/\.la$//'`
+ eval libname=\"$libname_spec\"
+ else
+ libname=`$echo "X$outputname" | $Xsed -e 's/\.la$//'`
+ fi
+ ;;
+ esac
+
+ if test -n "$objs"; then
+ if test "$deplibs_check_method" != pass_all; then
+ $echo "$modename: cannot build libtool library \`$output' from non-libtool objects on this host:$objs" 2>&1
+ exit 1
+ else
+ echo
+ echo "*** Warning: Linking the shared library $output against the non-libtool"
+ echo "*** objects $objs is not portable!"
+ libobjs="$libobjs $objs"
+ fi
+ fi
+
+ if test "$dlself" != no; then
+ $echo "$modename: warning: \`-dlopen self' is ignored for libtool libraries" 1>&2
+ fi
+
+ set dummy $rpath
+ if test $# -gt 2; then
+ $echo "$modename: warning: ignoring multiple \`-rpath's for a libtool library" 1>&2
+ fi
+ install_libdir="$2"
+
+ oldlibs=
+ if test -z "$rpath"; then
+ if test "$build_libtool_libs" = yes; then
+ # Building a libtool convenience library.
+ libext=al
+ oldlibs="$output_objdir/$libname.$libext $oldlibs"
+ build_libtool_libs=convenience
+ build_old_libs=yes
+ fi
+
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for convenience libraries" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for convenience libraries" 1>&2
+ fi
+ else
+
+ # Parse the version information argument.
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=':'
+ set dummy $vinfo 0 0 0
+ IFS="$save_ifs"
+
+ if test -n "$8"; then
+ $echo "$modename: too many parameters to \`-version-info'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ current="$2"
+ revision="$3"
+ age="$4"
+
+ # Check that each of the things are valid numbers.
+ case "$current" in
+ 0 | [1-9] | [1-9][0-9]*) ;;
+ *)
+ $echo "$modename: CURRENT \`$current' is not a nonnegative integer" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ ;;
+ esac
+
+ case "$revision" in
+ 0 | [1-9] | [1-9][0-9]*) ;;
+ *)
+ $echo "$modename: REVISION \`$revision' is not a nonnegative integer" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ ;;
+ esac
+
+ case "$age" in
+ 0 | [1-9] | [1-9][0-9]*) ;;
+ *)
+ $echo "$modename: AGE \`$age' is not a nonnegative integer" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ ;;
+ esac
+
+ if test $age -gt $current; then
+ $echo "$modename: AGE \`$age' is greater than the current interface number \`$current'" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ fi
+
+ # Calculate the version variables.
+ major=
+ versuffix=
+ verstring=
+ case "$version_type" in
+ none) ;;
+
+ irix)
+ major=`expr $current - $age + 1`
+ versuffix="$major.$revision"
+ verstring="sgi$major.$revision"
+
+ # Add in all the interfaces that we are compatible with.
+ loop=$revision
+ while test $loop != 0; do
+ iface=`expr $revision - $loop`
+ loop=`expr $loop - 1`
+ verstring="sgi$major.$iface:$verstring"
+ done
+ ;;
+
+ linux)
+ major=.`expr $current - $age`
+ versuffix="$major.$age.$revision"
+ ;;
+
+ osf)
+ major=`expr $current - $age`
+ versuffix=".$current.$age.$revision"
+ verstring="$current.$age.$revision"
+
+ # Add in all the interfaces that we are compatible with.
+ loop=$age
+ while test $loop != 0; do
+ iface=`expr $current - $loop`
+ loop=`expr $loop - 1`
+ verstring="$verstring:${iface}.0"
+ done
+
+ # Make executables depend on our current version.
+ verstring="$verstring:${current}.0"
+ ;;
+
+ sunos)
+ major=".$current"
+ versuffix=".$current.$revision"
+ ;;
+
+ freebsd-aout)
+ major=".$current"
+ versuffix=".$current.$revision";
+ ;;
+
+ freebsd-elf)
+ major=".$current"
+ versuffix=".$current";
+ ;;
+
+ windows)
+ # Like Linux, but with '-' rather than '.', since we only
+ # want one extension on Windows 95.
+ major=`expr $current - $age`
+ versuffix="-$major-$age-$revision"
+ ;;
+
+ *)
+ $echo "$modename: unknown library version type \`$version_type'" 1>&2
+ echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
+ exit 1
+ ;;
+ esac
+
+ # Clear the version info if we defaulted, and they specified a release.
+ if test -z "$vinfo" && test -n "$release"; then
+ major=
+ verstring="0.0"
+ if test "$need_version" = no; then
+ versuffix=
+ else
+ versuffix=".0.0"
+ fi
+ fi
+
+ # Remove version info from name if versioning should be avoided
+ if test "$avoid_version" = yes && test "$need_version" = no; then
+ major=
+ versuffix=
+ verstring=""
+ fi
+
+ # Check to see if the archive will have undefined symbols.
+ if test "$allow_undefined" = yes; then
+ if test "$allow_undefined_flag" = unsupported; then
+ $echo "$modename: warning: undefined symbols not allowed in $host shared libraries" 1>&2
+ build_libtool_libs=no
+ build_old_libs=yes
+ fi
+ else
+ # Don't allow undefined symbols.
+ allow_undefined_flag="$no_undefined_flag"
+ fi
+ fi
+
+ if test "$mode" != relink; then
+ # Remove our outputs.
+ $show "${rm}r $output_objdir/$outputname $output_objdir/$libname.* $output_objdir/${libname}${release}.*"
+ $run ${rm}r $output_objdir/$outputname $output_objdir/$libname.* $output_objdir/${libname}${release}.*
+ fi
+
+ # Now set the variables for building old libraries.
+ if test "$build_old_libs" = yes && test "$build_libtool_libs" != convenience ; then
+ oldlibs="$oldlibs $output_objdir/$libname.$libext"
+
+ # Transform .lo files to .o files.
+ oldobjs="$objs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}'$/d' -e "$lo2o" | $NL2SP`
+ fi
+
+ # Eliminate all temporary directories.
+ for path in $uninst_path; do
+ lib_search_path=`echo "$lib_search_path " | sed -e 's% $path % %g'`
+ deplibs=`echo "$deplibs " | sed -e 's% -L$path % %g'`
+ dependency_libs=`echo "$dependency_libs " | sed -e 's% -L$path % %g'`
+ done
+
+ if test -n "$xrpath"; then
+ # If the user specified any rpath flags, then add them.
+ temp_xrpath=
+ for libdir in $xrpath; do
+ temp_xrpath="$temp_xrpath -R$libdir"
+ case "$finalize_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_rpath="$finalize_rpath $libdir" ;;
+ esac
+ done
+ if test "$hardcode_into_libs" = no || test $build_old_libs = yes; then
+ dependency_libs="$temp_xrpath $dependency_libs"
+ fi
+ fi
+
+ # Make sure dlfiles contains only unique files that won't be dlpreopened
+ old_dlfiles="$dlfiles"
+ dlfiles=
+ for lib in $old_dlfiles; do
+ case " $dlprefiles $dlfiles " in
+ *" $lib "*) ;;
+ *) dlfiles="$dlfiles $lib" ;;
+ esac
+ done
+
+ # Make sure dlprefiles contains only unique files
+ old_dlprefiles="$dlprefiles"
+ dlprefiles=
+ for lib in $old_dlprefiles; do
+ case "$dlprefiles " in
+ *" $lib "*) ;;
+ *) dlprefiles="$dlprefiles $lib" ;;
+ esac
+ done
+
+ if test "$build_libtool_libs" = yes; then
+ if test -n "$rpath"; then
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2* | *-*-beos*)
+ # these systems don't actually have a c library (as such)!
+ ;;
+ *)
+ # Add libc to deplibs on all other systems.
+ deplibs="$deplibs -lc"
+ ;;
+ esac
+ fi
+
+ # Transform deplibs into only deplibs that can be linked in shared.
+ name_save=$name
+ libname_save=$libname
+ release_save=$release
+ versuffix_save=$versuffix
+ major_save=$major
+ # I'm not sure if I'm treating the release correctly. I think
+ # release should show up in the -l (ie -lgmp5) so we don't want to
+ # add it in twice. Is that correct?
+ release=""
+ versuffix=""
+ major=""
+ newdeplibs=
+ droppeddeps=no
+ case "$deplibs_check_method" in
+ pass_all)
+ # Don't check for shared/static. Everything works.
+ # This might be a little naive. We might want to check
+ # whether the library exists or not. But this is on
+ # osf3 & osf4 and I'm not really sure... Just
+ # implementing what was already the behaviour.
+ newdeplibs=$deplibs
+ ;;
+ test_compile)
+ # This code stresses the "libraries are programs" paradigm to its
+ # limits. Maybe even breaks it. We compile a program, linking it
+ # against the deplibs as a proxy for the library. Then we can check
+ # whether they linked in statically or dynamically with ldd.
+ $rm conftest.c
+ cat > conftest.c <<EOF
+ int main() { return 0; }
+EOF
+ $rm conftest
+ $CC -o conftest conftest.c $deplibs
+ if test $? -eq 0 ; then
+ ldd_output=`ldd conftest`
+ for i in $deplibs; do
+ name="`expr $i : '-l\(.*\)'`"
+ # If $name is empty we are operating on a -L argument.
+ if test "$name" != "" ; then
+ libname=`eval \\$echo \"$libname_spec\"`
+ deplib_matches=`eval \\$echo \"$library_names_spec\"`
+ set dummy $deplib_matches
+ deplib_match=$2
+ if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+ newdeplibs="$newdeplibs $i"
+ else
+ droppeddeps=yes
+ echo
+ echo "*** Warning: This library needs some functionality provided by $i."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ fi
+ else
+ newdeplibs="$newdeplibs $i"
+ fi
+ done
+ else
+ # Error occured in the first compile. Let's try to salvage the situation:
+ # Compile a seperate program for each library.
+ for i in $deplibs; do
+ name="`expr $i : '-l\(.*\)'`"
+ # If $name is empty we are operating on a -L argument.
+ if test "$name" != "" ; then
+ $rm conftest
+ $CC -o conftest conftest.c $i
+ # Did it work?
+ if test $? -eq 0 ; then
+ ldd_output=`ldd conftest`
+ libname=`eval \\$echo \"$libname_spec\"`
+ deplib_matches=`eval \\$echo \"$library_names_spec\"`
+ set dummy $deplib_matches
+ deplib_match=$2
+ if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+ newdeplibs="$newdeplibs $i"
+ else
+ droppeddeps=yes
+ echo
+ echo "*** Warning: This library needs some functionality provided by $i."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ fi
+ else
+ droppeddeps=yes
+ echo
+ echo "*** Warning! Library $i is needed by this library but I was not able to"
+ echo "*** make it link in! You will probably need to install it or some"
+ echo "*** library that it depends on before this library will be fully"
+ echo "*** functional. Installing it before continuing would be even better."
+ fi
+ else
+ newdeplibs="$newdeplibs $i"
+ fi
+ done
+ fi
+ ;;
+ file_magic*)
+ set dummy $deplibs_check_method
+ file_magic_regex=`expr "$deplibs_check_method" : "$2 \(.*\)"`
+ for a_deplib in $deplibs; do
+ name="`expr $a_deplib : '-l\(.*\)'`"
+ # If $name is empty we are operating on a -L argument.
+ if test "$name" != "" ; then
+ libname=`eval \\$echo \"$libname_spec\"`
+ for i in $lib_search_path; do
+ potential_libs=`ls $i/$libname[.-]* 2>/dev/null`
+ for potent_lib in $potential_libs; do
+ # Follow soft links.
+ if ls -lLd "$potent_lib" 2>/dev/null \
+ | grep " -> " >/dev/null; then
+ continue
+ fi
+ # The statement above tries to avoid entering an
+ # endless loop below, in case of cyclic links.
+ # We might still enter an endless loop, since a link
+ # loop can be closed while we follow links,
+ # but so what?
+ potlib="$potent_lib"
+ while test -h "$potlib" 2>/dev/null; do
+ potliblink=`ls -ld $potlib | sed 's/.* -> //'`
+ case "$potliblink" in
+ [\\/]* | [A-Za-z]:[\\/]*) potlib="$potliblink";;
+ *) potlib=`$echo "X$potlib" | $Xsed -e 's,[^/]*$,,'`"$potliblink";;
+ esac
+ done
+ if eval $file_magic_cmd \"\$potlib\" 2>/dev/null \
+ | sed 10q \
+ | egrep "$file_magic_regex" > /dev/null; then
+ newdeplibs="$newdeplibs $a_deplib"
+ a_deplib=""
+ break 2
+ fi
+ done
+ done
+ if test -n "$a_deplib" ; then
+ droppeddeps=yes
+ echo
+ echo "*** Warning: This library needs some functionality provided by $a_deplib."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ fi
+ else
+ # Add a -L argument.
+ newdeplibs="$newdeplibs $a_deplib"
+ fi
+ done # Gone through all deplibs.
+ ;;
+ none | unknown | *)
+ newdeplibs=""
+ if $echo "X $deplibs" | $Xsed -e 's/ -lc$//' \
+ -e 's/ -[LR][^ ]*//g' -e 's/[ ]//g' |
+ grep . >/dev/null; then
+ echo
+ if test "X$deplibs_check_method" = "Xnone"; then
+ echo "*** Warning: inter-library dependencies are not supported in this platform."
+ else
+ echo "*** Warning: inter-library dependencies are not known to be supported."
+ fi
+ echo "*** All declared inter-library dependencies are being dropped."
+ droppeddeps=yes
+ fi
+ ;;
+ esac
+ versuffix=$versuffix_save
+ major=$major_save
+ release=$release_save
+ libname=$libname_save
+ name=$name_save
+
+ if test "$droppeddeps" = yes; then
+ if test "$module" = yes; then
+ echo
+ echo "*** Warning: libtool could not satisfy all declared inter-library"
+ echo "*** dependencies of module $libname. Therefore, libtool will create"
+ echo "*** a static module, that should work as long as the dlopening"
+ echo "*** application is linked with the -dlopen flag."
+ if test -z "$global_symbol_pipe"; then
+ echo
+ echo "*** However, this would only work if libtool was able to extract symbol"
+ echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+ echo "*** not find such a program. So, this module is probably useless."
+ echo "*** \`nm' from GNU binutils and a full rebuild may help."
+ fi
+ if test "$build_old_libs" = no; then
+ oldlibs="$output_objdir/$libname.$libext"
+ build_libtool_libs=module
+ build_old_libs=yes
+ else
+ build_libtool_libs=no
+ fi
+ else
+ echo "*** The inter-library dependencies that have been dropped here will be"
+ echo "*** automatically added whenever a program is linked with this library"
+ echo "*** or is declared to -dlopen it."
+ fi
+ fi
+ # Done checking deplibs!
+ deplibs=$newdeplibs
+ fi
+
+ # All the library-specific variables (install_libdir is set above).
+ library_names=
+ old_library=
+ dlname=
+
+ # Test again, we may have decided not to build it any more
+ if test "$build_libtool_libs" = yes; then
+ if test "$hardcode_into_libs" != no; then
+ # Hardcode the library paths
+ hardcode_libdirs=
+ dep_rpath=
+ rpath="$finalize_rpath"
+ test "$mode" != relink && rpath="$compile_rpath$rpath"
+ for libdir in $rpath; do
+ if test -n "$hardcode_libdir_flag_spec"; then
+ if test -n "$hardcode_libdir_separator"; then
+ if test -z "$hardcode_libdirs"; then
+ hardcode_libdirs="$libdir"
+ else
+ # Just accumulate the unique libdirs.
+ case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
+ *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+ ;;
+ *)
+ hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+ ;;
+ esac
+ fi
+ else
+ eval flag=\"$hardcode_libdir_flag_spec\"
+ dep_rpath="$dep_rpath $flag"
+ fi
+ elif test -n "$runpath_var"; then
+ case "$perm_rpath " in
+ *" $libdir "*) ;;
+ *) perm_rpath="$perm_rpath $libdir" ;;
+ esac
+ fi
+ done
+ # Substitute the hardcoded libdirs into the rpath.
+ if test -n "$hardcode_libdir_separator" &&
+ test -n "$hardcode_libdirs"; then
+ libdir="$hardcode_libdirs"
+ eval dep_rpath=\"$hardcode_libdir_flag_spec\"
+ fi
+ if test -n "$runpath_var" && test -n "$perm_rpath"; then
+ # We should set the runpath_var.
+ rpath=
+ for dir in $perm_rpath; do
+ rpath="$rpath$dir:"
+ done
+ eval "$runpath_var='$rpath\$$runpath_var'; export $runpath_var"
+ fi
+ test -n "$dep_rpath" && deplibs="$dep_rpath $deplibs"
+ fi
+
+ shlibpath="$finalize_shlibpath"
+ test "$mode" != relink && shlibpath="$compile_shlibpath$shlibpath"
+ if test -n "$shlibpath"; then
+ eval "$shlibpath_var='$shlibpath\$$shlibpath_var'; export $shlibpath_var"
+ fi
+
+ # Get the real and link names of the library.
+ eval library_names=\"$library_names_spec\"
+ set dummy $library_names
+ realname="$2"
+ shift; shift
+
+ if test -n "$soname_spec"; then
+ eval soname=\"$soname_spec\"
+ else
+ soname="$realname"
+ fi
+
+ lib="$output_objdir/$realname"
+ for link
+ do
+ linknames="$linknames $link"
+ done
+
+ # Ensure that we have .o objects for linkers which dislike .lo
+ # (e.g. aix) in case we are running --disable-static
+ for obj in $libobjs; do
+ xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$obj"; then
+ xdir="."
+ else
+ xdir="$xdir"
+ fi
+ baseobj=`$echo "X$obj" | $Xsed -e 's%^.*/%%'`
+ oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"`
+ if test ! -f $xdir/$oldobj; then
+ $show "(cd $xdir && ${LN_S} $baseobj $oldobj)"
+ $run eval '(cd $xdir && ${LN_S} $baseobj $oldobj)' || exit $?
+ fi
+ done
+
+ # Use standard objects if they are pic
+ test -z "$pic_flag" && libobjs=`$echo "X$libobjs" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+
+ # Prepare the list of exported symbols
+ if test -z "$export_symbols"; then
+ if test "$always_export_symbols" = yes || test -n "$export_symbols_regex"; then
+ $show "generating symbol list for \`$libname.la'"
+ export_symbols="$output_objdir/$libname.exp"
+ $run $rm $export_symbols
+ eval cmds=\"$export_symbols_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ if test -n "$export_symbols_regex"; then
+ $show "egrep -e \"$export_symbols_regex\" \"$export_symbols\" > \"${export_symbols}T\""
+ $run eval 'egrep -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"'
+ $show "$mv \"${export_symbols}T\" \"$export_symbols\""
+ $run eval '$mv "${export_symbols}T" "$export_symbols"'
+ fi
+ fi
+ fi
+
+ if test -n "$export_symbols" && test -n "$include_expsyms"; then
+ $run eval '$echo "X$include_expsyms" | $SP2NL >> "$export_symbols"'
+ fi
+
+ if test -n "$convenience"; then
+ if test -n "$whole_archive_flag_spec"; then
+ eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
+ else
+ gentop="$output_objdir/${outputname}x"
+ $show "${rm}r $gentop"
+ $run ${rm}r "$gentop"
+ $show "mkdir $gentop"
+ $run mkdir "$gentop"
+ status=$?
+ if test $status -ne 0 && test ! -d "$gentop"; then
+ exit $status
+ fi
+ generated="$generated $gentop"
+
+ for xlib in $convenience; do
+ # Extract the objects.
+ case "$xlib" in
+ [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+ *) xabs=`pwd`"/$xlib" ;;
+ esac
+ xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+ xdir="$gentop/$xlib"
+
+ $show "${rm}r $xdir"
+ $run ${rm}r "$xdir"
+ $show "mkdir $xdir"
+ $run mkdir "$xdir"
+ status=$?
+ if test $status -ne 0 && test ! -d "$xdir"; then
+ exit $status
+ fi
+ $show "(cd $xdir && $AR x $xabs)"
+ $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+ libobjs="$libobjs "`find $xdir -name \*.o -print -o -name \*.lo -print | $NL2SP`
+ done
+ fi
+ fi
+
+ if test "$thread_safe" = yes && test -n "$thread_safe_flag_spec"; then
+ eval flag=\"$thread_safe_flag_spec\"
+ linker_flags="$linker_flags $flag"
+ fi
+
+ # Make a backup of the uninstalled library when relinking
+ if test "$mode" = relink && test "$hardcode_into_libs" = all; then
+ $run eval '(cd $output_objdir && $rm ${realname}U && $mv $realname ${realname}U)' || exit $?
+ fi
+
+ # Do each of the archive commands.
+ if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then
+ eval cmds=\"$archive_expsym_cmds\"
+ else
+ eval cmds=\"$archive_cmds\"
+ fi
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+
+ # Restore the uninstalled library and exit
+ if test "$mode" = relink && test "$hardcode_into_libs" = all; then
+ $run eval '(cd $output_objdir && $rm ${realname}T && $mv $realname ${realname}T && $mv "$realname"U $realname)' || exit $?
+ exit 0
+ fi
+
+ # Create links to the real library.
+ for linkname in $linknames; do
+ if test "$realname" != "$linkname"; then
+ $show "(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)"
+ $run eval '(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)' || exit $?
+ fi
+ done
+
+ # If -module or -export-dynamic was specified, set the dlname.
+ if test "$module" = yes || test "$export_dynamic" = yes; then
+ # On all known operating systems, these are identical.
+ dlname="$soname"
+ fi
+ fi
+ ;;
+
+ obj)
+ if test -n "$deplibs"; then
+ $echo "$modename: warning: \`-l' and \`-L' are ignored for objects" 1>&2
+ fi
+
+ if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+ $echo "$modename: warning: \`-dlopen' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$rpath"; then
+ $echo "$modename: warning: \`-rpath' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$xrpath"; then
+ $echo "$modename: warning: \`-R' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for objects" 1>&2
+ fi
+
+ case "$output" in
+ *.lo)
+ if test -n "$objs$old_deplibs"; then
+ $echo "$modename: cannot build library object \`$output' from non-libtool objects" 1>&2
+ exit 1
+ fi
+ libobj="$output"
+ obj=`$echo "X$output" | $Xsed -e "$lo2o"`
+ ;;
+ *)
+ libobj=
+ obj="$output"
+ ;;
+ esac
+
+ # Delete the old objects.
+ $run $rm $obj $libobj
+
+ # Objects from convenience libraries. This assumes
+ # single-version convenience libraries. Whenever we create
+ # different ones for PIC/non-PIC, this we'll have to duplicate
+ # the extraction.
+ reload_conv_objs=
+ gentop=
+ # reload_cmds runs $LD directly, so let us get rid of
+ # -Wl from whole_archive_flag_spec
+ wl=
+
+ if test -n "$convenience"; then
+ if test -n "$whole_archive_flag_spec"; then
+ eval reload_conv_objs=\"\$reload_objs $whole_archive_flag_spec\"
+ else
+ gentop="$output_objdir/${obj}x"
+ $show "${rm}r $gentop"
+ $run ${rm}r "$gentop"
+ $show "mkdir $gentop"
+ $run mkdir "$gentop"
+ status=$?
+ if test $status -ne 0 && test ! -d "$gentop"; then
+ exit $status
+ fi
+ generated="$generated $gentop"
+
+ for xlib in $convenience; do
+ # Extract the objects.
+ case "$xlib" in
+ [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+ *) xabs=`pwd`"/$xlib" ;;
+ esac
+ xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+ xdir="$gentop/$xlib"
+
+ $show "${rm}r $xdir"
+ $run ${rm}r "$xdir"
+ $show "mkdir $xdir"
+ $run mkdir "$xdir"
+ status=$?
+ if test $status -ne 0 && test ! -d "$xdir"; then
+ exit $status
+ fi
+ $show "(cd $xdir && $AR x $xabs)"
+ $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+ reload_conv_objs="$reload_objs "`find $xdir -name \*.o -print -o -name \*.lo -print | $NL2SP`
+ done
+ fi
+ fi
+
+ # Create the old-style object.
+ reload_objs="$objs$old_deplibs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}$'/d' -e '/\.lib$/d' -e "$lo2o" | $NL2SP`" $reload_conv_objs" ### testsuite: skip nested quoting test
+
+ output="$obj"
+ eval cmds=\"$reload_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+
+ # Exit if we aren't doing a library object file.
+ if test -z "$libobj"; then
+ if test -n "$gentop"; then
+ $show "${rm}r $gentop"
+ $run ${rm}r $gentop
+ fi
+
+ exit 0
+ fi
+
+ if test "$build_libtool_libs" != yes; then
+ if test -n "$gentop"; then
+ $show "${rm}r $gentop"
+ $run ${rm}r $gentop
+ fi
+
+ # Create an invalid libtool object if no PIC, so that we don't
+ # accidentally link it into a program.
+ $show "echo timestamp > $libobj"
+ $run eval "echo timestamp > $libobj" || exit $?
+ exit 0
+ fi
+
+ if test -n "$pic_flag" || test "$pic_mode" != default; then
+ # Only do commands if we really have different PIC objects.
+ reload_objs="$libobjs $reload_conv_objs"
+ output="$libobj"
+ eval cmds=\"$reload_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ else
+ # Just create a symlink.
+ $show $rm $libobj
+ $run $rm $libobj
+ xdir=`$echo "X$libobj" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$libobj"; then
+ xdir="."
+ else
+ xdir="$xdir"
+ fi
+ baseobj=`$echo "X$libobj" | $Xsed -e 's%^.*/%%'`
+ oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"`
+ $show "(cd $xdir && $LN_S $oldobj $baseobj)"
+ $run eval '(cd $xdir && $LN_S $oldobj $baseobj)' || exit $?
+ fi
+
+ if test -n "$gentop"; then
+ $show "${rm}r $gentop"
+ $run ${rm}r $gentop
+ fi
+
+ exit 0
+ ;;
+
+ prog)
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for programs" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for programs" 1>&2
+ fi
+
+ if test "$preload" = yes; then
+ if test "$dlopen_support" = unknown && test "$dlopen_self" = unknown &&
+ test "$dlopen_self_static" = unknown; then
+ $echo "$modename: warning: \`AC_LIBTOOL_DLOPEN' not used. Assuming no dlopen support."
+ fi
+ fi
+
+ compile_command="$compile_command $compile_deplibs"
+ finalize_command="$finalize_command $finalize_deplibs"
+
+ if test -n "$rpath$xrpath"; then
+ # If the user specified any rpath flags, then add them.
+ for libdir in $rpath $xrpath; do
+ # This is the magic to use -rpath.
+ case "$finalize_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_rpath="$finalize_rpath $libdir" ;;
+ esac
+ done
+ fi
+
+ # Now hardcode the library paths
+ rpath=
+ hardcode_libdirs=
+ for libdir in $compile_rpath $finalize_rpath; do
+ if test -n "$hardcode_libdir_flag_spec"; then
+ if test -n "$hardcode_libdir_separator"; then
+ if test -z "$hardcode_libdirs"; then
+ hardcode_libdirs="$libdir"
+ else
+ # Just accumulate the unique libdirs.
+ case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
+ *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+ ;;
+ *)
+ hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+ ;;
+ esac
+ fi
+ else
+ eval flag=\"$hardcode_libdir_flag_spec\"
+ rpath="$rpath $flag"
+ fi
+ elif test -n "$runpath_var"; then
+ case "$perm_rpath " in
+ *" $libdir "*) ;;
+ *) perm_rpath="$perm_rpath $libdir" ;;
+ esac
+ fi
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2*)
+ case ":$dllsearchpath:" in
+ *":$libdir:"*) ;;
+ *) dllsearchpath="$dllsearchpath:$libdir";;
+ esac
+ ;;
+ esac
+ done
+ # Substitute the hardcoded libdirs into the rpath.
+ if test -n "$hardcode_libdir_separator" &&
+ test -n "$hardcode_libdirs"; then
+ libdir="$hardcode_libdirs"
+ eval rpath=\" $hardcode_libdir_flag_spec\"
+ fi
+ compile_rpath="$rpath"
+
+ rpath=
+ hardcode_libdirs=
+ for libdir in $finalize_rpath; do
+ if test -n "$hardcode_libdir_flag_spec"; then
+ if test -n "$hardcode_libdir_separator"; then
+ if test -z "$hardcode_libdirs"; then
+ hardcode_libdirs="$libdir"
+ else
+ # Just accumulate the unique libdirs.
+ case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
+ *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+ ;;
+ *)
+ hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+ ;;
+ esac
+ fi
+ else
+ eval flag=\"$hardcode_libdir_flag_spec\"
+ rpath="$rpath $flag"
+ fi
+ elif test -n "$runpath_var"; then
+ case "$finalize_perm_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_perm_rpath="$finalize_perm_rpath $libdir" ;;
+ esac
+ fi
+ done
+ # Substitute the hardcoded libdirs into the rpath.
+ if test -n "$hardcode_libdir_separator" &&
+ test -n "$hardcode_libdirs"; then
+ libdir="$hardcode_libdirs"
+ eval rpath=\" $hardcode_libdir_flag_spec\"
+ fi
+ finalize_rpath="$rpath"
+
+ if test -n "$libobjs" && test "$build_old_libs" = yes; then
+ # Transform all the library objects into standard objects.
+ compile_command=`$echo "X$compile_command" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+ finalize_command=`$echo "X$finalize_command" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+ fi
+
+ dlsyms=
+ if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+ if test -n "$NM" && test -n "$global_symbol_pipe"; then
+ dlsyms="${outputname}S.c"
+ else
+ $echo "$modename: not configured to extract global symbols from dlpreopened files" 1>&2
+ fi
+ fi
+
+ if test -n "$dlsyms"; then
+ case "$dlsyms" in
+ "") ;;
+ *.c)
+ # Discover the nlist of each of the dlfiles.
+ nlist="$output_objdir/${outputname}.nm"
+
+ $show "$rm $nlist ${nlist}S ${nlist}T"
+ $run $rm "$nlist" "${nlist}S" "${nlist}T"
+
+ # Parse the name list into a source file.
+ $show "creating $output_objdir/$dlsyms"
+
+ test -z "$run" && $echo > "$output_objdir/$dlsyms" "\
+/* $dlsyms - symbol resolution table for \`$outputname' dlsym emulation. */
+/* Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP */
+
+#ifdef __cplusplus
+extern \"C\" {
+#endif
+
+/* Prevent the only kind of declaration conflicts we can make. */
+#define lt_preloaded_symbols some_other_symbol
+
+/* External symbol declarations for the compiler. */\
+"
+
+ if test "$dlself" = yes; then
+ $show "generating symbol list for \`$output'"
+
+ test -z "$run" && $echo ': @PROGRAM@ ' > "$nlist"
+
+ # Add our own program objects to the symbol list.
+ progfiles=`$echo "X$objs$old_deplibs" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+ for arg in $progfiles; do
+ $show "extracting global C symbols from \`$arg'"
+ $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
+ done
+
+ if test -n "$exclude_expsyms"; then
+ $run eval 'egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T'
+ $run eval '$mv "$nlist"T "$nlist"'
+ fi
+
+ if test -n "$export_symbols_regex"; then
+ $run eval 'egrep -e "$export_symbols_regex" "$nlist" > "$nlist"T'
+ $run eval '$mv "$nlist"T "$nlist"'
+ fi
+
+ # Prepare the list of exported symbols
+ if test -z "$export_symbols"; then
+ export_symbols="$output_objdir/$output.exp"
+ $run $rm $export_symbols
+ $run eval "sed -n -e '/^: @PROGRAM@$/d' -e 's/^.* \(.*\)$/\1/p' "'< "$nlist" > "$export_symbols"'
+ else
+ $run eval "sed -e 's/\([][.*^$]\)/\\\1/g' -e 's/^/ /' -e 's/$/$/'"' < "$export_symbols" > "$output_objdir/$output.exp"'
+ $run eval 'grep -f "$output_objdir/$output.exp" < "$nlist" > "$nlist"T'
+ $run eval 'mv "$nlist"T "$nlist"'
+ fi
+ fi
+
+ for arg in $dlprefiles; do
+ $show "extracting global C symbols from \`$arg'"
+ name=`echo "$arg" | sed -e 's%^.*/%%'`
+ $run eval 'echo ": $name " >> "$nlist"'
+ $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
+ done
+
+ if test -z "$run"; then
+ # Make sure we have at least an empty file.
+ test -f "$nlist" || : > "$nlist"
+
+ if test -n "$exclude_expsyms"; then
+ egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T
+ $mv "$nlist"T "$nlist"
+ fi
+
+ # Try sorting and uniquifying the output.
+ if grep -v "^: " < "$nlist" | sort +2 | uniq > "$nlist"S; then
+ :
+ else
+ grep -v "^: " < "$nlist" > "$nlist"S
+ fi
+
+ if test -f "$nlist"S; then
+ eval "$global_symbol_to_cdecl"' < "$nlist"S >> "$output_objdir/$dlsyms"'
+ else
+ echo '/* NONE */' >> "$output_objdir/$dlsyms"
+ fi
+
+ $echo >> "$output_objdir/$dlsyms" "\
+
+#undef lt_preloaded_symbols
+
+#if defined (__STDC__) && __STDC__
+# define lt_ptr_t void *
+#else
+# define lt_ptr_t char *
+# define const
+#endif
+
+/* The mapping between symbol names and symbols. */
+const struct {
+ const char *name;
+ lt_ptr_t address;
+}
+lt_preloaded_symbols[] =
+{\
+"
+
+ sed -n -e 's/^: \([^ ]*\) $/ {\"\1\", (lt_ptr_t) 0},/p' \
+ -e 's/^. \([^ ]*\) \([^ ]*\)$/ {"\2", (lt_ptr_t) \&\2},/p' \
+ < "$nlist" >> "$output_objdir/$dlsyms"
+
+ $echo >> "$output_objdir/$dlsyms" "\
+ {0, (lt_ptr_t) 0}
+};
+
+/* This works around a problem in FreeBSD linker */
+#ifdef FREEBSD_WORKAROUND
+static const void *lt_preloaded_setup() {
+ return lt_preloaded_symbols;
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif\
+"
+ fi
+
+ pic_flag_for_symtable=
+ case "$host" in
+ # compiling the symbol table file with pic_flag works around
+ # a FreeBSD bug that causes programs to crash when -lm is
+ # linked before any other PIC object. But we must not use
+ # pic_flag when linking with -static. The problem exists in
+ # FreeBSD 2.2.6 and is fixed in FreeBSD 3.1.
+ *-*-freebsd2*|*-*-freebsd3.0*|*-*-freebsdelf3.0*)
+ case "$compile_command " in
+ *" -static "*) ;;
+ *) pic_flag_for_symtable=" $pic_flag -DPIC -DFREEBSD_WORKAROUND";;
+ esac;;
+ *-*-hpux*)
+ case "$compile_command " in
+ *" -static "*) ;;
+ *) pic_flag_for_symtable=" $pic_flag -DPIC";;
+ esac
+ esac
+
+ # Now compile the dynamic symbol file.
+ $show "(cd $output_objdir && $CC -c$no_builtin_flag$pic_flag_for_symtable \"$dlsyms\")"
+ $run eval '(cd $output_objdir && $CC -c$no_builtin_flag$pic_flag_for_symtable "$dlsyms")' || exit $?
+
+ # Clean up the generated files.
+ $show "$rm $output_objdir/$dlsyms $nlist ${nlist}S ${nlist}T"
+ $run $rm "$output_objdir/$dlsyms" "$nlist" "${nlist}S" "${nlist}T"
+
+ # Transform the symbol file into the correct name.
+ compile_command=`$echo "X$compile_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"`
+ finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"`
+ ;;
+ *)
+ $echo "$modename: unknown suffix for \`$dlsyms'" 1>&2
+ exit 1
+ ;;
+ esac
+ else
+ # We keep going just in case the user didn't refer to
+ # lt_preloaded_symbols. The linker will fail if global_symbol_pipe
+ # really was required.
+
+ # Nullify the symbol file.
+ compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
+ finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
+ fi
+
+ if test -z "$link_against_libtool_libs" || test "$build_libtool_libs" != yes; then
+ # Replace the output file specification.
+ compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
+ link_command="$compile_command$compile_rpath"
+
+ # We have no uninstalled library dependencies, so finalize right now.
+ $show "$link_command"
+ $run eval "$link_command"
+ status=$?
+
+ # Delete the generated files.
+ if test -n "$dlsyms"; then
+ $show "$rm $output_objdir/${outputname}S.${objext}"
+ $run $rm "$output_objdir/${outputname}S.${objext}"
+ fi
+
+ exit $status
+ fi
+
+ if test -n "$shlibpath_var"; then
+ # We should set the shlibpath_var
+ rpath=
+ for dir in $temp_rpath; do
+ case "$dir" in
+ [\\/]* | [A-Za-z]:[\\/]*)
+ # Absolute path.
+ rpath="$rpath$dir:"
+ ;;
+ *)
+ # Relative path: add a thisdir entry.
+ rpath="$rpath\$thisdir/$dir:"
+ ;;
+ esac
+ done
+ temp_rpath="$rpath"
+ fi
+
+ if test -n "$compile_shlibpath$finalize_shlibpath"; then
+ compile_command="$shlibpath_var=\"$compile_shlibpath$finalize_shlibpath\$$shlibpath_var\" $compile_command"
+ fi
+ if test -n "$finalize_shlibpath"; then
+ finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command"
+ fi
+
+ compile_var=
+ finalize_var=
+ if test -n "$runpath_var"; then
+ if test -n "$perm_rpath"; then
+ # We should set the runpath_var.
+ rpath=
+ for dir in $perm_rpath; do
+ rpath="$rpath$dir:"
+ done
+ compile_var="$runpath_var=\"$rpath\$$runpath_var\" "
+ fi
+ if test -n "$finalize_perm_rpath"; then
+ # We should set the runpath_var.
+ rpath=
+ for dir in $finalize_perm_rpath; do
+ rpath="$rpath$dir:"
+ done
+ finalize_var="$runpath_var=\"$rpath\$$runpath_var\" "
+ fi
+ fi
+
+ if test "$no_install" = yes; then
+ # We don't need to create a wrapper script.
+ link_command="$compile_var$compile_command$compile_rpath"
+ # Replace the output file specification.
+ link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
+ # Delete the old output file.
+ $run $rm $output
+ # Link the executable and exit
+ $show "$link_command"
+ $run eval "$link_command" || exit $?
+ exit 0
+ fi
+
+ if test "$hardcode_action" = relink || test "$hardcode_into_libs" = all; then
+ # Fast installation is not supported
+ link_command="$compile_var$compile_command$compile_rpath"
+ relink_command="$finalize_var$finalize_command$finalize_rpath"
+
+ $echo "$modename: warning: this platform does not like uninstalled shared libraries" 1>&2
+ $echo "$modename: \`$output' will be relinked during installation" 1>&2
+ else
+ if test "$fast_install" != no; then
+ link_command="$finalize_var$compile_command$finalize_rpath"
+ if test "$fast_install" = yes; then
+ relink_command=`$echo "X$compile_var$compile_command$compile_rpath" | $Xsed -e 's%@OUTPUT@%\$progdir/\$file%g'`
+ else
+ # fast_install is set to needless
+ relink_command=
+ fi
+ else
+ link_command="$compile_var$compile_command$compile_rpath"
+ relink_command="$finalize_var$finalize_command$finalize_rpath"
+ fi
+ fi
+
+ # Replace the output file specification.
+ link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output_objdir/$outputname"'%g'`
+
+ # Delete the old output files.
+ $run $rm $output $output_objdir/$outputname $output_objdir/lt-$outputname
+
+ $show "$link_command"
+ $run eval "$link_command" || exit $?
+
+ # Now create the wrapper script.
+ $show "creating $output"
+
+ # Quote the relink command for shipping.
+ if test -n "$relink_command"; then
+ relink_command="cd `pwd`; $relink_command"
+ relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"`
+ fi
+
+ # Quote $echo for shipping.
+ if test "X$echo" = "X$SHELL $0 --fallback-echo"; then
+ case "$0" in
+ [\\/]* | [A-Za-z]:[\\/]*) qecho="$SHELL $0 --fallback-echo";;
+ *) qecho="$SHELL `pwd`/$0 --fallback-echo";;
+ esac
+ qecho=`$echo "X$qecho" | $Xsed -e "$sed_quote_subst"`
+ else
+ qecho=`$echo "X$echo" | $Xsed -e "$sed_quote_subst"`
+ fi
+
+ # Only actually do things if our run command is non-null.
+ if test -z "$run"; then
+ # win32 will think the script is a binary if it has
+ # a .exe suffix, so we strip it off here.
+ case $output in
+ *.exe) output=`echo $output|sed 's,.exe$,,'` ;;
+ esac
+ $rm $output
+ trap "$rm $output; exit 1" 1 2 15
+
+ $echo > $output "\
+#! $SHELL
+
+# $output - temporary wrapper script for $objdir/$outputname
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
+#
+# The $output program cannot be directly executed until all the libtool
+# libraries that it depends on are installed.
+#
+# This wrapper script should never be moved out of the build directory.
+# If it is, it will not operate correctly.
+
+# Sed substitution that helps us do robust quoting. It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e 1s/^X//'
+sed_quote_subst='$sed_quote_subst'
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test \"\${CDPATH+set}\" = set; then CDPATH=:; export CDPATH; fi
+
+relink_command=\"$relink_command\"
+
+# This environment variable determines our operation mode.
+if test \"\$libtool_install_magic\" = \"$magic\"; then
+ # install mode needs the following variable:
+ link_against_libtool_libs='$link_against_libtool_libs'
+else
+ # When we are sourced in execute mode, \$file and \$echo are already set.
+ if test \"\$libtool_execute_magic\" != \"$magic\"; then
+ echo=\"$qecho\"
+ file=\"\$0\"
+ # Make sure echo works.
+ if test \"X\$1\" = X--no-reexec; then
+ # Discard the --no-reexec flag, and continue.
+ shift
+ elif test \"X\`(\$echo '\t') 2>/dev/null\`\" = 'X\t'; then
+ # Yippee, \$echo works!
+ :
+ else
+ # Restart under the correct shell, and then maybe \$echo will work.
+ exec $SHELL \"\$0\" --no-reexec \${1+\"\$@\"}
+ fi
+ fi\
+"
+ $echo >> $output "\
+
+ # Find the directory that this script lives in.
+ thisdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*$%%'\`
+ test \"x\$thisdir\" = \"x\$file\" && thisdir=.
+
+ # Follow symbolic links until we get to the real thisdir.
+ file=\`ls -ld \"\$file\" | sed -n 's/.*-> //p'\`
+ while test -n \"\$file\"; do
+ destdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*\$%%'\`
+
+ # If there was a directory component, then change thisdir.
+ if test \"x\$destdir\" != \"x\$file\"; then
+ case \"\$destdir\" in
+ [\\/]* | [A-Za-z]:[\\/]*) thisdir=\"\$destdir\" ;;
+ *) thisdir=\"\$thisdir/\$destdir\" ;;
+ esac
+ fi
+
+ file=\`\$echo \"X\$file\" | \$Xsed -e 's%^.*/%%'\`
+ file=\`ls -ld \"\$thisdir/\$file\" | sed -n 's/.*-> //p'\`
+ done
+
+ # Try to get the absolute directory name.
+ absdir=\`cd \"\$thisdir\" && pwd\`
+ test -n \"\$absdir\" && thisdir=\"\$absdir\"
+"
+
+ if test "$fast_install" = yes; then
+ echo >> $output "\
+ program=lt-'$outputname'
+ progdir=\"\$thisdir/$objdir\"
+
+ if test ! -f \"\$progdir/\$program\" || \\
+ { file=\`ls -1dt \"\$progdir/\$program\" \"\$progdir/../\$program\" 2>/dev/null | sed 1q\`; \\
+ test \"X\$file\" != \"X\$progdir/\$program\"; }; then
+
+ file=\"\$\$-\$program\"
+
+ if test ! -d \"\$progdir\"; then
+ $mkdir \"\$progdir\"
+ else
+ $rm \"\$progdir/\$file\"
+ fi"
+
+ echo >> $output "\
+
+ # relink executable if necessary
+ if test -n \"\$relink_command\"; then
+ if (eval \$relink_command); then :
+ else
+ $rm \"\$progdir/\$file\"
+ exit 1
+ fi
+ fi
+
+ $mv \"\$progdir/\$file\" \"\$progdir/\$program\" 2>/dev/null ||
+ { $rm \"\$progdir/\$program\";
+ $mv \"\$progdir/\$file\" \"\$progdir/\$program\"; }
+ $rm \"\$progdir/\$file\"
+ fi"
+ else
+ echo >> $output "\
+ program='$outputname'
+ progdir=\"\$thisdir/$objdir\"
+"
+ fi
+
+ echo >> $output "\
+
+ if test -f \"\$progdir/\$program\"; then"
+
+ # Export our shlibpath_var if we have one.
+ if test "$shlibpath_overrides_runpath" = yes && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
+ $echo >> $output "\
+ # Add our own library path to $shlibpath_var
+ $shlibpath_var=\"$temp_rpath\$$shlibpath_var\"
+
+ # Some systems cannot cope with colon-terminated $shlibpath_var
+ # The second colon is a workaround for a bug in BeOS R4 sed
+ $shlibpath_var=\`\$echo \"X\$$shlibpath_var\" | \$Xsed -e 's/::*\$//'\`
+
+ export $shlibpath_var
+"
+ fi
+
+ # fixup the dll searchpath if we need to.
+ if test -n "$dllsearchpath"; then
+ $echo >> $output "\
+ # Add the dll search path components to the executable PATH
+ PATH=$dllsearchpath:\$PATH
+"
+ fi
+
+ $echo >> $output "\
+ if test \"\$libtool_execute_magic\" != \"$magic\"; then
+ # Run the actual program with our arguments.
+"
+ case $host in
+ *-*-cygwin* | *-*-mingw | *-*-os2*)
+ # win32 systems need to use the prog path for dll
+ # lookup to work
+ $echo >> $output "\
+ exec \$progdir\\\\\$program \${1+\"\$@\"}
+"
+ ;;
+ *)
+ $echo >> $output "\
+ # Export the path to the program.
+ PATH=\"\$progdir:\$PATH\"
+ export PATH
+
+ exec \$program \${1+\"\$@\"}
+"
+ ;;
+ esac
+ $echo >> $output "\
+ \$echo \"\$0: cannot exec \$program \${1+\"\$@\"}\"
+ exit 1
+ fi
+ else
+ # The program doesn't exist.
+ \$echo \"\$0: error: \$progdir/\$program does not exist\" 1>&2
+ \$echo \"This script is just a wrapper for \$program.\" 1>&2
+ echo \"See the $PACKAGE documentation for more information.\" 1>&2
+ exit 1
+ fi
+fi\
+"
+ chmod +x $output
+ fi
+ exit 0
+ ;;
+ esac
+
+ # See if we need to build an old-fashioned archive.
+ for oldlib in $oldlibs; do
+
+ if test "$build_libtool_libs" = convenience; then
+ oldobjs="$libobjs_save"
+ addlibs="$convenience"
+ build_libtool_libs=no
+ else
+ if test "$build_libtool_libs" = module; then
+ oldobjs="$libobjs_save"
+ build_libtool_libs=no
+ else
+ oldobjs="$objs$old_deplibs "`$echo "X$libobjs_save" | $SP2NL | $Xsed -e '/\.'${libext}'$/d' -e '/\.lib$/d' -e "$lo2o" | $NL2SP`
+ fi
+ addlibs="$old_convenience"
+ fi
+
+ if test -n "$addlibs"; then
+ gentop="$output_objdir/${outputname}x"
+ $show "${rm}r $gentop"
+ $run ${rm}r "$gentop"
+ $show "mkdir $gentop"
+ $run mkdir "$gentop"
+ status=$?
+ if test $status -ne 0 && test ! -d "$gentop"; then
+ exit $status
+ fi
+ generated="$generated $gentop"
+
+ # Add in members from convenience archives.
+ for xlib in $addlibs; do
+ # Extract the objects.
+ case "$xlib" in
+ [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+ *) xabs=`pwd`"/$xlib" ;;
+ esac
+ xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+ xdir="$gentop/$xlib"
+
+ $show "${rm}r $xdir"
+ $run ${rm}r "$xdir"
+ $show "mkdir $xdir"
+ $run mkdir "$xdir"
+ status=$?
+ if test $status -ne 0 && test ! -d "$xdir"; then
+ exit $status
+ fi
+ $show "(cd $xdir && $AR x $xabs)"
+ $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+ oldobjs="$oldobjs "`find $xdir -name \*.${objext} -print -o -name \*.lo -print | $NL2SP`
+ done
+ fi
+
+ # Do each command in the archive commands.
+ if test -n "$old_archive_from_new_cmds" && test "$build_libtool_libs" = yes; then
+ eval cmds=\"$old_archive_from_new_cmds\"
+ else
+ # Ensure that we have .o objects in place in case we decided
+ # not to build a shared library, and have fallen back to building
+ # static libs even though --disable-static was passed!
+ for oldobj in $oldobjs; do
+ if test ! -f $oldobj; then
+ xdir=`$echo "X$oldobj" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$oldobj"; then
+ xdir="."
+ else
+ xdir="$xdir"
+ fi
+ baseobj=`$echo "X$oldobj" | $Xsed -e 's%^.*/%%'`
+ obj=`$echo "X$baseobj" | $Xsed -e "$o2lo"`
+ $show "(cd $xdir && ${LN_S} $obj $baseobj)"
+ $run eval '(cd $xdir && ${LN_S} $obj $baseobj)' || exit $?
+ fi
+ done
+
+ eval cmds=\"$old_archive_cmds\"
+ fi
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ done
+
+ if test -n "$generated"; then
+ $show "${rm}r$generated"
+ $run ${rm}r$generated
+ fi
+
+ # Now create the libtool archive.
+ case "$output" in
+ *.la)
+ old_library=
+ test "$build_old_libs" = yes && old_library="$libname.$libext"
+ $show "creating $output"
+
+ # Quote the link command for shipping.
+ relink_command="cd `pwd`; $SHELL $0 --mode=relink $libtool_args"
+ relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"`
+
+ # Only create the output if not a dry run.
+ if test -z "$run"; then
+ for installed in no yes; do
+ if test "$installed" = yes; then
+ if test -z "$install_libdir"; then
+ break
+ fi
+ output="$output_objdir/$outputname"i
+ # Replace all uninstalled libtool libraries with the installed ones
+ newdependency_libs=
+ for deplib in $dependency_libs; do
+ case "$deplib" in
+ *.la)
+ name=`$echo "X$deplib" | $Xsed -e 's%^.*/%%'`
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ newdependency_libs="$newdependency_libs $libdir/$name"
+ ;;
+ *) newdependency_libs="$newdependency_libs $deplib" ;;
+ esac
+ done
+ dependency_libs="$newdependency_libs"
+ newdlfiles=
+ for lib in $dlfiles; do
+ name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ newdlfiles="$newdlfiles $libdir/$name"
+ done
+ dlfiles="$newdlfiles"
+ newdlprefiles=
+ for lib in $dlprefiles; do
+ name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ newdlprefiles="$newdlprefiles $libdir/$name"
+ done
+ dlprefiles="$newdlprefiles"
+ fi
+ $rm $output
+ $echo > $output "\
+# $outputname - a libtool library file
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# The name that we can dlopen(3).
+dlname='$dlname'
+
+# Names of this library.
+library_names='$library_names'
+
+# The name of the static archive.
+old_library='$old_library'
+
+# Libraries that this one depends upon.
+dependency_libs='$dependency_libs'
+
+# Version information for $libname.
+current=$current
+age=$age
+revision=$revision
+
+# Is this an already installed library?
+installed=$installed
+
+# Files to dlopen/dlpreopen
+dlopen='$dlfiles'
+dlpreopen='$dlprefiles'
+
+# Directory that this library needs to be installed in:
+libdir='$install_libdir'"
+ if test "$installed" = no; then
+ $echo >> $output "\
+relink_command=\"$relink_command\""
+ fi
+ done
+ fi
+
+ # Do a symbolic link so that the libtool archive can be found in
+ # LD_LIBRARY_PATH before the program is installed.
+ $show "(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)"
+ $run eval '(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)' || exit $?
+ ;;
+ esac
+ exit 0
+ ;;
+
+ # libtool install mode
+ install)
+ modename="$modename: install"
+
+ # There may be an optional sh(1) argument at the beginning of
+ # install_prog (especially on Windows NT).
+ if test "$nonopt" = "$SHELL" || test "$nonopt" = /bin/sh; then
+ # Aesthetically quote it.
+ arg=`$echo "X$nonopt" | $Xsed -e "$sed_quote_subst"`
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
+ arg="\"$arg\""
+ ;;
+ esac
+ install_prog="$arg "
+ arg="$1"
+ shift
+ else
+ install_prog=
+ arg="$nonopt"
+ fi
+
+ # The real first argument should be the name of the installation program.
+ # Aesthetically quote it.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
+ arg="\"$arg\""
+ ;;
+ esac
+ install_prog="$install_prog$arg"
+
+ # We need to accept at least all the BSD install flags.
+ dest=
+ files=
+ opts=
+ prev=
+ install_type=
+ isdir=no
+ stripme=
+ for arg
+ do
+ if test -n "$dest"; then
+ files="$files $dest"
+ dest="$arg"
+ continue
+ fi
+
+ case "$arg" in
+ -d) isdir=yes ;;
+ -f) prev="-f" ;;
+ -g) prev="-g" ;;
+ -m) prev="-m" ;;
+ -o) prev="-o" ;;
+ -s)
+ stripme=" -s"
+ continue
+ ;;
+ -*) ;;
+
+ *)
+ # If the previous option needed an argument, then skip it.
+ if test -n "$prev"; then
+ prev=
+ else
+ dest="$arg"
+ continue
+ fi
+ ;;
+ esac
+
+ # Aesthetically quote the argument.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
+ arg="\"$arg\""
+ ;;
+ esac
+ install_prog="$install_prog $arg"
+ done
+
+ if test -z "$install_prog"; then
+ $echo "$modename: you must specify an install program" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ if test -n "$prev"; then
+ $echo "$modename: the \`$prev' option requires an argument" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ if test -z "$files"; then
+ if test -z "$dest"; then
+ $echo "$modename: no file or destination specified" 1>&2
+ else
+ $echo "$modename: you must specify a destination" 1>&2
+ fi
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Strip any trailing slash from the destination.
+ dest=`$echo "X$dest" | $Xsed -e 's%/$%%'`
+
+ # Check to see that the destination is a directory.
+ test -d "$dest" && isdir=yes
+ if test "$isdir" = yes; then
+ destdir="$dest"
+ destname=
+ else
+ destdir=`$echo "X$dest" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$destdir" = "X$dest" && destdir=.
+ destname=`$echo "X$dest" | $Xsed -e 's%^.*/%%'`
+
+ # Not a directory, so check to see that there is only one file specified.
+ set dummy $files
+ if test $# -gt 2; then
+ $echo "$modename: \`$dest' is not a directory" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+ fi
+ case "$destdir" in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ for file in $files; do
+ case "$file" in
+ *.lo) ;;
+ *)
+ $echo "$modename: \`$destdir' must be an absolute directory name" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+ esac
+ done
+ ;;
+ esac
+
+ # This variable tells wrapper scripts just to set variables rather
+ # than running their programs.
+ libtool_install_magic="$magic"
+
+ staticlibs=
+ future_libdirs=
+ current_libdirs=
+ for file in $files; do
+
+ # Do each installation.
+ case "$file" in
+ *.$libext)
+ # Do the static libraries later.
+ staticlibs="$staticlibs $file"
+ ;;
+
+ *.la)
+ # Check to see that this really is a libtool archive.
+ if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+ else
+ $echo "$modename: \`$file' is not a valid libtool archive" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ library_names=
+ old_library=
+ relink_command=
+ # If there is no directory component, then add one.
+ case "$file" in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Add the libdir to current_libdirs if it is the destination.
+ if test "X$destdir" = "X$libdir"; then
+ case "$current_libdirs " in
+ *" $libdir "*) ;;
+ *) current_libdirs="$current_libdirs $libdir" ;;
+ esac
+ else
+ # Note the libdir as a future libdir.
+ case "$future_libdirs " in
+ *" $libdir "*) ;;
+ *) future_libdirs="$future_libdirs $libdir" ;;
+ esac
+ fi
+
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`/
+ test "X$dir" = "X$file/" && dir=
+ dir="$dir$objdir"
+
+ if test "$hardcode_into_libs" = all; then
+ if test -z "$relink_command"; then
+ $echo "$modename: invalid libtool pseudo library \`$file'" 1>&2
+ exit 1
+ fi
+ $echo "$modename: warning: relinking \`$file'" 1>&2
+ $show "$relink_command"
+ if $run eval "$relink_command"; then :
+ else
+ $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
+ continue
+ fi
+ fi
+
+ # See the names of the shared library.
+ set dummy $library_names
+ if test -n "$2"; then
+ realname="$2"
+ shift
+ shift
+
+ srcname="$realname"
+ test "$hardcode_into_libs" = all && srcname="$realname"T
+
+ # Install the shared library and build the symlinks.
+ $show "$install_prog $dir/$srcname $destdir/$realname"
+ $run eval "$install_prog $dir/$srcname $destdir/$realname" || exit $?
+ if test -n "$stripme" && test -n "$striplib"; then
+ $show "$striplib $destdir/$realname"
+ $run eval "$striplib $destdir/$realname" || exit $?
+ fi
+
+ if test $# -gt 0; then
+ # Delete the old symlinks, and create new ones.
+ for linkname
+ do
+ if test "$linkname" != "$realname"; then
+ $show "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)"
+ $run eval "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)"
+ fi
+ done
+ fi
+
+ # Do each command in the postinstall commands.
+ lib="$destdir/$realname"
+ eval cmds=\"$postinstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ fi
+
+ # Install the pseudo-library for information purposes.
+ name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ instname="$dir/$name"i
+ $show "$install_prog $instname $destdir/$name"
+ $run eval "$install_prog $instname $destdir/$name" || exit $?
+
+ # Maybe install the static library, too.
+ test -n "$old_library" && staticlibs="$staticlibs $dir/$old_library"
+ ;;
+
+ *.lo)
+ # Install (i.e. copy) a libtool object.
+
+ # Figure out destination file name, if it wasn't already specified.
+ if test -n "$destname"; then
+ destfile="$destdir/$destname"
+ else
+ destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ destfile="$destdir/$destfile"
+ fi
+
+ # Deduce the name of the destination old-style object file.
+ case "$destfile" in
+ *.lo)
+ staticdest=`$echo "X$destfile" | $Xsed -e "$lo2o"`
+ ;;
+ *.$objext)
+ staticdest="$destfile"
+ destfile=
+ ;;
+ *)
+ $echo "$modename: cannot copy a libtool object to \`$destfile'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+ esac
+
+ # Install the libtool object if requested.
+ if test -n "$destfile"; then
+ $show "$install_prog $file $destfile"
+ $run eval "$install_prog $file $destfile" || exit $?
+ fi
+
+ # Install the old object if enabled.
+ if test "$build_old_libs" = yes; then
+ # Deduce the name of the old-style object file.
+ staticobj=`$echo "X$file" | $Xsed -e "$lo2o"`
+
+ $show "$install_prog $staticobj $staticdest"
+ $run eval "$install_prog \$staticobj \$staticdest" || exit $?
+ fi
+ exit 0
+ ;;
+
+ *)
+ # Figure out destination file name, if it wasn't already specified.
+ if test -n "$destname"; then
+ destfile="$destdir/$destname"
+ else
+ destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ destfile="$destdir/$destfile"
+ fi
+
+ # Do a test to see if this is really a libtool program.
+ if (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ link_against_libtool_libs=
+ relink_command=
+
+ # If there is no directory component, then add one.
+ case "$file" in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Check the variables that should have been set.
+ if test -z "$link_against_libtool_libs"; then
+ $echo "$modename: invalid libtool wrapper script \`$file'" 1>&2
+ exit 1
+ fi
+
+ finalize=yes
+ for lib in $link_against_libtool_libs; do
+ # Check to see that each library is installed.
+ libdir=
+ if test -f "$lib"; then
+ # If there is no directory component, then add one.
+ case "$lib" in
+ */* | *\\*) . $lib ;;
+ *) . ./$lib ;;
+ esac
+ fi
+ libfile="$libdir/"`$echo "X$lib" | $Xsed -e 's%^.*/%%g'` ### testsuite: skip nested quoting test
+ if test -n "$libdir" && test ! -f "$libfile"; then
+ $echo "$modename: warning: \`$lib' has not been installed in \`$libdir'" 1>&2
+ finalize=no
+ fi
+ done
+
+ relink_command=
+ # If there is no directory component, then add one.
+ case "$file" in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ outputname=
+ if test "$fast_install" = no && test -n "$relink_command"; then
+ if test "$finalize" = yes && test -z "$run"; then
+ tmpdir="/tmp"
+ test -n "$TMPDIR" && tmpdir="$TMPDIR"
+ tmpdir="$tmpdir/libtool-$$"
+ if $mkdir -p "$tmpdir" && chmod 700 "$tmpdir"; then :
+ else
+ $echo "$modename: error: cannot create temporary directory \`$tmpdir'" 1>&2
+ continue
+ fi
+ outputname="$tmpdir/$file"
+ # Replace the output file specification.
+ relink_command=`$echo "X$relink_command" | $Xsed -e 's%@OUTPUT@%'"$outputname"'%g'`
+
+ $show "$relink_command"
+ if $run eval "$relink_command"; then :
+ else
+ $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
+ ${rm}r "$tmpdir"
+ continue
+ fi
+ file="$outputname"
+ else
+ $echo "$modename: warning: cannot relink \`$file'" 1>&2
+ fi
+ else
+ # Install the binary that we compiled earlier.
+ file=`$echo "X$file" | $Xsed -e "s%\([^/]*\)$%$objdir/\1%"`
+ fi
+ fi
+
+ $show "$install_prog$stripme $file $destfile"
+ $run eval "$install_prog\$stripme \$file \$destfile" || exit $?
+ test -n "$outputname" && ${rm}r "$tmpdir"
+ ;;
+ esac
+ done
+
+ for file in $staticlibs; do
+ name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+
+ # Set up the ranlib parameters.
+ oldlib="$destdir/$name"
+
+ $show "$install_prog $file $oldlib"
+ $run eval "$install_prog \$file \$oldlib" || exit $?
+
+ if test -n "$stripme" && test -n "$striplib"; then
+ $show "$old_striplib $oldlib"
+ $run eval "$old_striplib $oldlib" || exit $?
+ fi
+
+ # Do each command in the postinstall commands.
+ eval cmds=\"$old_postinstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ done
+
+ if test -n "$future_libdirs"; then
+ $echo "$modename: warning: remember to run \`$progname --finish$future_libdirs'" 1>&2
+ fi
+
+ if test -n "$current_libdirs"; then
+ # Maybe just do a dry run.
+ test -n "$run" && current_libdirs=" -n$current_libdirs"
+ exec $SHELL $0 --finish$current_libdirs
+ exit 1
+ fi
+
+ exit 0
+ ;;
+
+ # libtool finish mode
+ finish)
+ modename="$modename: finish"
+ libdirs="$nonopt"
+ admincmds=
+
+ if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
+ for dir
+ do
+ libdirs="$libdirs $dir"
+ done
+
+ for libdir in $libdirs; do
+ if test -n "$finish_cmds"; then
+ # Do each command in the finish commands.
+ eval cmds=\"$finish_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || admincmds="$admincmds
+ $cmd"
+ done
+ IFS="$save_ifs"
+ fi
+ if test -n "$finish_eval"; then
+ # Do the single finish_eval.
+ eval cmds=\"$finish_eval\"
+ $run eval "$cmds" || admincmds="$admincmds
+ $cmds"
+ fi
+ done
+ fi
+
+ # Exit here if they wanted silent mode.
+ test "$show" = : && exit 0
+
+ echo "----------------------------------------------------------------------"
+ echo "Libraries have been installed in:"
+ for libdir in $libdirs; do
+ echo " $libdir"
+ done
+ echo
+ echo "If you ever happen to want to link against installed libraries"
+ echo "in a given directory, LIBDIR, you must either use libtool, and"
+ echo "specify the full pathname of the library, or use \`-LLIBDIR'"
+ echo "flag during linking and do at least one of the following:"
+ if test -n "$shlibpath_var"; then
+ echo " - add LIBDIR to the \`$shlibpath_var' environment variable"
+ echo " during execution"
+ fi
+ if test -n "$runpath_var"; then
+ echo " - add LIBDIR to the \`$runpath_var' environment variable"
+ echo " during linking"
+ fi
+ if test -n "$hardcode_libdir_flag_spec"; then
+ libdir=LIBDIR
+ eval flag=\"$hardcode_libdir_flag_spec\"
+
+ echo " - use the \`$flag' linker flag"
+ fi
+ if test -n "$admincmds"; then
+ echo " - have your system administrator run these commands:$admincmds"
+ fi
+ if test -f /etc/ld.so.conf; then
+ echo " - have your system administrator add LIBDIR to \`/etc/ld.so.conf'"
+ fi
+ echo
+ echo "See any operating system documentation about shared libraries for"
+ echo "more information, such as the ld(1) and ld.so(8) manual pages."
+ echo "----------------------------------------------------------------------"
+ exit 0
+ ;;
+
+ # libtool execute mode
+ execute)
+ modename="$modename: execute"
+
+ # The first argument is the command name.
+ cmd="$nonopt"
+ if test -z "$cmd"; then
+ $echo "$modename: you must specify a COMMAND" 1>&2
+ $echo "$help"
+ exit 1
+ fi
+
+ # Handle -dlopen flags immediately.
+ for file in $execute_dlfiles; do
+ if test ! -f "$file"; then
+ $echo "$modename: \`$file' is not a file" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ dir=
+ case "$file" in
+ *.la)
+ # Check to see that this really is a libtool archive.
+ if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+ else
+ $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Read the libtool library.
+ dlname=
+ library_names=
+
+ # If there is no directory component, then add one.
+ case "$file" in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Skip this library if it cannot be dlopened.
+ if test -z "$dlname"; then
+ # Warn if it was a shared library.
+ test -n "$library_names" && $echo "$modename: warning: \`$file' was not linked with \`-export-dynamic'"
+ continue
+ fi
+
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$dir" = "X$file" && dir=.
+
+ if test -f "$dir/$objdir/$dlname"; then
+ dir="$dir/$objdir"
+ else
+ $echo "$modename: cannot find \`$dlname' in \`$dir' or \`$dir/$objdir'" 1>&2
+ exit 1
+ fi
+ ;;
+
+ *.lo)
+ # Just add the directory containing the .lo file.
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$dir" = "X$file" && dir=.
+ ;;
+
+ *)
+ $echo "$modename: warning \`-dlopen' is ignored for non-libtool libraries and objects" 1>&2
+ continue
+ ;;
+ esac
+
+ # Get the absolute pathname.
+ absdir=`cd "$dir" && pwd`
+ test -n "$absdir" && dir="$absdir"
+
+ # Now add the directory to shlibpath_var.
+ if eval "test -z \"\$$shlibpath_var\""; then
+ eval "$shlibpath_var=\"\$dir\""
+ else
+ eval "$shlibpath_var=\"\$dir:\$$shlibpath_var\""
+ fi
+ done
+
+ # This variable tells wrapper scripts just to set shlibpath_var
+ # rather than running their programs.
+ libtool_execute_magic="$magic"
+
+ # Check if any of the arguments is a wrapper script.
+ args=
+ for file
+ do
+ case "$file" in
+ -*) ;;
+ *)
+ # Do a test to see if this is really a libtool program.
+ if (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ # If there is no directory component, then add one.
+ case "$file" in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Transform arg to wrapped name.
+ file="$progdir/$program"
+ fi
+ ;;
+ esac
+ # Quote arguments (to preserve shell metacharacters).
+ file=`$echo "X$file" | $Xsed -e "$sed_quote_subst"`
+ args="$args \"$file\""
+ done
+
+ if test -z "$run"; then
+ if test -n "$shlibpath_var"; then
+ # Export the shlibpath_var.
+ eval "export $shlibpath_var"
+ fi
+
+	# Restore saved environment variables
+ if test "${save_LC_ALL+set}" = set; then
+ LC_ALL="$save_LC_ALL"; export LC_ALL
+ fi
+ if test "${save_LANG+set}" = set; then
+ LANG="$save_LANG"; export LANG
+ fi
+
+ # Now actually exec the command.
+ eval "exec \$cmd$args"
+
+ $echo "$modename: cannot exec \$cmd$args"
+ exit 1
+ else
+ # Display what would be done.
+ if test -n "$shlibpath_var"; then
+ eval "\$echo \"\$shlibpath_var=\$$shlibpath_var\""
+ $echo "export $shlibpath_var"
+ fi
+ $echo "$cmd$args"
+ exit 0
+ fi
+ ;;
+
+ # libtool clean and uninstall mode
+ clean | uninstall)
+ modename="$modename: $mode"
+ rm="$nonopt"
+ files=
+
+ # This variable tells wrapper scripts just to set variables rather
+ # than running their programs.
+ libtool_install_magic="$magic"
+
+ for arg
+ do
+ case "$arg" in
+ -*) rm="$rm $arg" ;;
+ *) files="$files $arg" ;;
+ esac
+ done
+
+ if test -z "$rm"; then
+ $echo "$modename: you must specify an RM program" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ for file in $files; do
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$dir" = "X$file"; then
+ dir=.
+ objdir="$objdir"
+ else
+ objdir="$dir/$objdir"
+ fi
+ name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ test $mode = uninstall && objdir="$dir"
+
+ rmfiles="$file"
+
+ case "$name" in
+ *.la)
+ # Possibly a libtool archive, so verify it.
+ if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ . $dir/$name
+
+ # Delete the libtool libraries and symlinks.
+ for n in $library_names; do
+ rmfiles="$rmfiles $objdir/$n"
+ done
+ test -n "$old_library" && rmfiles="$rmfiles $objdir/$old_library"
+ test $mode = clean && rmfiles="$rmfiles $objdir/$name $objdir/${name}i"
+
+ if test $mode = uninstall; then
+ if test -n "$library_names"; then
+ # Do each command in the postuninstall commands.
+ eval cmds=\"$postuninstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd"
+ done
+ IFS="$save_ifs"
+ fi
+
+ if test -n "$old_library"; then
+ # Do each command in the old_postuninstall commands.
+ eval cmds=\"$old_postuninstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd"
+ done
+ IFS="$save_ifs"
+ fi
+ # FIXME: should reinstall the best remaining shared library.
+ fi
+ fi
+ ;;
+
+ *.lo)
+ if test "$build_old_libs" = yes; then
+ oldobj=`$echo "X$name" | $Xsed -e "$lo2o"`
+ rmfiles="$rmfiles $dir/$oldobj"
+ fi
+ ;;
+
+ *)
+ # Do a test to see if this is a libtool program.
+ if test $mode = clean &&
+ (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ relink_command=
+ . $dir/$file
+
+ rmfiles="$rmfiles $objdir/$name $objdir/${name}S.${objext}"
+ if test "$fast_install" = yes && test -n "$relink_command"; then
+ rmfiles="$rmfiles $objdir/lt-$name"
+ fi
+ fi
+ ;;
+ esac
+ $show "$rm $rmfiles"
+ $run $rm $rmfiles
+ done
+ exit 0
+ ;;
+
+ "")
+ $echo "$modename: you must specify a MODE" 1>&2
+ $echo "$generic_help" 1>&2
+ exit 1
+ ;;
+ esac
+
+ $echo "$modename: invalid operation mode \`$mode'" 1>&2
+ $echo "$generic_help" 1>&2
+ exit 1
+fi # test -z "$show_help"
+
+# We need to display help for each of the modes.
+case "$mode" in
+"") $echo \
+"Usage: $modename [OPTION]... [MODE-ARG]...
+
+Provide generalized library-building support services.
+
+ --config show all configuration variables
+ --debug enable verbose shell tracing
+-n, --dry-run display commands without modifying any files
+ --features display basic configuration information and exit
+ --finish same as \`--mode=finish'
+ --help display this help message and exit
+ --mode=MODE use operation mode MODE [default=inferred from MODE-ARGS]
+ --quiet same as \`--silent'
+ --silent don't print informational messages
+ --version print version information
+
+MODE must be one of the following:
+
+ clean remove files from the build directory
+ compile compile a source file into a libtool object
+ execute automatically set library path, then run a program
+ finish complete the installation of libtool libraries
+ install install libraries or executables
+ link create a library or an executable
+ uninstall remove libraries from an installed directory
+
+MODE-ARGS vary depending on the MODE. Try \`$modename --help --mode=MODE' for
+a more detailed description of MODE."
+ exit 0
+ ;;
+
+clean)
+ $echo \
+"Usage: $modename [OPTION]... --mode=clean RM [RM-OPTION]... FILE...
+
+Remove files from the build directory.
+
+RM is the name of the program to use to delete files associated with each FILE
+(typically \`/bin/rm'). RM-OPTIONS are options (such as \`-f') to be passed
+to RM.
+
+If FILE is a libtool library, object or program, all the files associated
+with it are deleted. Otherwise, only FILE itself is deleted using RM."
+ ;;
+
+compile)
+ $echo \
+"Usage: $modename [OPTION]... --mode=compile COMPILE-COMMAND... SOURCEFILE
+
+Compile a source file into a libtool library object.
+
+This mode accepts the following additional options:
+
+ -o OUTPUT-FILE set the output file name to OUTPUT-FILE
+ -static always build a \`.o' file suitable for static linking
+
+COMPILE-COMMAND is a command to be used in creating a \`standard' object file
+from the given SOURCEFILE.
+
+The output file name is determined by removing the directory component from
+SOURCEFILE, then substituting the C source code suffix \`.c' with the
+library object suffix, \`.lo'."
+ ;;
+
+execute)
+ $echo \
+"Usage: $modename [OPTION]... --mode=execute COMMAND [ARGS]...
+
+Automatically set library path, then run a program.
+
+This mode accepts the following additional options:
+
+ -dlopen FILE add the directory containing FILE to the library path
+
+This mode sets the library path environment variable according to \`-dlopen'
+flags.
+
+If any of the ARGS are libtool executable wrappers, then they are translated
+into their corresponding uninstalled binary, and any of their required library
+directories are added to the library path.
+
+Then, COMMAND is executed, with ARGS as arguments."
+ ;;
+
+finish)
+ $echo \
+"Usage: $modename [OPTION]... --mode=finish [LIBDIR]...
+
+Complete the installation of libtool libraries.
+
+Each LIBDIR is a directory that contains libtool libraries.
+
+The commands that this mode executes may require superuser privileges. Use
+the \`--dry-run' option if you just want to see what would be executed."
+ ;;
+
+install)
+ $echo \
+"Usage: $modename [OPTION]... --mode=install INSTALL-COMMAND...
+
+Install executables or libraries.
+
+INSTALL-COMMAND is the installation command. The first component should be
+either the \`install' or \`cp' program.
+
+The rest of the components are interpreted as arguments to that command (only
+BSD-compatible install options are recognized)."
+ ;;
+
+link)
+ $echo \
+"Usage: $modename [OPTION]... --mode=link LINK-COMMAND...
+
+Link object files or libraries together to form another library, or to
+create an executable program.
+
+LINK-COMMAND is a command using the C compiler that you would use to create
+a program from several object files.
+
+The following components of LINK-COMMAND are treated specially:
+
+ -all-static do not do any dynamic linking at all
+ -avoid-version do not add a version suffix if possible
+ -dlopen FILE \`-dlpreopen' FILE if it cannot be dlopened at runtime
+ -dlpreopen FILE link in FILE and add its symbols to lt_preloaded_symbols
+ -export-dynamic allow symbols from OUTPUT-FILE to be resolved with dlsym(3)
+ -export-symbols SYMFILE
+ try to export only the symbols listed in SYMFILE
+ -export-symbols-regex REGEX
+ try to export only the symbols matching REGEX
+ -LLIBDIR search LIBDIR for required installed libraries
+ -lNAME OUTPUT-FILE requires the installed library libNAME
+ -module build a library that can dlopened
+ -no-fast-install disable the fast-install mode
+ -no-install link a not-installable executable
+ -no-undefined declare that a library does not refer to external symbols
+ -o OUTPUT-FILE create OUTPUT-FILE from the specified objects
+ -release RELEASE specify package release information
+ -rpath LIBDIR the created library will eventually be installed in LIBDIR
+ -R[ ]LIBDIR add LIBDIR to the runtime path of programs and libraries
+ -static do not do any dynamic linking of libtool libraries
+ -version-info CURRENT[:REVISION[:AGE]]
+ specify library version info [each variable defaults to 0]
+
+All other options (arguments beginning with \`-') are ignored.
+
+Every other argument is treated as a filename. Files ending in \`.la' are
+treated as uninstalled libtool libraries, other files are standard or library
+object files.
+
+If the OUTPUT-FILE ends in \`.la', then a libtool library is created,
+only library objects (\`.lo' files) may be specified, and \`-rpath' is
+required, except when creating a convenience library.
+
+If OUTPUT-FILE ends in \`.a' or \`.lib', then a standard library is created
+using \`ar' and \`ranlib', or on Windows using \`lib'.
+
+If OUTPUT-FILE ends in \`.lo' or \`.${objext}', then a reloadable object file
+is created, otherwise an executable program is created."
+ ;;
+
+uninstall)
+ $echo \
+"Usage: $modename [OPTION]... --mode=uninstall RM [RM-OPTION]... FILE...
+
+Remove libraries from an installation directory.
+
+RM is the name of the program to use to delete files associated with each FILE
+(typically \`/bin/rm'). RM-OPTIONS are options (such as \`-f') to be passed
+to RM.
+
+If FILE is a libtool library, all the files associated with it are deleted.
+Otherwise, only FILE itself is deleted using RM."
+ ;;
+
+*)
+ $echo "$modename: invalid operation mode \`$mode'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+esac
+
+echo
+$echo "Try \`$modename --help' for more information about other modes."
+
+exit 0
+
+# Local Variables:
+# mode:shell-script
+# sh-indentation:2
+# End:
diff --git a/rts/gmp/mdate-sh b/rts/gmp/mdate-sh
new file mode 100644
index 0000000000..37171f21fb
--- /dev/null
+++ b/rts/gmp/mdate-sh
@@ -0,0 +1,92 @@
+#!/bin/sh
+# Get modification time of a file or directory and pretty-print it.
+# Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
+# written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, June 1995
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+# Prevent date giving response in another language.
+LANG=C
+export LANG
+LC_ALL=C
+export LC_ALL
+LC_TIME=C
+export LC_TIME
+
+# Get the extended ls output of the file or directory.
+# On HPUX /bin/sh, "set" interprets "-rw-r--r--" as options, so the "x" below.
+if ls -L /dev/null 1>/dev/null 2>&1; then
+ set - x`ls -L -l -d $1`
+else
+ set - x`ls -l -d $1`
+fi
+# The month is at least the fourth argument
+# (3 shifts here, the next inside the loop).
+shift
+shift
+shift
+
+# Find the month. Next argument is day, followed by the year or time.
+month=
+until test $month
+do
+ shift
+ case $1 in
+ Jan) month=January; nummonth=1;;
+ Feb) month=February; nummonth=2;;
+ Mar) month=March; nummonth=3;;
+ Apr) month=April; nummonth=4;;
+ May) month=May; nummonth=5;;
+ Jun) month=June; nummonth=6;;
+ Jul) month=July; nummonth=7;;
+ Aug) month=August; nummonth=8;;
+ Sep) month=September; nummonth=9;;
+ Oct) month=October; nummonth=10;;
+ Nov) month=November; nummonth=11;;
+ Dec) month=December; nummonth=12;;
+ esac
+done
+
+day=$2
+
+# Here we have to deal with the problem that the ls output gives either
+# the time of day or the year.
+case $3 in
+ *:*) set `date`; eval year=\$$#
+ case $2 in
+ Jan) nummonthtod=1;;
+ Feb) nummonthtod=2;;
+ Mar) nummonthtod=3;;
+ Apr) nummonthtod=4;;
+ May) nummonthtod=5;;
+ Jun) nummonthtod=6;;
+ Jul) nummonthtod=7;;
+ Aug) nummonthtod=8;;
+ Sep) nummonthtod=9;;
+ Oct) nummonthtod=10;;
+ Nov) nummonthtod=11;;
+ Dec) nummonthtod=12;;
+ esac
+ # For the first six month of the year the time notation can also
+ # be used for files modified in the last year.
+ if (expr $nummonth \> $nummonthtod) > /dev/null;
+ then
+ year=`expr $year - 1`
+ fi;;
+ *) year=$3;;
+esac
+
+# The result.
+echo $day $month $year
diff --git a/rts/gmp/memory.c b/rts/gmp/memory.c
new file mode 100644
index 0000000000..9df440ce22
--- /dev/null
+++ b/rts/gmp/memory.c
@@ -0,0 +1,160 @@
+/* Memory allocation routines.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+#include <stdlib.h> /* for malloc, realloc, free */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifdef __NeXT__
+#define static
+#endif
+
+
+void * (*_mp_allocate_func) _PROTO ((size_t)) = _mp_default_allocate;
+void * (*_mp_reallocate_func) _PROTO ((void *, size_t, size_t))
+ = _mp_default_reallocate;
+void (*_mp_free_func) _PROTO ((void *, size_t)) = _mp_default_free;
+
+
+/* Default allocation functions. In case of failure to allocate/reallocate
+ an error message is written to stderr and the program aborts. */
+
+void *
+#if __STDC__
+_mp_default_allocate (size_t size)
+#else
+_mp_default_allocate (size)
+ size_t size;
+#endif
+{
+ void *ret;
+#ifdef DEBUG
+ size_t req_size = size;
+ size += 2 * BYTES_PER_MP_LIMB;
+#endif
+ ret = malloc (size);
+ if (ret == 0)
+ {
+ perror ("cannot allocate in gmp");
+ abort ();
+ }
+
+#ifdef DEBUG
+ {
+ mp_ptr p = ret;
+ p++;
+ p[-1] = (0xdeadbeef << 31) + 0xdeafdeed;
+ if (req_size % BYTES_PER_MP_LIMB == 0)
+ p[req_size / BYTES_PER_MP_LIMB] = ~((0xdeadbeef << 31) + 0xdeafdeed);
+ ret = p;
+ }
+#endif
+ return ret;
+}
+
+void *
+#if __STDC__
+_mp_default_reallocate (void *oldptr, size_t old_size, size_t new_size)
+#else
+_mp_default_reallocate (oldptr, old_size, new_size)
+ void *oldptr;
+ size_t old_size;
+ size_t new_size;
+#endif
+{
+ void *ret;
+
+#ifdef DEBUG
+ size_t req_size = new_size;
+
+ if (old_size != 0)
+ {
+ mp_ptr p = oldptr;
+ if (p[-1] != (0xdeadbeef << 31) + 0xdeafdeed)
+ {
+ fprintf (stderr, "gmp: (realloc) data clobbered before allocation block\n");
+ abort ();
+ }
+ if (old_size % BYTES_PER_MP_LIMB == 0)
+ if (p[old_size / BYTES_PER_MP_LIMB] != ~((0xdeadbeef << 31) + 0xdeafdeed))
+ {
+ fprintf (stderr, "gmp: (realloc) data clobbered after allocation block\n");
+ abort ();
+ }
+ oldptr = p - 1;
+ }
+
+ new_size += 2 * BYTES_PER_MP_LIMB;
+#endif
+
+ ret = realloc (oldptr, new_size);
+ if (ret == 0)
+ {
+ perror ("cannot allocate in gmp");
+ abort ();
+ }
+
+#ifdef DEBUG
+ {
+ mp_ptr p = ret;
+ p++;
+ p[-1] = (0xdeadbeef << 31) + 0xdeafdeed;
+ if (req_size % BYTES_PER_MP_LIMB == 0)
+ p[req_size / BYTES_PER_MP_LIMB] = ~((0xdeadbeef << 31) + 0xdeafdeed);
+ ret = p;
+ }
+#endif
+ return ret;
+}
+
+void
+#if __STDC__
+_mp_default_free (void *blk_ptr, size_t blk_size)
+#else
+_mp_default_free (blk_ptr, blk_size)
+ void *blk_ptr;
+ size_t blk_size;
+#endif
+{
+#ifdef DEBUG
+ {
+ mp_ptr p = blk_ptr;
+ if (blk_size != 0)
+ {
+ if (p[-1] != (0xdeadbeef << 31) + 0xdeafdeed)
+ {
+ fprintf (stderr, "gmp: (free) data clobbered before allocation block\n");
+ abort ();
+ }
+ if (blk_size % BYTES_PER_MP_LIMB == 0)
+ if (p[blk_size / BYTES_PER_MP_LIMB] != ~((0xdeadbeef << 31) + 0xdeafdeed))
+ {
+ fprintf (stderr, "gmp: (free) data clobbered after allocation block\n");
+ abort ();
+ }
+ }
+ blk_ptr = p - 1;
+ }
+#endif
+ free (blk_ptr);
+}
diff --git a/rts/gmp/missing b/rts/gmp/missing
new file mode 100644
index 0000000000..c60e9d772f
--- /dev/null
+++ b/rts/gmp/missing
@@ -0,0 +1,244 @@
+#! /bin/sh
+# Common stub for a few missing GNU programs while installing.
+# Copyright (C) 1996, 1997, 1999 Free Software Foundation, Inc.
+# Originally by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+
+if test $# -eq 0; then
+ echo 1>&2 "Try \`$0 --help' for more information"
+ exit 1
+fi
+
+run=:
+
+case "$1" in
+--run)
+ # Try to run requested program, and just exit if it succeeds.
+ run=
+ shift
+ "$@" && exit 0
+ ;;
+esac
+
+# If it does not exist, or fails to run (possibly an outdated version),
+# try to emulate it.
+case "$1" in
+
+ -h|--h|--he|--hel|--help)
+ echo "\
+$0 [OPTION]... PROGRAM [ARGUMENT]...
+
+Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an
+error status if there is no known handling for PROGRAM.
+
+Options:
+ -h, --help display this help and exit
+ -v, --version output version information and exit
+ --run try to run the given command, and emulate it if it fails
+
+Supported PROGRAM values:
+ aclocal touch file \`aclocal.m4'
+ autoconf touch file \`configure'
+ autoheader touch file \`config.h.in'
+ automake touch all \`Makefile.in' files
+ bison create \`y.tab.[ch]', if possible, from existing .[ch]
+ flex create \`lex.yy.c', if possible, from existing .c
+ lex create \`lex.yy.c', if possible, from existing .c
+ makeinfo touch the output file
+ tar try tar, gnutar, gtar, then tar without non-portable flags
+ yacc create \`y.tab.[ch]', if possible, from existing .[ch]"
+ ;;
+
+ -v|--v|--ve|--ver|--vers|--versi|--versio|--version)
+ echo "missing 0.2 - GNU automake"
+ ;;
+
+ -*)
+ echo 1>&2 "$0: Unknown \`$1' option"
+ echo 1>&2 "Try \`$0 --help' for more information"
+ exit 1
+ ;;
+
+ aclocal)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified \`acinclude.m4' or \`configure.in'. You might want
+ to install the \`Automake' and \`Perl' packages. Grab them from
+ any GNU archive site."
+ touch aclocal.m4
+ ;;
+
+ autoconf)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified \`configure.in'. You might want to install the
+ \`Autoconf' and \`GNU m4' packages. Grab them from any GNU
+ archive site."
+ touch configure
+ ;;
+
+ autoheader)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified \`acconfig.h' or \`configure.in'. You might want
+ to install the \`Autoconf' and \`GNU m4' packages. Grab them
+ from any GNU archive site."
+ files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' configure.in`
+ test -z "$files" && files="config.h"
+ touch_files=
+ for f in $files; do
+ case "$f" in
+ *:*) touch_files="$touch_files "`echo "$f" |
+ sed -e 's/^[^:]*://' -e 's/:.*//'`;;
+ *) touch_files="$touch_files $f.in";;
+ esac
+ done
+ touch $touch_files
+ ;;
+
+ automake)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified \`Makefile.am', \`acinclude.m4' or \`configure.in'.
+ You might want to install the \`Automake' and \`Perl' packages.
+ Grab them from any GNU archive site."
+ find . -type f -name Makefile.am -print |
+ sed 's/\.am$/.in/' |
+ while read f; do touch "$f"; done
+ ;;
+
+ bison|yacc)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified a \`.y' file. You may need the \`Bison' package
+ in order for those modifications to take effect. You can get
+ \`Bison' from any GNU archive site."
+ rm -f y.tab.c y.tab.h
+ if [ $# -ne 1 ]; then
+ eval LASTARG="\${$#}"
+ case "$LASTARG" in
+ *.y)
+ SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
+ if [ -f "$SRCFILE" ]; then
+ cp "$SRCFILE" y.tab.c
+ fi
+ SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'`
+ if [ -f "$SRCFILE" ]; then
+ cp "$SRCFILE" y.tab.h
+ fi
+ ;;
+ esac
+ fi
+ if [ ! -f y.tab.h ]; then
+ echo >y.tab.h
+ fi
+ if [ ! -f y.tab.c ]; then
+ echo 'main() { return 0; }' >y.tab.c
+ fi
+ ;;
+
+ lex|flex)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified a \`.l' file. You may need the \`Flex' package
+ in order for those modifications to take effect. You can get
+ \`Flex' from any GNU archive site."
+ rm -f lex.yy.c
+ if [ $# -ne 1 ]; then
+ eval LASTARG="\${$#}"
+ case "$LASTARG" in
+ *.l)
+ SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
+ if [ -f "$SRCFILE" ]; then
+ cp "$SRCFILE" lex.yy.c
+ fi
+ ;;
+ esac
+ fi
+ if [ ! -f lex.yy.c ]; then
+ echo 'main() { return 0; }' >lex.yy.c
+ fi
+ ;;
+
+ makeinfo)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified a \`.texi' or \`.texinfo' file, or any other file
+ indirectly affecting the aspect of the manual. The spurious
+ call might also be the consequence of using a buggy \`make' (AIX,
+ DU, IRIX). You might want to install the \`Texinfo' package or
+ the \`GNU make' package. Grab either from any GNU archive site."
+ file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'`
+ if test -z "$file"; then
+ file=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'`
+ file=`sed -n '/^@setfilename/ { s/.* \([^ ]*\) *$/\1/; p; q; }' $file`
+ fi
+ touch $file
+ ;;
+
+ tar)
+ shift
+ if test -n "$run"; then
+ echo 1>&2 "ERROR: \`tar' requires --run"
+ exit 1
+ fi
+
+ # We have already tried tar in the generic part.
+ # Look for gnutar/gtar before invocation to avoid ugly error
+ # messages.
+ if (gnutar --version > /dev/null 2>&1); then
+ gnutar ${1+"$@"} && exit 0
+ fi
+ if (gtar --version > /dev/null 2>&1); then
+ gtar ${1+"$@"} && exit 0
+ fi
+ firstarg="$1"
+ if shift; then
+ case "$firstarg" in
+ *o*)
+ firstarg=`echo "$firstarg" | sed s/o//`
+ tar "$firstarg" ${1+"$@"} && exit 0
+ ;;
+ esac
+ case "$firstarg" in
+ *h*)
+ firstarg=`echo "$firstarg" | sed s/h//`
+ tar "$firstarg" ${1+"$@"} && exit 0
+ ;;
+ esac
+ fi
+
+ echo 1>&2 "\
+WARNING: I can't seem to be able to run \`tar' with the given arguments.
+ You may want to install GNU tar or Free paxutils, or check the
+ command line arguments."
+ exit 1
+ ;;
+
+ *)
+ echo 1>&2 "\
+WARNING: \`$1' is needed, and you do not seem to have it handy on your
+ system. You might have modified some files without having the
+ proper tools for further handling them. Check the \`README' file,
+        it often tells you about the needed prerequisites for installing
+ this package. You may also peek at any GNU archive site, in case
+ some other package would contain this missing \`$1' program."
+ exit 1
+ ;;
+esac
+
+exit 0
diff --git a/rts/gmp/mkinstalldirs b/rts/gmp/mkinstalldirs
new file mode 100644
index 0000000000..5e17cd39fb
--- /dev/null
+++ b/rts/gmp/mkinstalldirs
@@ -0,0 +1,38 @@
+#! /bin/sh
+# mkinstalldirs --- make directory hierarchy
+# Author: Noah Friedman <friedman@prep.ai.mit.edu>
+# Created: 1993-05-16
+# Public domain
+
+errstatus=0
+
+for file
+do
+ set fnord `echo ":$file" | sed -ne 's/^:\//#/;s/^://;s/\// /g;s/^#/\//;p'`
+ shift
+
+ pathcomp=
+ for d
+ do
+ pathcomp="$pathcomp$d"
+ case "$pathcomp" in
+ -* ) pathcomp=./$pathcomp ;;
+ esac
+
+ if test ! -d "$pathcomp"; then
+ echo "mkdir $pathcomp"
+
+ mkdir "$pathcomp" || lasterr=$?
+
+ if test ! -d "$pathcomp"; then
+ errstatus=$lasterr
+ fi
+ fi
+
+ pathcomp="$pathcomp/"
+ done
+done
+
+exit $errstatus
+
+# mkinstalldirs ends here
diff --git a/rts/gmp/mp.h b/rts/gmp/mp.h
new file mode 100644
index 0000000000..ffab4cba82
--- /dev/null
+++ b/rts/gmp/mp.h
@@ -0,0 +1,124 @@
+/* mp.h -- Definitions for Berkeley compatible multiple precision functions.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#ifndef __MP_H__
+
+#ifndef __GNU_MP__ /* to allow inclusion of both gmp.h and mp.h */
+#define __GNU_MP__ 3
+#define __need_size_t
+#include <stddef.h>
+#undef __need_size_t
+
+#if defined (__STDC__) || defined (__cplusplus)
+#define __gmp_const const
+#else
+#define __gmp_const
+#endif
+
+#if defined (__GNUC__)
+#define __gmp_inline __inline__
+#else
+#define __gmp_inline
+#endif
+
+#ifndef _EXTERN_INLINE
+#ifdef __GNUC__
+#define _EXTERN_INLINE extern __inline__
+#else
+#define _EXTERN_INLINE static
+#endif
+#endif
+
+#ifdef _SHORT_LIMB
+typedef unsigned int mp_limb_t;
+typedef int mp_limb_signed_t;
+#else
+#ifdef _LONG_LONG_LIMB
+typedef unsigned long long int mp_limb_t;
+typedef long long int mp_limb_signed_t;
+#else
+typedef unsigned long int mp_limb_t;
+typedef long int mp_limb_signed_t;
+#endif
+#endif
+
+typedef mp_limb_t * mp_ptr;
+typedef __gmp_const mp_limb_t * mp_srcptr;
+typedef int mp_size_t;
+typedef long int mp_exp_t;
+
+typedef struct
+{
+ int _mp_alloc; /* Number of *limbs* allocated and pointed
+ to by the D field. */
+ int _mp_size; /* abs(SIZE) is the number of limbs
+ the last field points to. If SIZE
+ is negative this is a negative
+ number. */
+ mp_limb_t *_mp_d; /* Pointer to the limbs. */
+} __mpz_struct;
+#endif /* __GNU_MP__ */
+
+/* User-visible types. */
+typedef __mpz_struct MINT;
+
+
+#ifndef _PROTO
+#if (__STDC__-0) || defined (__cplusplus)
+#define _PROTO(x) x
+#else
+#define _PROTO(x) ()
+#endif
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#define mp_set_memory_functions __gmp_set_memory_functions
+void mp_set_memory_functions _PROTO ((void *(*) (size_t),
+ void *(*) (void *, size_t, size_t),
+ void (*) (void *, size_t)));
+MINT *itom _PROTO ((signed short int));
+MINT *xtom _PROTO ((const char *));
+void move _PROTO ((const MINT *, MINT *));
+void madd _PROTO ((const MINT *, const MINT *, MINT *));
+void msub _PROTO ((const MINT *, const MINT *, MINT *));
+void mult _PROTO ((const MINT *, const MINT *, MINT *));
+void mdiv _PROTO ((const MINT *, const MINT *, MINT *, MINT *));
+void sdiv _PROTO ((const MINT *, signed short int, MINT *, signed short int *));
+void msqrt _PROTO ((const MINT *, MINT *, MINT *));
+void pow _PROTO ((const MINT *, const MINT *, const MINT *, MINT *));
+void rpow _PROTO ((const MINT *, signed short int, MINT *));
+void gcd _PROTO ((const MINT *, const MINT *, MINT *));
+int mcmp _PROTO ((const MINT *, const MINT *));
+void min _PROTO ((MINT *));
+void mout _PROTO ((const MINT *));
+char *mtox _PROTO ((const MINT *));
+void mfree _PROTO ((MINT *));
+
+#if defined (__cplusplus)
+}
+#endif
+
+#define __MP_H__
+#endif /* __MP_H__ */
diff --git a/rts/gmp/mp_bpl.c b/rts/gmp/mp_bpl.c
new file mode 100644
index 0000000000..df8b03e5ab
--- /dev/null
+++ b/rts/gmp/mp_bpl.c
@@ -0,0 +1,27 @@
+/*
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+const int mp_bits_per_limb = BITS_PER_MP_LIMB;
+const int __gmp_0 = 0;
+int __gmp_junk;
diff --git a/rts/gmp/mp_clz_tab.c b/rts/gmp/mp_clz_tab.c
new file mode 100644
index 0000000000..1bbd1d6a66
--- /dev/null
+++ b/rts/gmp/mp_clz_tab.c
@@ -0,0 +1,36 @@
+/* __clz_tab -- support for longlong.h
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+const
+unsigned char __clz_tab[] =
+{
+ 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+ 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+};
diff --git a/rts/gmp/mp_minv_tab.c b/rts/gmp/mp_minv_tab.c
new file mode 100644
index 0000000000..4afff85cfc
--- /dev/null
+++ b/rts/gmp/mp_minv_tab.c
@@ -0,0 +1,50 @@
+/* A table of data supporting modlimb_invert().
+
+ THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND MAY CHANGE
+ INCOMPATIBLY OR DISAPPEAR IN A FUTURE GNU MP RELEASE. */
+
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* modlimb_invert_table[i] is the multiplicative inverse of 2*i+1 mod 256,
+ ie. (modlimb_invert_table[i] * (2*i+1)) % 256 == 1 */
+
+const unsigned char modlimb_invert_table[128] = {
+ 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
+ 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
+ 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
+ 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
+ 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
+ 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
+ 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
+ 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
+ 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
+ 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
+ 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
+ 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
+ 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
+ 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
+ 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
+ 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
+};
diff --git a/rts/gmp/mp_set_fns.c b/rts/gmp/mp_set_fns.c
new file mode 100644
index 0000000000..55d4d9d6e4
--- /dev/null
+++ b/rts/gmp/mp_set_fns.c
@@ -0,0 +1,48 @@
+/* mp_set_memory_functions -- Set the allocate, reallocate, and free functions
+ for use by the mp package.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mp_set_memory_functions (void *(*alloc_func) (size_t),
+ void *(*realloc_func) (void *, size_t, size_t),
+ void (*free_func) (void *, size_t))
+#else
+mp_set_memory_functions (alloc_func, realloc_func, free_func)
+ void *(*alloc_func) ();
+ void *(*realloc_func) ();
+ void (*free_func) ();
+#endif
+{
+ if (alloc_func == 0)
+ alloc_func = _mp_default_allocate;
+ if (realloc_func == 0)
+ realloc_func = _mp_default_reallocate;
+ if (free_func == 0)
+ free_func = _mp_default_free;
+
+ _mp_allocate_func = alloc_func;
+ _mp_reallocate_func = realloc_func;
+ _mp_free_func = free_func;
+}
diff --git a/rts/gmp/mpn/Makefile.am b/rts/gmp/mpn/Makefile.am
new file mode 100644
index 0000000000..1c49ccda25
--- /dev/null
+++ b/rts/gmp/mpn/Makefile.am
@@ -0,0 +1,94 @@
+## Process this file with automake to generate Makefile.in
+
+# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+AUTOMAKE_OPTIONS = gnu no-dependencies
+SUBDIRS = tests
+
+CPP = @CPP@
+
+# -DOPERATION_$* tells multi-function files which function to produce.
+INCLUDES = -I$(top_srcdir) -DOPERATION_$*
+
+GENERIC_SOURCES = mp_bases.c
+OFILES = @mpn_objects@
+
+noinst_LTLIBRARIES = libmpn.la
+libmpn_la_SOURCES = $(GENERIC_SOURCES)
+libmpn_la_LIBADD = $(OFILES)
+libmpn_la_DEPENDENCIES = $(OFILES)
+
+TARG_DIST = a29k alpha arm clipper cray generic hppa i960 lisp m68k m88k \
+ mips2 mips3 ns32k pa64 pa64w power powerpc32 powerpc64 pyr sh sparc32 \
+ sparc64 thumb vax x86 z8000 z8000x
+
+EXTRA_DIST = underscore.h asm-defs.m4 $(TARG_DIST)
+
+# COMPILE minus CC. FIXME: Really pass *_CFLAGS to CPP?
+COMPILE_FLAGS = \
+ $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+
+SUFFIXES = .s .S .asm
+
+# *.s are not preprocessed at all.
+.s.o:
+ $(CCAS) $(COMPILE_FLAGS) $<
+.s.obj:
+ $(CCAS) $(COMPILE_FLAGS) `cygpath -w $<`
+.s.lo:
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) $<
+
+# *.S are preprocessed with CPP.
+.S.o:
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.S.obj:
+ $(CPP) $(COMPILE_FLAGS) `cygpath -w $<` | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+
+# We have to rebuild the static object file without passing -DPIC to
+# preprocessor. The overhead cost is one extra assemblation. FIXME:
+# Teach libtool how to assemble with a preprocessor pass (CPP or m4).
+
+.S.lo:
+ $(CPP) $(COMPILE_FLAGS) -DPIC $< | grep -v '^#' >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
+
+# *.m4 are preprocessed with m4.
+.asm.o:
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.obj:
+ $(M4) -DOPERATION_$* `cygpath -w $<` >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.lo:
+ $(M4) -DPIC -DOPERATION_$* $< >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
diff --git a/rts/gmp/mpn/Makefile.in b/rts/gmp/mpn/Makefile.in
new file mode 100644
index 0000000000..59ee958c92
--- /dev/null
+++ b/rts/gmp/mpn/Makefile.in
@@ -0,0 +1,472 @@
+# Makefile.in generated automatically by automake 1.4a from Makefile.am
+
+# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+SHELL = @SHELL@
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+
+bindir = @bindir@
+sbindir = @sbindir@
+libexecdir = @libexecdir@
+datadir = @datadir@
+sysconfdir = @sysconfdir@
+sharedstatedir = @sharedstatedir@
+localstatedir = @localstatedir@
+libdir = @libdir@
+infodir = @infodir@
+mandir = @mandir@
+includedir = @includedir@
+oldincludedir = /usr/include
+
+DESTDIR =
+
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+
+top_builddir = ..
+
+ACLOCAL = @ACLOCAL@
+AUTOCONF = @AUTOCONF@
+AUTOMAKE = @AUTOMAKE@
+AUTOHEADER = @AUTOHEADER@
+
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_FLAG =
+transform = @program_transform_name@
+
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+
+@SET_MAKE@
+build_alias = @build_alias@
+build_triplet = @build@
+host_alias = @host_alias@
+host_triplet = @host@
+target_alias = @target_alias@
+target_triplet = @target@
+AMDEP = @AMDEP@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AWK = @AWK@
+CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@
+CC = @CC@
+CCAS = @CCAS@
+CPP = @CPP@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+EXEEXT = @EXEEXT@
+LIBTOOL = @LIBTOOL@
+LN_S = @LN_S@
+M4 = @M4@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+RANLIB = @RANLIB@
+SPEED_CYCLECOUNTER_OBJS = @SPEED_CYCLECOUNTER_OBJS@
+STRIP = @STRIP@
+U = @U@
+VERSION = @VERSION@
+gmp_srclinks = @gmp_srclinks@
+install_sh = @install_sh@
+mpn_objects = @mpn_objects@
+mpn_objs_in_libgmp = @mpn_objs_in_libgmp@
+
+# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+AUTOMAKE_OPTIONS = gnu no-dependencies
+SUBDIRS =
+
+CPP = @CPP@
+
+# -DOPERATION_$* tells multi-function files which function to produce.
+INCLUDES = -I$(top_srcdir) -DOPERATION_$*
+
+GENERIC_SOURCES = mp_bases.c
+OFILES = @mpn_objects@
+
+noinst_LTLIBRARIES = libmpn.la
+libmpn_la_SOURCES = $(GENERIC_SOURCES)
+libmpn_la_LIBADD = $(OFILES)
+libmpn_la_DEPENDENCIES = $(OFILES)
+
+TARG_DIST = a29k alpha arm clipper cray generic hppa i960 lisp m68k m88k \
+ mips2 mips3 ns32k pa64 pa64w power powerpc32 powerpc64 pyr sh sparc32 \
+ sparc64 thumb vax x86 z8000 z8000x
+
+
+EXTRA_DIST = underscore.h asm-defs.m4 $(TARG_DIST)
+
+# COMPILE minus CC. FIXME: Really pass *_CFLAGS to CPP?
+COMPILE_FLAGS = \
+ $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+
+
+SUFFIXES = .s .S .asm
+subdir = mpn
+mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
+CONFIG_HEADER = ../config.h
+CONFIG_CLEAN_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+
+
+DEFS = @DEFS@ -I. -I$(srcdir) -I..
+CPPFLAGS = @CPPFLAGS@
+LDFLAGS = @LDFLAGS@
+LIBS = @LIBS@
+libmpn_la_LDFLAGS =
+am_libmpn_la_OBJECTS = mp_bases.lo
+libmpn_la_OBJECTS = $(am_libmpn_la_OBJECTS)
+COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CFLAGS = @CFLAGS@
+CCLD = $(CC)
+LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+DIST_SOURCES = $(libmpn_la_SOURCES)
+DIST_COMMON = README Makefile.am Makefile.in
+
+
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+
+GZIP_ENV = --best
+depcomp =
+SOURCES = $(libmpn_la_SOURCES)
+OBJECTS = $(am_libmpn_la_OBJECTS)
+
+all: all-redirect
+.SUFFIXES:
+.SUFFIXES: .S .asm .c .lo .o .obj .s
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4)
+ cd $(top_srcdir) && $(AUTOMAKE) --gnu mpn/Makefile
+
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ cd $(top_builddir) \
+ && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+
+mostlyclean-noinstLTLIBRARIES:
+
+clean-noinstLTLIBRARIES:
+ -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+
+distclean-noinstLTLIBRARIES:
+
+maintainer-clean-noinstLTLIBRARIES:
+
+mostlyclean-compile:
+ -rm -f *.o core *.core
+ -rm -f *.$(OBJEXT)
+
+clean-compile:
+
+distclean-compile:
+ -rm -f *.tab.c
+
+maintainer-clean-compile:
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+distclean-libtool:
+
+maintainer-clean-libtool:
+
+libmpn.la: $(libmpn_la_OBJECTS) $(libmpn_la_DEPENDENCIES)
+ $(LINK) $(libmpn_la_LDFLAGS) $(libmpn_la_OBJECTS) $(libmpn_la_LIBADD) $(LIBS)
+.c.o:
+ $(COMPILE) -c $<
+.c.obj:
+ $(COMPILE) -c `cygpath -w $<`
+.c.lo:
+ $(LTCOMPILE) -c -o $@ $<
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+# (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+
+all-recursive install-data-recursive install-exec-recursive \
+installdirs-recursive install-recursive uninstall-recursive \
+check-recursive installcheck-recursive info-recursive dvi-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ target=`echo $@ | sed s/-recursive//`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ dot_seen=yes; \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done; \
+ if test "$$dot_seen" = "no"; then \
+ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+ fi; test -z "$$fail"
+
+mostlyclean-recursive clean-recursive distclean-recursive \
+maintainer-clean-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ rev=''; list='$(SUBDIRS)'; for subdir in $$list; do \
+ rev="$$subdir $$rev"; \
+ if test "$$subdir" = "."; then dot_seen=yes; else :; fi; \
+ done; \
+ test "$$dot_seen" = "no" && rev=". $$rev"; \
+ target=`echo $@ | sed s/-recursive//`; \
+ for subdir in $$rev; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done && test -z "$$fail"
+tags-recursive:
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+ done
+
+tags: TAGS
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ mkid -f$$here/ID $$unique $(LISP)
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ tags=; \
+ here=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ if test "$$subdir" = .; then :; else \
+ test -f $$subdir/TAGS && tags="$$tags -i $$here/$$subdir/TAGS"; \
+ fi; \
+ done; \
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \
+ || etags $(ETAGS_ARGS) $$tags $$unique $(LISP)
+
+mostlyclean-tags:
+
+clean-tags:
+
+distclean-tags:
+ -rm -f TAGS ID
+
+maintainer-clean-tags:
+
+distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir)
+
+distdir: $(DISTFILES)
+ @for file in $(DISTFILES); do \
+ d=$(srcdir); \
+ if test -d $$d/$$file; then \
+ cp -pR $$d/$$file $(distdir); \
+ else \
+ test -f $(distdir)/$$file \
+ || cp -p $$d/$$file $(distdir)/$$file || :; \
+ fi; \
+ done
+ for subdir in $(SUBDIRS); do \
+ if test "$$subdir" = .; then :; else \
+ test -d $(distdir)/$$subdir \
+ || mkdir $(distdir)/$$subdir \
+ || exit 1; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(top_distdir) distdir=../$(distdir)/$$subdir distdir) \
+ || exit 1; \
+ fi; \
+ done
+info-am:
+info: info-recursive
+dvi-am:
+dvi: dvi-recursive
+check-am: all-am
+check: check-recursive
+installcheck-am:
+installcheck: installcheck-recursive
+install-exec-am:
+install-exec: install-exec-recursive
+
+install-data-am:
+install-data: install-data-recursive
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+install: install-recursive
+uninstall-am:
+uninstall: uninstall-recursive
+all-am: Makefile $(LTLIBRARIES)
+all-redirect: all-recursive
+install-strip:
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_STRIP_FLAG=-s install
+installdirs: installdirs-recursive
+installdirs-am:
+
+
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+ -rm -f Makefile $(CONFIG_CLEAN_FILES)
+ -rm -f config.cache config.log stamp-h stamp-h[0-9]*
+
+maintainer-clean-generic:
+ -rm -f Makefile.in
+mostlyclean-am: mostlyclean-noinstLTLIBRARIES mostlyclean-compile \
+ mostlyclean-libtool mostlyclean-tags \
+ mostlyclean-generic
+
+mostlyclean: mostlyclean-recursive
+
+clean-am: clean-noinstLTLIBRARIES clean-compile clean-libtool \
+ clean-tags clean-generic mostlyclean-am
+
+clean: clean-recursive
+
+distclean-am: distclean-noinstLTLIBRARIES distclean-compile \
+ distclean-libtool distclean-tags distclean-generic \
+ clean-am
+ -rm -f libtool
+
+distclean: distclean-recursive
+
+maintainer-clean-am: maintainer-clean-noinstLTLIBRARIES \
+ maintainer-clean-compile maintainer-clean-libtool \
+ maintainer-clean-tags maintainer-clean-generic \
+ distclean-am
+ @echo "This command is intended for maintainers to use;"
+ @echo "it deletes files that may require special tools to rebuild."
+
+maintainer-clean: maintainer-clean-recursive
+
+.PHONY: mostlyclean-noinstLTLIBRARIES distclean-noinstLTLIBRARIES \
+clean-noinstLTLIBRARIES maintainer-clean-noinstLTLIBRARIES \
+mostlyclean-compile distclean-compile clean-compile \
+maintainer-clean-compile mostlyclean-libtool distclean-libtool \
+clean-libtool maintainer-clean-libtool install-recursive \
+uninstall-recursive install-data-recursive uninstall-data-recursive \
+install-exec-recursive uninstall-exec-recursive installdirs-recursive \
+uninstalldirs-recursive all-recursive check-recursive \
+installcheck-recursive info-recursive dvi-recursive \
+mostlyclean-recursive distclean-recursive clean-recursive \
+maintainer-clean-recursive tags tags-recursive mostlyclean-tags \
+distclean-tags clean-tags maintainer-clean-tags distdir info-am info \
+dvi-am dvi check check-am installcheck-am installcheck install-exec-am \
+install-exec install-data-am install-data install-am install \
+uninstall-am uninstall all-redirect all-am all install-strip \
+installdirs-am installdirs mostlyclean-generic distclean-generic \
+clean-generic maintainer-clean-generic clean mostlyclean distclean \
+maintainer-clean
+
+
+# *.s are not preprocessed at all.
+.s.o:
+ $(CCAS) $(COMPILE_FLAGS) $<
+.s.obj:
+ $(CCAS) $(COMPILE_FLAGS) `cygpath -w $<`
+.s.lo:
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) $<
+
+# *.S are preprocessed with CPP.
+.S.o:
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.S.obj:
+ $(CPP) $(COMPILE_FLAGS) `cygpath -w $<` | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+
+# We have to rebuild the static object file without passing -DPIC to
+# preprocessor. The overhead cost is one extra assemblation. FIXME:
+# Teach libtool how to assemble with a preprocessor pass (CPP or m4).
+
+.S.lo:
+ $(CPP) $(COMPILE_FLAGS) -DPIC $< | grep -v '^#' >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
+
+# *.m4 are preprocessed with m4.
+.asm.o:
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.obj:
+ $(M4) -DOPERATION_$* `cygpath -w $<` >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.lo:
+ $(M4) -DPIC -DOPERATION_$* $< >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/rts/gmp/mpn/README b/rts/gmp/mpn/README
new file mode 100644
index 0000000000..7453c9d03e
--- /dev/null
+++ b/rts/gmp/mpn/README
@@ -0,0 +1,13 @@
+This directory contains all code for the mpn layer of GMP.
+
+Most subdirectories contain machine-dependent code, written in assembly or C.
+The `generic' subdirectory contains default code, used when there is no
+machine-dependent replacement for a particular machine.
+
+There is one subdirectory for each ISA family. Note that e.g., 32-bit SPARC
+and 64-bit SPARC are very different ISA's, and thus cannot share any code.
+
+A particular compile will only use code from one subdirectory, and the
+`generic' subdirectory. The ISA-specific subdirectories contain hierachies of
+directories for various architecture variants and implementations; the
+top-most level contains code that runs correctly on all variants.
diff --git a/rts/gmp/mpn/a29k/add_n.s b/rts/gmp/mpn/a29k/add_n.s
new file mode 100644
index 0000000000..e3ee6dfa60
--- /dev/null
+++ b/rts/gmp/mpn/a29k/add_n.s
@@ -0,0 +1,120 @@
+; 29000 __gmpn_add -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; s2_ptr lr4
+; size lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs/per iteration, until less than 8 limbs remain.
+
+; The 29k has no addition or subtraction instructions that doesn't
+; affect carry, so we need to save and restore that as soon as we
+; adjust the pointers. gr116 is used for this purpose. Note that
+; gr116==0 means that carry should be set.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_add_n
+ .word 0x60000
+___gmpn_add_n:
+ srl gr117,lr5,3
+ sub gr118,gr117,1
+ jmpt gr118,Ltail
+ constn gr116,-1 ; init cy reg
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+ mtsrim cr,(8-1)
+ loadm 0,0,gr104,lr4
+ add lr4,lr4,32
+
+ subr gr116,gr116,0 ; restore carry
+ addc gr96,gr96,gr104
+ addc gr97,gr97,gr105
+ addc gr98,gr98,gr106
+ addc gr99,gr99,gr107
+ addc gr100,gr100,gr108
+ addc gr101,gr101,gr109
+ addc gr102,gr102,gr110
+ addc gr103,gr103,gr111
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+; Code for the last up-to-7 limbs.
+; This code might look very strange, but it's hard to write it
+; differently without major slowdown.
+
+ and lr5,lr5,(8-1)
+Ltail: sub gr118,lr5,1 ; count for CR
+ jmpt gr118,Lend
+ sub gr117,lr5,2 ; count for jmpfdec
+
+ mtsr cr,gr118
+ loadm 0,0,gr96,lr3
+ mtsr cr,gr118
+ loadm 0,0,gr104,lr4
+
+ subr gr116,gr116,0 ; restore carry
+
+ jmpfdec gr117,L1
+ addc gr96,gr96,gr104
+ jmp Lstore
+ mtsr cr,gr118
+L1: jmpfdec gr117,L2
+ addc gr97,gr97,gr105
+ jmp Lstore
+ mtsr cr,gr118
+L2: jmpfdec gr117,L3
+ addc gr98,gr98,gr106
+ jmp Lstore
+ mtsr cr,gr118
+L3: jmpfdec gr117,L4
+ addc gr99,gr99,gr107
+ jmp Lstore
+ mtsr cr,gr118
+L4: jmpfdec gr117,L5
+ addc gr100,gr100,gr108
+ jmp Lstore
+ mtsr cr,gr118
+L5: jmpfdec gr117,L6
+ addc gr101,gr101,gr109
+ jmp Lstore
+ mtsr cr,gr118
+L6: addc gr102,gr102,gr110
+
+Lstore: storem 0,0,gr96,lr2
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+Lend: jmpi lr0
+ add gr96,gr116,1
diff --git a/rts/gmp/mpn/a29k/addmul_1.s b/rts/gmp/mpn/a29k/addmul_1.s
new file mode 100644
index 0000000000..f51b6d7af6
--- /dev/null
+++ b/rts/gmp/mpn/a29k/addmul_1.s
@@ -0,0 +1,113 @@
+; 29000 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+; add the product to a second limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_addmul_1
+ .word 0x60000
+___gmpn_addmul_1:
+ sub lr4,lr4,8
+ jmpt lr4,Ltail
+ const gr120,0 ; init cylimb reg
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+
+ multiplu gr104,gr96,lr5
+ multmu gr96,gr96,lr5
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120
+ addc gr105,gr105,gr96
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0
+
+ mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr2
+
+ add gr104,gr96,gr104
+ addc gr105,gr97,gr105
+ addc gr106,gr98,gr106
+ addc gr107,gr99,gr107
+ addc gr108,gr100,gr108
+ addc gr109,gr101,gr109
+ addc gr110,gr102,gr110
+ addc gr111,gr103,gr111
+ addc gr120,gr120,0
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr104,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+Ltail: and lr4,lr4,(8-1)
+ sub gr118,lr4,1 ; count for CR
+ jmpt gr118,Lend
+ sub lr4,lr4,2
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+Loop2: load 0,0,gr116,lr3
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5
+ multmu gr118,gr116,lr5
+ add lr2,lr2,4
+ load 0,0,gr119,lr2
+ add gr117,gr117,gr120
+ addc gr118,gr118,0
+ add gr117,gr117,gr119
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0
+
+Lend: jmpi lr0
+ or gr96,gr120,0 ; copy
diff --git a/rts/gmp/mpn/a29k/lshift.s b/rts/gmp/mpn/a29k/lshift.s
new file mode 100644
index 0000000000..93e1917127
--- /dev/null
+++ b/rts/gmp/mpn/a29k/lshift.s
@@ -0,0 +1,93 @@
+; 29000 __gmpn_lshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; s2_ptr lr4
+; size lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs/per iteration, until less than 8 limbs remain.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_lshift
+ .word 0x60000
+___gmpn_lshift:
+ sll gr116,lr4,2
+ add lr3,gr116,lr3
+ add lr2,gr116,lr2
+ sub lr3,lr3,4
+ load 0,0,gr119,lr3
+
+ subr gr116,lr5,32
+ srl gr96,gr119,gr116 ; return value
+ sub lr4,lr4,1 ; actual loop count is SIZE - 1
+
+ srl gr117,lr4,3 ; chuck count = (actual count) / 8
+ cpeq gr118,gr117,0
+ jmpt gr118,Ltail
+ mtsr fc,lr5
+
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: sub lr3,lr3,32
+ mtsrim cr,(8-1)
+ loadm 0,0,gr100,lr3
+
+ extract gr109,gr119,gr107
+ extract gr108,gr107,gr106
+ extract gr107,gr106,gr105
+ extract gr106,gr105,gr104
+ extract gr105,gr104,gr103
+ extract gr104,gr103,gr102
+ extract gr103,gr102,gr101
+ extract gr102,gr101,gr100
+
+ sub lr2,lr2,32
+ mtsrim cr,(8-1)
+ storem 0,0,gr102,lr2
+ jmpfdec gr117,Loop
+ or gr119,gr100,0
+
+; Code for the last up-to-7 limbs.
+
+ and lr4,lr4,(8-1)
+Ltail: cpeq gr118,lr4,0
+ jmpt gr118,Lend
+ sub lr4,lr4,2 ; count for jmpfdec
+
+Loop2: sub lr3,lr3,4
+ load 0,0,gr116,lr3
+ extract gr117,gr119,gr116
+ sub lr2,lr2,4
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ or gr119,gr116,0
+
+Lend: extract gr117,gr119,0
+ sub lr2,lr2,4
+ jmpi lr0
+ store 0,0,gr117,lr2
diff --git a/rts/gmp/mpn/a29k/mul_1.s b/rts/gmp/mpn/a29k/mul_1.s
new file mode 100644
index 0000000000..6bcf7ce0cf
--- /dev/null
+++ b/rts/gmp/mpn/a29k/mul_1.s
@@ -0,0 +1,97 @@
+; 29000 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_mul_1
+ .word 0x60000
+___gmpn_mul_1:
+ sub lr4,lr4,8
+ jmpt lr4,Ltail
+ const gr120,0 ; init cylimb reg
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+
+ multiplu gr104,gr96,lr5
+ multmu gr96,gr96,lr5
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120
+ addc gr105,gr105,gr96
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr104,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+Ltail: and lr4,lr4,(8-1)
+ sub gr118,lr4,1 ; count for CR
+ jmpt gr118,Lend
+ sub lr4,lr4,2
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+Loop2: load 0,0,gr116,lr3
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5
+ multmu gr118,gr116,lr5
+ add lr2,lr2,4
+ add gr117,gr117,gr120
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0
+
+Lend: jmpi lr0
+ or gr96,gr120,0 ; copy
diff --git a/rts/gmp/mpn/a29k/rshift.s b/rts/gmp/mpn/a29k/rshift.s
new file mode 100644
index 0000000000..ea163bff2b
--- /dev/null
+++ b/rts/gmp/mpn/a29k/rshift.s
@@ -0,0 +1,89 @@
+; 29000 __gmpn_rshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; s2_ptr lr4
+; size lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs/per iteration, until less than 8 limbs remain.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_rshift
+ .word 0x60000
+___gmpn_rshift:
+ load 0,0,gr119,lr3
+ add lr3,lr3,4
+
+ subr gr116,lr5,32
+ sll gr96,gr119,gr116 ; return value
+ sub lr4,lr4,1 ; actual loop count is SIZE - 1
+
+ srl gr117,lr4,3 ; chuck count = (actual count) / 8
+ cpeq gr118,gr117,0
+ jmpt gr118,Ltail
+ mtsr fc,gr116
+
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr100,lr3
+ add lr3,lr3,32
+
+ extract gr98,gr100,gr119
+ extract gr99,gr101,gr100
+ extract gr100,gr102,gr101
+ extract gr101,gr103,gr102
+ extract gr102,gr104,gr103
+ extract gr103,gr105,gr104
+ extract gr104,gr106,gr105
+ extract gr105,gr107,gr106
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr98,lr2
+ add lr2,lr2,32
+ jmpfdec gr117,Loop
+ or gr119,gr107,0
+
+; Code for the last up-to-7 limbs.
+
+ and lr4,lr4,(8-1)
+Ltail: cpeq gr118,lr4,0
+ jmpt gr118,Lend
+ sub lr4,lr4,2 ; count for jmpfdec
+
+Loop2: load 0,0,gr100,lr3
+ add lr3,lr3,4
+ extract gr117,gr100,gr119
+ store 0,0,gr117,lr2
+ add lr2,lr2,4
+ jmpfdec lr4,Loop2
+ or gr119,gr100,0
+
+Lend: srl gr117,gr119,lr5
+ jmpi lr0
+ store 0,0,gr117,lr2
diff --git a/rts/gmp/mpn/a29k/sub_n.s b/rts/gmp/mpn/a29k/sub_n.s
new file mode 100644
index 0000000000..c6b64c5bee
--- /dev/null
+++ b/rts/gmp/mpn/a29k/sub_n.s
@@ -0,0 +1,120 @@
+; 29000 __gmpn_sub -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; s2_ptr lr4
+; size lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs/per iteration, until less than 8 limbs remain.
+
+; The 29k has no addition or subtraction instructions that doesn't
+; affect carry, so we need to save and restore that as soon as we
+; adjust the pointers. gr116 is used for this purpose. Note that
+; gr116==0 means that carry should be set.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_sub_n
+ .word 0x60000
+___gmpn_sub_n:
+ srl gr117,lr5,3
+ sub gr118,gr117,1
+ jmpt gr118,Ltail
+ constn gr116,-1 ; init cy reg
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+ mtsrim cr,(8-1)
+ loadm 0,0,gr104,lr4
+ add lr4,lr4,32
+
+ subr gr116,gr116,0 ; restore carry
+ subc gr96,gr96,gr104
+ subc gr97,gr97,gr105
+ subc gr98,gr98,gr106
+ subc gr99,gr99,gr107
+ subc gr100,gr100,gr108
+ subc gr101,gr101,gr109
+ subc gr102,gr102,gr110
+ subc gr103,gr103,gr111
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+; Code for the last up-to-7 limbs.
+; This code might look very strange, but it's hard to write it
+; differently without major slowdown.
+
+ and lr5,lr5,(8-1)
+Ltail: sub gr118,lr5,1 ; count for CR
+ jmpt gr118,Lend
+ sub gr117,lr5,2 ; count for jmpfdec
+
+ mtsr cr,gr118
+ loadm 0,0,gr96,lr3
+ mtsr cr,gr118
+ loadm 0,0,gr104,lr4
+
+ subr gr116,gr116,0 ; restore carry
+
+ jmpfdec gr117,L1
+ subc gr96,gr96,gr104
+ jmp Lstore
+ mtsr cr,gr118
+L1: jmpfdec gr117,L2
+ subc gr97,gr97,gr105
+ jmp Lstore
+ mtsr cr,gr118
+L2: jmpfdec gr117,L3
+ subc gr98,gr98,gr106
+ jmp Lstore
+ mtsr cr,gr118
+L3: jmpfdec gr117,L4
+ subc gr99,gr99,gr107
+ jmp Lstore
+ mtsr cr,gr118
+L4: jmpfdec gr117,L5
+ subc gr100,gr100,gr108
+ jmp Lstore
+ mtsr cr,gr118
+L5: jmpfdec gr117,L6
+ subc gr101,gr101,gr109
+ jmp Lstore
+ mtsr cr,gr118
+L6: subc gr102,gr102,gr110
+
+Lstore: storem 0,0,gr96,lr2
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+Lend: jmpi lr0
+ add gr96,gr116,1
diff --git a/rts/gmp/mpn/a29k/submul_1.s b/rts/gmp/mpn/a29k/submul_1.s
new file mode 100644
index 0000000000..ef97d8d4e5
--- /dev/null
+++ b/rts/gmp/mpn/a29k/submul_1.s
@@ -0,0 +1,116 @@
+; 29000 __gmpn_submul_1 -- Multiply a limb vector with a single limb and
+; subtract the product from a second limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_submul_1
+ .word 0x60000
+___gmpn_submul_1:
+ sub lr4,lr4,8
+ jmpt lr4,Ltail
+ const gr120,0 ; init cylimb reg
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+
+ multiplu gr104,gr96,lr5
+ multmu gr96,gr96,lr5
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120
+ addc gr105,gr105,gr96
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0
+
+ mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr2
+
+ sub gr96,gr96,gr104
+ subc gr97,gr97,gr105
+ subc gr98,gr98,gr106
+ subc gr99,gr99,gr107
+ subc gr100,gr100,gr108
+ subc gr101,gr101,gr109
+ subc gr102,gr102,gr110
+ subc gr103,gr103,gr111
+
+ add gr104,gr103,gr111 ; invert carry from previus sub
+ addc gr120,gr120,0
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+Ltail: and lr4,lr4,(8-1)
+ sub gr118,lr4,1 ; count for CR
+ jmpt gr118,Lend
+ sub lr4,lr4,2
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+Loop2: load 0,0,gr116,lr3
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5
+ multmu gr118,gr116,lr5
+ add lr2,lr2,4
+ load 0,0,gr119,lr2
+ add gr117,gr117,gr120
+ addc gr118,gr118,0
+ sub gr119,gr119,gr117
+ add gr104,gr119,gr117 ; invert carry from previus sub
+ store 0,0,gr119,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0
+
+Lend: jmpi lr0
+ or gr96,gr120,0 ; copy
diff --git a/rts/gmp/mpn/a29k/udiv.s b/rts/gmp/mpn/a29k/udiv.s
new file mode 100644
index 0000000000..fdd53a9a88
--- /dev/null
+++ b/rts/gmp/mpn/a29k/udiv.s
@@ -0,0 +1,30 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___udiv_qrnnd
+ .word 0x60000
+___udiv_qrnnd:
+ mtsr q,lr3
+ dividu gr96,lr4,lr5
+ mfsr gr116,q
+ jmpi lr0
+ store 0,0,gr116,lr2
diff --git a/rts/gmp/mpn/a29k/umul.s b/rts/gmp/mpn/a29k/umul.s
new file mode 100644
index 0000000000..7741981167
--- /dev/null
+++ b/rts/gmp/mpn/a29k/umul.s
@@ -0,0 +1,29 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___umul_ppmm
+ .word 0x50000
+___umul_ppmm:
+ multiplu gr116,lr3,lr4
+ multmu gr96,lr3,lr4
+ jmpi lr0
+ store 0,0,gr116,lr2
diff --git a/rts/gmp/mpn/alpha/README b/rts/gmp/mpn/alpha/README
new file mode 100644
index 0000000000..744260c7c5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/README
@@ -0,0 +1,224 @@
+This directory contains mpn functions optimized for DEC Alpha processors.
+
+ALPHA ASSEMBLY RULES AND REGULATIONS
+
+The `.prologue N' pseudo op marks the end of instruction that needs
+special handling by unwinding. It also says whether $27 is really
+needed for computing the gp. The `.mask M' pseudo op says which
+registers are saved on the stack, and at what offset in the frame.
+
+Cray code is very very different...
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+EV4
+
+1. This chip has very limited store bandwidth. The on-chip L1 cache is
+ write-through, and a cache line is transfered from the store buffer to
+ the off-chip L2 in as much 15 cycles on most systems. This delay hurts
+ mpn_add_n, mpn_sub_n, mpn_lshift, and mpn_rshift.
+
+2. Pairing is possible between memory instructions and integer arithmetic
+ instructions.
+
+3. mulq and umulh are documented to have a latency of 23 cycles, but 2 of
+ these cycles are pipelined. Thus, multiply instructions can be issued at
+ a rate of one each 21st cycle.
+
+EV5
+
+1. The memory bandwidth of this chip seems excellent, both for loads and
+ stores. Even when the working set is larger than the on-chip L1 and L2
+ caches, the performance remain almost unaffected.
+
+2. mulq has a latency of 12 cycles and an issue rate of 1 each 8th cycle.
+ umulh has a measured latency of 14 cycles and an issue rate of 1 each
+ 10th cycle. But the exact timing is somewhat confusing.
+
+3. mpn_add_n. With 4-fold unrolling, we need 37 instructions, whereof 12
+ are memory operations. This will take at least
+ ceil(37/2) [dual issue] + 1 [taken branch] = 19 cycles
+ We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data
+ cache cycles, which should be completely hidden in the 19 issue cycles.
+ The computation is inherently serial, with these dependencies:
+
+ ldq ldq
+ \ /\
+ (or) addq |
+ |\ / \ |
+ | addq cmpult
+ \ | |
+ cmpult |
+ \ /
+ or
+
+ I.e., 3 operations are needed between carry-in and carry-out, making 12
+ cycles the absolute minimum for the 4 limbs. We could replace the `or'
+ with a cmoveq/cmovne, which could issue one cycle earlier that the `or',
+ but that might waste a cycle on EV4. The total depth remain unaffected,
+ since cmov has a latency of 2 cycles.
+
+ addq
+ / \
+ addq cmpult
+ | \
+ cmpult -> cmovne
+
+Montgomery has a slightly different way of computing carry that requires one
+less instruction, but has depth 4 (instead of the current 3). Since the
+code is currently instruction issue bound, Montgomery's idea should save us
+1/2 cycle per limb, or bring us down to a total of 17 cycles or 4.25
+cycles/limb. Unfortunately, this method will not be good for the EV6.
+
+EV6
+
+Here we have a really parallel pipeline, capable of issuing up to 4 integer
+instructions per cycle. One integer multiply instruction can issue each
+cycle. To get optimal speed, we need to pretend we are vectorizing the code,
+i.e., minimize the iterative dependencies.
+
+There are two dependencies to watch out for. 1) Address arithmetic
+dependencies, and 2) carry propagation dependencies.
+
+We can avoid serializing due to address arithmetic by unrolling the loop, so
+that addresses don't depend heavily on an index variable. Avoiding
+serializing because of carry propagation is trickier; the ultimate performance
+of the code will be determined of the number of latency cycles it takes from
+accepting carry-in to a vector point until we can generate carry-out.
+
+Most integer instructions can execute in either the L0, U0, L1, or U1
+pipelines. Shifts only execute in U0 and U1, and multiply only in U1.
+
+CMOV instructions split into two internal instructions, CMOV1 and CMOV2, but
+the execute efficiently. But CMOV split the mapping process (see pg 2-26 in
+cmpwrgd.pdf), suggesting the CMOV should always be placed as the last
+instruction of an aligned 4 instruction block (?).
+
+Perhaps the most important issue is the latency between the L0/U0 and L1/U1
+clusters; a result obtained on either cluster has an extra cycle of latency
+for consumers in the opposite cluster. Because of the dynamic nature of the
+implementation, it is hard to predict where an instruction will execute.
+
+The shift loops need (per limb):
+ 1 load (Lx pipes)
+ 1 store (Lx pipes)
+ 2 shift (Ux pipes)
+ 1 iaddlog (Lx pipes, Ux pipes)
+Obviously, since the pipes are very equally loaded, we should get 4 insn/cycle, or 1.25 cycles/limb.
+
+For mpn_add_n, we currently have
+ 2 load (Lx pipes)
+ 1 store (Lx pipes)
+ 5 iaddlog (Lx pipes, Ux pipes)
+
+Again, we have a perfect balance and will be limited by carry propagation
+delays, currently three cycles. The superoptimizer indicates that ther
+might be sequences that--using a final cmov--have a carry propagation delay
+of just two. Montgomery's subtraction sequence could perhaps be used, by
+complementing some operands. All in all, we should get down to 2 cycles
+without much problems.
+
+For mpn_mul_1, we could do, just like for mpn_add_n:
+ not newlo,notnewlo
+ addq cylimb,newlo,newlo || cmpult cylimb,notnewlo,cyout
+ addq cyout,newhi,cylimb
+and get 2-cycle carry propagation. The instructions needed will be
+ 1 ld (Lx pipes)
+ 1 st (Lx pipes)
+ 2 mul (U1 pipe)
+ 4 iaddlog (Lx pipes, Ux pipes)
+issue1: addq not mul ld
+issue2: cmpult addq mul st
+Conclusion: no cluster delays and 2-cycle carry delays will give us 2 cycles/limb!
+
+Last, we have mpn_addmul_1. Almost certainly, we will get down to 3
+cycles/limb, which would be absolutely awesome.
+
+Old, perhaps obsolete addmul_1 dependency diagram (needs 175 columns wide screen):
+
+ i
+ s
+ s i
+ u n
+ e s
+ d t
+ r
+ i u
+l n c
+i s t
+v t i
+e r o
+ u n
+v c
+a t t
+l i y
+u o p
+e n e
+s s s
+ issue
+ in
+ cycle
+ -1 ldq
+ / \
+ 0 | \
+ | \
+ 1 | |
+ | |
+ 2 | | ldq
+ | | / \
+ 3 | mulq | \
+ | \ | \
+ 4 umulh \ | |
+ | | | |
+ 5 | | | | ldq
+ | | | | / \
+ 4calm 6 | | ldq | mulq | \
+ | | / | \ | \
+ 4casm 7 | | / umulh \ | |
+6 | || | | | |
+ 3aal 8 | || | | | | ldq
+7 | || | | | | / \
+ 4calm 9 | || | | ldq | mulq | \
+9 | || | | / | \ | \
+ 4casm 10 | || | | / umulh \ | |
+9 | || | || | | | |
+ 3aal 11 | addq | || | | | | ldq
+9 | // \ | || | | | | / \
+ 4calm 12 \ cmpult addq<-cy | || | | ldq | mulq | \
+13 \ / // \ | || | | / | \ | \
+ 4casm 13 addq cmpult stq | || | | / umulh \ | |
+11 \ / | || | || | | | |
+ 3aal 14 addq | addq | || | | | | ldq
+10 \ | // \ | || | | | | / \
+ 4calm 15 cy ----> \ cmpult addq<-cy | || | | ldq | mulq | \
+13 \ / // \ | || | | / | \ | \
+ 4casm 16 addq cmpult stq | || | | / umulh \ | |
+11 \ / | || | || | | | |
+ 3aal 17 addq | addq | || | | | |
+10 \ | // \ | || | | | |
+ 4calm 18 cy ----> \ cmpult addq<-cy | || | | ldq | mulq
+13 \ / // \ | || | | / | \
+ 4casm 19 addq cmpult stq | || | | / umulh \
+11 \ / | || | || | |
+ 3aal 20 addq | addq | || | |
+10 \ | // \ | || | |
+ 4calm 21 cy ----> \ cmpult addq<-cy | || | | ldq
+ \ / // \ | || | | /
+ 22 addq cmpult stq | || | | /
+ \ / | || | ||
+ 23 addq | addq | ||
+ \ | // \ | ||
+ 24 cy ----> \ cmpult addq<-cy | ||
+ \ / // \ | ||
+ 25 addq cmpult stq | ||
+ \ / | ||
+ 26 addq | addq
+ \ | // \
+ 27 cy ----> \ cmpult addq<-cy
+ \ / // \
+ 28 addq cmpult stq
+ \ /
+As many as 6 consecutive points will be under execution simultaneously, or if we addq
+schedule loads even further away, maybe 7 or 8. But the number of live quantities \
+is reasonable, and can easily be satisfied. cy ---->
diff --git a/rts/gmp/mpn/alpha/add_n.asm b/rts/gmp/mpn/alpha/add_n.asm
new file mode 100644
index 0000000000..08d6a9f7b8
--- /dev/null
+++ b/rts/gmp/mpn/alpha/add_n.asm
@@ -0,0 +1,114 @@
+dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ ldq r3,0(r17)
+ ldq r4,0(r18)
+
+ subq r19,1,r19
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0
+ beq r2,$L0 C if multiple of 4 limbs, skip first loop
+
+ subq r19,r2,r19
+
+$Loop0: subq r2,1,r2
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ addq r17,8,r17
+ addq r18,8,r18
+ bis r5,r5,r3
+ bis r6,r6,r4
+ addq r16,8,r16
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend
+
+ ALIGN(8)
+$Loop: subq r19,4,r19
+
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ ldq r3,16(r17)
+ addq r6,r0,r6
+ ldq r4,16(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17)
+ addq r4,r0,r4
+ ldq r6,24(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17)
+ addq r6,r0,r6
+ ldq r4,32(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17
+ addq r18,32,r18
+ addq r16,32,r16
+ bne r19,$Loop
+
+$Lend: addq r4,r0,r4
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_add_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/addmul_1.asm b/rts/gmp/mpn/alpha/addmul_1.asm
new file mode 100644
index 0000000000..4ea900be6b
--- /dev/null
+++ b/rts/gmp/mpn/alpha/addmul_1.asm
@@ -0,0 +1,87 @@
+dnl Alpha __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ addq r5,r3,r3
+ cmpult r3,r5,r4
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r0,r5,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/cntlz.asm b/rts/gmp/mpn/alpha/cntlz.asm
new file mode 100644
index 0000000000..febb3b70d9
--- /dev/null
+++ b/rts/gmp/mpn/alpha/cntlz.asm
@@ -0,0 +1,68 @@
+dnl Alpha auxiliary for longlong.h's count_leading_zeros
+
+dnl Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl DISCUSSION:
+
+dnl Other methods have been tried, and using a 128-entry table actually trims
+dnl about 10% of the execution time (on a 21164) when the table is in the L1
+dnl cache. But under non-benchmarking conditions, the table will hardly be in
+dnl the L1 cache. Tricky bit-fiddling methods with multiplies and magic tables
+dnl are also possible, but they require many more instructions than the current
+dnl code. (But for count_trailing_zeros, such tricks are beneficial.)
+dnl Finally, converting to floating-point and extracting the exponent is much
+dnl slower.
+
+ASM_START()
+PROLOGUE(MPN(count_leading_zeros))
+ bis r31,63,r0 C initialize partial result count
+
+ srl r16,32,r1 C shift down 32 steps -> r1
+ cmovne r1,r1,r16 C select r1 if non-zero
+ cmovne r1,31,r0 C if r1 is nonzero choose smaller count
+
+ srl r16,16,r1 C shift down 16 steps -> r1
+ subq r0,16,r2 C generate new partial result count
+ cmovne r1,r1,r16 C choose new r1 if non-zero
+ cmovne r1,r2,r0 C choose new count if r1 was non-zero
+
+ srl r16,8,r1
+ subq r0,8,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,4,r1
+ subq r0,4,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,2,r1
+ subq r0,2,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,1,r1 C extract bit 1
+ subq r0,r1,r0 C subtract it from partial result
+
+ ret r31,(r26),1
+EPILOGUE(MPN(count_leading_zeros))
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/default.m4 b/rts/gmp/mpn/alpha/default.m4
new file mode 100644
index 0000000000..5f4c48dc73
--- /dev/null
+++ b/rts/gmp/mpn/alpha/default.m4
@@ -0,0 +1,77 @@
+divert(-1)
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+define(`ASM_START',
+ `
+ .set noreorder
+ .set noat')
+
+define(`X',`0x$1')
+define(`FLOAT64',
+ `
+ .align 3
+$1: .t_floating $2')
+
+define(`PROLOGUE',
+ `
+ .text
+ .align 3
+ .globl $1
+ .ent $1
+$1:
+ .frame r30,0,r26
+ .prologue 0')
+
+define(`PROLOGUE_GP',
+ `
+ .text
+ .align 3
+ .globl $1
+ .ent $1
+$1:
+ ldgp r29,0(r27)
+ .frame r30,0,r26
+ .prologue 1')
+
+define(`EPILOGUE',
+ `
+ .end $1')
+
+dnl Map register names r0, r1, etc, to `$0', `$1', etc.
+dnl This is needed on all systems but Unicos
+forloop(i,0,31,
+`define(`r'i,``$''i)'
+)
+forloop(i,0,31,
+`define(`f'i,``$f''i)'
+)
+
+define(`DATASTART',
+ `dnl
+ DATA
+$1:')
+define(`DATAEND',`dnl')
+
+define(`ASM_END',`dnl')
+
+divert
diff --git a/rts/gmp/mpn/alpha/ev5/add_n.asm b/rts/gmp/mpn/alpha/ev5/add_n.asm
new file mode 100644
index 0000000000..716d6404ae
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/add_n.asm
@@ -0,0 +1,143 @@
+dnl Alpha EV5 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18)
+ addq r0,r4,r20 C 1st main add
+ ldq r3,24(r18)
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17)
+ cmpult r20,r0,r25 C compute cy from last add
+ ldq r7,-8(r17)
+ addq r1,r5,r28 C 2nd main add
+ addq r18,32,r18 C update s2_ptr
+ addq r28,r25,r21 C 2nd carry add
+ cmpult r28,r5,r8 C compute cy from last add
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r21,r28,r25 C compute cy from last add
+ ldq r0,0(r18)
+ bis r8,r25,r25 C combine cy from the two adds
+ ldq r1,8(r18)
+ addq r2,r6,r28 C 3rd main add
+ ldq r4,0(r17)
+ addq r28,r25,r22 C 3rd carry add
+ ldq r5,8(r17)
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16)
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ addq r0,r4,r28 C 1st main add
+ ldq r2,16(r18)
+ addq r25,r28,r20 C 1st carry add
+ ldq r3,24(r18)
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r6,-16(r17)
+ cmpult r20,r28,r25 C compute cy from last add
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two adds
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16)
+ addq r1,r5,r28 C 2nd main add
+ stq r23,-8(r16)
+ addq r25,r28,r21 C 2nd carry add
+ addq r18,32,r18 C update s2_ptr
+ cmpult r28,r5,r8 C compute cy from last add
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r21,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r2,r6,r28 C 3rd main add
+ addq r28,r25,r22 C 3rd carry add
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16)
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16)
+ stq r23,-8(r16)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: addq r0,r4,r28 C main add
+ ldq r0,8(r18)
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r4,8(r17)
+ addq r28,r25,r20 C carry add
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r20,r28,r25 C compute cy from last add
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: addq r0,r4,r28 C main add
+ addq r28,r25,r20 C carry add
+ cmpult r28,r4,r8 C compute cy from last add
+ cmpult r20,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_add_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/lshift.asm b/rts/gmp/mpn/alpha/ev5/lshift.asm
new file mode 100644
index 0000000000..cb181dda66
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/lshift.asm
@@ -0,0 +1,169 @@
+dnl Alpha EV5 __gmpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 3.25 cycles/limb on the EV5.
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ s8addq r18,r17,r17 C make r17 point at end of s1
+ ldq r4,-8(r17) C load first limb
+ subq r31,r19,r20
+ s8addq r18,r16,r16 C make r16 point at end of RES
+ subq r18,1,r18
+ and r18,4-1,r28 C number of limbs in first loop
+ srl r4,r20,r0 C compute function result
+
+ beq r28,$L0
+ subq r18,r28,r18
+
+ ALIGN(8)
+$Loop0: ldq r3,-16(r17)
+ subq r16,8,r16
+ sll r4,r19,r5
+ subq r17,8,r17
+ subq r28,1,r28
+ srl r3,r20,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,0(r16)
+ bne r28,$Loop0
+
+$L0: sll r4,r19,r24
+ beq r18,$Lend
+C warm up phase 1
+ ldq r1,-16(r17)
+ subq r18,4,r18
+ ldq r2,-24(r17)
+ ldq r3,-32(r17)
+ ldq r4,-40(r17)
+ beq r18,$Lend1
+C warm up phase 2
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ ldq r1,-48(r17)
+ sll r2,r19,r22
+ ldq r2,-56(r17)
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ ldq r3,-64(r17)
+ sll r4,r19,r24
+ ldq r4,-72(r17)
+ subq r18,4,r18
+ beq r18,$Lend2
+ ALIGN(16)
+C main loop
+$Loop: stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+
+ srl r1,r20,r7
+ subq r18,4,r18
+ sll r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ srl r2,r20,r8
+ ldq r1,-80(r17)
+ sll r2,r19,r22
+ ldq r2,-88(r17)
+
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+
+ srl r3,r20,r5
+ unop C ldq r31,-96(r17)
+ sll r3,r19,r23
+ subq r16,32,r16
+
+ srl r4,r20,r6
+ ldq r3,-96(r17)
+ sll r4,r19,r24
+ ldq r4,-104(r17)
+
+ subq r17,32,r17
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+ srl r3,r20,r5
+ sll r3,r19,r23
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 2/2
+ stq r7,-40(r16)
+ bis r5,r22,r5
+ stq r8,-48(r16)
+ bis r6,r23,r6
+ stq r5,-56(r16)
+ stq r6,-64(r16)
+C cool down phase 2/3
+ stq r24,-72(r16)
+ ret r31,(r26),1
+
+C cool down phase 1/1
+$Lend1: srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 1/2
+ stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ stq r5,-24(r16)
+ stq r6,-32(r16)
+ stq r24,-40(r16)
+ ret r31,(r26),1
+
+$Lend: stq r24,-8(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/rshift.asm b/rts/gmp/mpn/alpha/ev5/rshift.asm
new file mode 100644
index 0000000000..9940d83fad
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/rshift.asm
@@ -0,0 +1,167 @@
+dnl Alpha EV5 __gmpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 3.25 cycles/limb on the EV5.
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldq r4,0(r17) C load first limb
+ subq r31,r19,r20
+ subq r18,1,r18
+ and r18,4-1,r28 C number of limbs in first loop
+ sll r4,r20,r0 C compute function result
+
+ beq r28,$L0
+ subq r18,r28,r18
+
+ ALIGN(8)
+$Loop0: ldq r3,8(r17)
+ addq r16,8,r16
+ srl r4,r19,r5
+ addq r17,8,r17
+ subq r28,1,r28
+ sll r3,r20,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,-8(r16)
+ bne r28,$Loop0
+
+$L0: srl r4,r19,r24
+ beq r18,$Lend
+C warm up phase 1
+ ldq r1,8(r17)
+ subq r18,4,r18
+ ldq r2,16(r17)
+ ldq r3,24(r17)
+ ldq r4,32(r17)
+ beq r18,$Lend1
+C warm up phase 2
+ sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ ldq r1,40(r17)
+ srl r2,r19,r22
+ ldq r2,48(r17)
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ ldq r3,56(r17)
+ srl r4,r19,r24
+ ldq r4,64(r17)
+ subq r18,4,r18
+ beq r18,$Lend2
+ ALIGN(16)
+C main loop
+$Loop: stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+
+ sll r1,r20,r7
+ subq r18,4,r18
+ srl r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ sll r2,r20,r8
+ ldq r1,72(r17)
+ srl r2,r19,r22
+ ldq r2,80(r17)
+
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+
+ sll r3,r20,r5
+ unop C ldq r31,-96(r17)
+ srl r3,r19,r23
+ addq r16,32,r16
+
+ sll r4,r20,r6
+ ldq r3,88(r17)
+ srl r4,r19,r24
+ ldq r4,96(r17)
+
+ addq r17,32,r17
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+ sll r3,r20,r5
+ srl r3,r19,r23
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 2/2
+ stq r7,32(r16)
+ bis r5,r22,r5
+ stq r8,40(r16)
+ bis r6,r23,r6
+ stq r5,48(r16)
+ stq r6,56(r16)
+C cool down phase 2/3
+ stq r24,64(r16)
+ ret r31,(r26),1
+
+C cool down phase 1/1
+$Lend1: sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 1/2
+ stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ stq r5,16(r16)
+ stq r6,24(r16)
+ stq r24,32(r16)
+ ret r31,(r26),1
+
+$Lend: stq r24,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/sub_n.asm b/rts/gmp/mpn/alpha/ev5/sub_n.asm
new file mode 100644
index 0000000000..5248a2aa38
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/sub_n.asm
@@ -0,0 +1,143 @@
+dnl Alpha EV5 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18)
+ subq r4,r0,r20 C 1st main subtract
+ ldq r3,24(r18)
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17)
+ cmpult r4,r0,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ subq r5,r1,r28 C 2nd main subtract
+ addq r18,32,r18 C update s2_ptr
+ subq r28,r25,r21 C 2nd carry subtract
+ cmpult r5,r1,r8 C compute cy from last subtract
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r0,0(r18)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ ldq r1,8(r18)
+ subq r6,r2,r28 C 3rd main subtract
+ ldq r4,0(r17)
+ subq r28,r25,r22 C 3rd carry subtract
+ ldq r5,8(r17)
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+ subq r7,r3,r28 C 4th main subtract
+ subq r28,r25,r23 C 4th carry subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ subq r4,r0,r28 C 1st main subtract
+ ldq r2,16(r18)
+ subq r28,r25,r20 C 1st carry subtract
+ ldq r3,24(r18)
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r6,-16(r17)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16)
+ subq r5,r1,r28 C 2nd main subtract
+ stq r23,-8(r16)
+ subq r28,r25,r21 C 2nd carry subtract
+ addq r18,32,r18 C update s2_ptr
+ cmpult r5,r1,r8 C compute cy from last subtract
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r6,r2,r28 C cy add
+ subq r28,r25,r22 C 3rd main subtract
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+ subq r7,r3,r28 C cy add
+ subq r28,r25,r23 C 4th main subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16)
+ stq r23,-8(r16)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: subq r4,r0,r28 C main subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r0,8(r18)
+ ldq r4,8(r17)
+ subq r28,r25,r20 C carry subtract
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: subq r4,r0,r28 C main subtract
+ subq r28,r25,r20 C carry subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_sub_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev6/addmul_1.asm b/rts/gmp/mpn/alpha/ev6/addmul_1.asm
new file mode 100644
index 0000000000..2f588626a5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev6/addmul_1.asm
@@ -0,0 +1,474 @@
+dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
+dnl exactly 3.625 cycles/limb on EV6...
+
+dnl This code was written in close cooperation with ev6 pipeline expert
+dnl Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
+dnl
+dnl Register usages for unrolled loop:
+dnl 0-3 mul's
+dnl 4-7 acc's
+dnl 8-15 mul results
+dnl 20,21 carry's
+dnl 22,23 save for stores
+
+dnl Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+
+dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
+dnl them, so that further disturbance to the schedule is damped.
+
+dnl We couldn't pair the loads, because the entangled schedule of the
+dnl carry's has to happen on one side {0} of the machine. Note, the total
+dnl use of U0, and the total use of L0 (after attending to the stores).
+dnl which is part of the reason why....
+
+dnl This is a great schedule for the d_cache, a poor schedule for the
+dnl b_cache. The lockup on U0 means that any stall can't be recovered
+dnl from. Consider a ldq in L1. say that load gets stalled because it
+dnl collides with a fill from the b_Cache. On the next cycle, this load
+dnl gets priority. If first looks at L0, and goes there. The instruction
+dnl we intended for L0 gets to look at L1, which is NOT where we want
+dnl it. It either stalls 1, because it can't go in L0, or goes there, and
+dnl causes a further instruction to stall.
+
+dnl So for b_cache, we're likely going to want to put one or more cycles
+dnl back into the code! And, of course, put in prefetches. For the
+dnl accumulator, lds, intent to modify. For the multiplier, you might
+dnl want ldq, evict next, if you're not wanting to use it again soon. Use
+dnl 256 ahead of present pointer value. At a place where we have an mt
+dnl followed by a bookkeeping, put the bookkeeping in upper, and the
+dnl prefetch into lower.
+
+dnl Note, the usage of physical registers per cycle is smoothed off, as
+dnl much as possible.
+
+dnl Note, the ldq's and stq's are at the end of the quadpacks. note, we'd
+dnl like not to have a ldq or stq to preceded a conditional branch in a
+dnl quadpack. The conditional branch moves the retire pointer one cycle
+dnl later.
+
+dnl Optimization notes:
+dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
+dnl Reserved regs: r29 r30 r31
+dnl Free caller-saves regs in unrolled code: r24 r25 r28
+dnl We should swap some of the callee-saves regs for some of the free
+dnl caller-saves regs, saving some overhead cycles.
+dnl Most importantly, we should write fast code for the 0-7 case.
+dnl The code we use there are for the 21164, and runs at 7 cycles/limb
+dnl on the 21264. Should not be hard, if we write specialized code for
+dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+dnl need a jump table indexed by the low 3 bits of the count argument.
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ cmpult r18, 8, r1
+ beq r1, $Large
+
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r18, 1, r18 C size--
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ umulh r2, r19, r0 C r0 = prod_high
+ beq r18, $Lend0b C jump if size was == 1
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r18, 1, r18 C size--
+ addq r5, r3, r3
+ cmpult r3, r5, r4
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ beq r18, $Lend0a C jump if size was == 2
+
+ ALIGN(8)
+$Loop0: mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ subq r18, 1, r18 C size--
+ umulh r2, r19, r4 C r4 = cy_limb
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ bne r18, $Loop0
+$Lend0a:
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ umulh r2, r19, r4 C r4 = cy_limb
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r5, r0, r0 C combine carries
+ addq r4, r0, r0 C cy_limb = prod_high + cy
+ ret r31, (r26), 1
+$Lend0b:
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r0, r5, r0
+ ret r31, (r26), 1
+
+$Large:
+ lda $30, -240($30)
+ stq $9, 8($30)
+ stq $10, 16($30)
+ stq $11, 24($30)
+ stq $12, 32($30)
+ stq $13, 40($30)
+ stq $14, 48($30)
+ stq $15, 56($30)
+
+ and r18, 7, r20 C count for the first loop, 0-7
+ srl r18, 3, r18 C count for unrolled loop
+ bis r31, r31, r0
+ beq r20, $Lunroll
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r20, 1, r20 C size--
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ umulh r2, r19, r0 C r0 = prod_high
+ beq r20, $Lend1b C jump if size was == 1
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r20, 1, r20 C size--
+ addq r5, r3, r3
+ cmpult r3, r5, r4
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ beq r20, $Lend1a C jump if size was == 2
+
+ ALIGN(8)
+$Loop1: mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ subq r20, 1, r20 C size--
+ umulh r2, r19, r4 C r4 = cy_limb
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ bne r20, $Loop1
+
+$Lend1a:
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ umulh r2, r19, r4 C r4 = cy_limb
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ addq r4, r0, r0 C cy_limb = prod_high + cy
+ br r31, $Lunroll
+$Lend1b:
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r0, r5, r0
+
+$Lunroll:
+ lda r17, -16(r17) C L1 bookkeeping
+ lda r16, -16(r16) C L1 bookkeeping
+ bis r0, r31, r12
+
+C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldq r2, 16(r17) C L1
+ ldq r3, 24(r17) C L1
+ lda r18, -1(r18) C L1 bookkeeping
+ ldq r6, 16(r16) C L1
+ ldq r7, 24(r16) C L1
+ ldq r0, 32(r17) C L1
+ mulq r19, r2, r13 C U1
+ ldq r1, 40(r17) C L1
+ umulh r19, r2, r14 C U1
+ mulq r19, r3, r15 C U1
+ lda r17, 64(r17) C L1 bookkeeping
+ ldq r4, 32(r16) C L1
+ ldq r5, 40(r16) C L1
+ umulh r19, r3, r8 C U1
+ ldq r2, -16(r17) C L1
+ mulq r19, r0, r9 C U1
+ ldq r3, -8(r17) C L1
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ mulq r19, r1, r11 C U1
+ cmpult r6, r13, r20 C L0 lo add => carry
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, -16(r16) C L1
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, -8(r16) C L1
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 0(r17) C L1
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 8(r17) C L1
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C L0 lo + acc
+ stq r22, -48(r16) C L0
+ stq r23, -40(r16) C L1
+ mulq r19, r3, r15 C U1
+ addq r8, r21, r8 C U0 hi mul + carry
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ ble r18, $Lend C U1 bookkeeping
+
+C ____ MAIN UNROLLED LOOP ____
+ ALIGN(16)
+$Loop:
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 0(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 8(r16) C L1
+
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ ldq r2, 16(r17) C L1
+
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ ldq r3, 24(r17) C L1
+
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, -32(r16) C L0
+ stq r23, -24(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r1, r11 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r12, r21, r12 C U0 hi mul + carry
+
+ cmpult r6, r13, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r18, -1(r18) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, 16(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, 24(r16) C L1
+
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 32(r17) C L1
+
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 40(r17) C L1
+
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C U0 lo + acc
+ stq r22, -16(r16) C L0
+ stq r23, -8(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r3, r15 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C L0 hi mul + carry
+
+ cmpult r4, r9, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r17, 64(r17) C L1 bookkeeping
+ addq r4, r8, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 32(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 40(r16) C L1
+
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ ldq r2, -16(r17) C L1
+
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ ldq r3, -8(r17) C L1
+
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, 0(r16) C L0
+ stq r23, 8(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r1, r11 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r12, r21, r12 C U0 hi mul + carry
+
+ cmpult r6, r13, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, -16(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, -8(r16) C L1
+
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 0(r17) C L1
+
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 8(r17) C L1
+
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C L0 lo + acc
+ stq r22, -48(r16) C L0
+ stq r23, -40(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r3, r15 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ bis r31, r31, r31 C L1 mt
+ bgt r18, $Loop C U1 bookkeeping
+
+C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 0(r16) C L1
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 8(r16) C L1
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, -32(r16) C L0
+ stq r23, -24(r16) C L1
+ mulq r19, r1, r11 C U1
+ addq r12, r21, r12 C U0 hi mul + carry
+ cmpult r6, r13, r20 C L0 lo add => carry
+ addq r6, r12, r22 C U0 hi add => answer
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ addq r4, r9, r4 C U0 lo + acc
+ stq r22, -16(r16) C L0
+ stq r23, -8(r16) C L1
+ bis r31, r31, r31 C L0 st slosh
+ addq r8, r21, r8 C L0 hi mul + carry
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ stq r22, 0(r16) C L0
+ stq r23, 8(r16) C L1
+ addq r12, r21, r0 C U0 hi mul + carry
+
+ ldq $9, 8($30)
+ ldq $10, 16($30)
+ ldq $11, 24($30)
+ ldq $12, 32($30)
+ ldq $13, 40($30)
+ ldq $14, 48($30)
+ ldq $15, 56($30)
+ lda $30, 240($30)
+ ret r31, (r26), 1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev6/gmp-mparam.h b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h
new file mode 100644
index 0000000000..7ea20577f8
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h
@@ -0,0 +1,62 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Generated by tuneup.c, 2000-08-02. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 47
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 70
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 94
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 101
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 33
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 70
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 29
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 33
+#endif
diff --git a/rts/gmp/mpn/alpha/gmp-mparam.h b/rts/gmp/mpn/alpha/gmp-mparam.h
new file mode 100644
index 0000000000..054ff2fe5f
--- /dev/null
+++ b/rts/gmp/mpn/alpha/gmp-mparam.h
@@ -0,0 +1,64 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the 21164 family. The 21264 will require
+ different values, since it has such quick multiplication. */
+/* Generated by tuneup.c, 2000-07-19. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 22
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 53
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 31
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 47
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 64
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 17
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 4
+#endif
diff --git a/rts/gmp/mpn/alpha/invert_limb.asm b/rts/gmp/mpn/alpha/invert_limb.asm
new file mode 100644
index 0000000000..a921b32b3f
--- /dev/null
+++ b/rts/gmp/mpn/alpha/invert_limb.asm
@@ -0,0 +1,345 @@
+dnl Alpha mpn_invert_limb -- Invert a normalized limb.
+
+dnl Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+dnl
+dnl This is based on sophie:/gmp-stuff/dbg-inv-limb.c.
+dnl The ideas are due to Peter L. Montgomery
+dnl
+dnl The table below uses 4096 bytes. The file mentioned above has an
+dnl alternative function that doesn't require the table, but it runs 50%
+dnl slower than this.
+
+include(`../config.m4')
+
+ASM_START()
+
+FLOAT64($C36,9223372036854775808.0) C 2^63
+
+PROLOGUE_GP(mpn_invert_limb)
+ lda r30,-16(r30)
+ addq r16,r16,r1
+ bne r1,$73
+ lda r0,-1
+ br r31,$Lend
+$73:
+ srl r16,1,r1
+ stq r1,0(r30)
+ ldt f11,0(r30)
+ cvtqt f11,f1
+ lda r1,$C36
+ ldt f10,0(r1)
+ divt f10,f1,f10
+ lda r2,$invtab-4096
+ srl r16,52,r1
+ addq r1,r1,r1
+ addq r1,r2,r1
+ bic r1,6,r2
+ ldq r2,0(r2)
+ bic r1,1,r1
+ extwl r2,r1,r2
+ sll r2,48,r0
+ umulh r16,r0,r1
+ addq r16,r1,r3
+ stq r3,0(r30)
+ ldt f11,0(r30)
+ cvtqt f11,f1
+ mult f1,f10,f1
+ cvttqc f1,f1
+ stt f1,0(r30)
+ ldq r4,0(r30)
+ subq r0,r4,r0
+ umulh r16,r0,r1
+ mulq r16,r0,r2
+ addq r16,r1,r3
+ bge r3,$Loop2
+$Loop1: addq r2,r16,r2
+ cmpult r2,r16,r1
+ addq r3,r1,r3
+ addq r0,1,r0
+ blt r3,$Loop1
+$Loop2: cmpult r2,r16,r1
+ subq r0,1,r0
+ subq r3,r1,r3
+ subq r2,r16,r2
+ bge r3,$Loop2
+$Lend:
+ lda r30,16(r30)
+ ret r31,(r26),1
+EPILOGUE(mpn_invert_limb)
+DATASTART(`$invtab',4)
+ .word 0xffff,0xffc0,0xff80,0xff40,0xff00,0xfec0,0xfe81,0xfe41
+ .word 0xfe01,0xfdc2,0xfd83,0xfd43,0xfd04,0xfcc5,0xfc86,0xfc46
+ .word 0xfc07,0xfbc8,0xfb8a,0xfb4b,0xfb0c,0xfacd,0xfa8e,0xfa50
+ .word 0xfa11,0xf9d3,0xf994,0xf956,0xf918,0xf8d9,0xf89b,0xf85d
+ .word 0xf81f,0xf7e1,0xf7a3,0xf765,0xf727,0xf6ea,0xf6ac,0xf66e
+ .word 0xf631,0xf5f3,0xf5b6,0xf578,0xf53b,0xf4fd,0xf4c0,0xf483
+ .word 0xf446,0xf409,0xf3cc,0xf38f,0xf352,0xf315,0xf2d8,0xf29c
+ .word 0xf25f,0xf222,0xf1e6,0xf1a9,0xf16d,0xf130,0xf0f4,0xf0b8
+ .word 0xf07c,0xf03f,0xf003,0xefc7,0xef8b,0xef4f,0xef14,0xeed8
+ .word 0xee9c,0xee60,0xee25,0xede9,0xedae,0xed72,0xed37,0xecfb
+ .word 0xecc0,0xec85,0xec4a,0xec0e,0xebd3,0xeb98,0xeb5d,0xeb22
+ .word 0xeae8,0xeaad,0xea72,0xea37,0xe9fd,0xe9c2,0xe988,0xe94d
+ .word 0xe913,0xe8d8,0xe89e,0xe864,0xe829,0xe7ef,0xe7b5,0xe77b
+ .word 0xe741,0xe707,0xe6cd,0xe694,0xe65a,0xe620,0xe5e6,0xe5ad
+ .word 0xe573,0xe53a,0xe500,0xe4c7,0xe48d,0xe454,0xe41b,0xe3e2
+ .word 0xe3a9,0xe370,0xe336,0xe2fd,0xe2c5,0xe28c,0xe253,0xe21a
+ .word 0xe1e1,0xe1a9,0xe170,0xe138,0xe0ff,0xe0c7,0xe08e,0xe056
+ .word 0xe01e,0xdfe5,0xdfad,0xdf75,0xdf3d,0xdf05,0xdecd,0xde95
+ .word 0xde5d,0xde25,0xdded,0xddb6,0xdd7e,0xdd46,0xdd0f,0xdcd7
+ .word 0xdca0,0xdc68,0xdc31,0xdbf9,0xdbc2,0xdb8b,0xdb54,0xdb1d
+ .word 0xdae6,0xdaae,0xda78,0xda41,0xda0a,0xd9d3,0xd99c,0xd965
+ .word 0xd92f,0xd8f8,0xd8c1,0xd88b,0xd854,0xd81e,0xd7e8,0xd7b1
+ .word 0xd77b,0xd745,0xd70e,0xd6d8,0xd6a2,0xd66c,0xd636,0xd600
+ .word 0xd5ca,0xd594,0xd55f,0xd529,0xd4f3,0xd4bd,0xd488,0xd452
+ .word 0xd41d,0xd3e7,0xd3b2,0xd37c,0xd347,0xd312,0xd2dd,0xd2a7
+ .word 0xd272,0xd23d,0xd208,0xd1d3,0xd19e,0xd169,0xd134,0xd100
+ .word 0xd0cb,0xd096,0xd061,0xd02d,0xcff8,0xcfc4,0xcf8f,0xcf5b
+ .word 0xcf26,0xcef2,0xcebe,0xce89,0xce55,0xce21,0xcded,0xcdb9
+ .word 0xcd85,0xcd51,0xcd1d,0xcce9,0xccb5,0xcc81,0xcc4e,0xcc1a
+ .word 0xcbe6,0xcbb3,0xcb7f,0xcb4c,0xcb18,0xcae5,0xcab1,0xca7e
+ .word 0xca4b,0xca17,0xc9e4,0xc9b1,0xc97e,0xc94b,0xc918,0xc8e5
+ .word 0xc8b2,0xc87f,0xc84c,0xc819,0xc7e7,0xc7b4,0xc781,0xc74f
+ .word 0xc71c,0xc6e9,0xc6b7,0xc684,0xc652,0xc620,0xc5ed,0xc5bb
+ .word 0xc589,0xc557,0xc524,0xc4f2,0xc4c0,0xc48e,0xc45c,0xc42a
+ .word 0xc3f8,0xc3c7,0xc395,0xc363,0xc331,0xc300,0xc2ce,0xc29c
+ .word 0xc26b,0xc239,0xc208,0xc1d6,0xc1a5,0xc174,0xc142,0xc111
+ .word 0xc0e0,0xc0af,0xc07e,0xc04d,0xc01c,0xbfeb,0xbfba,0xbf89
+ .word 0xbf58,0xbf27,0xbef6,0xbec5,0xbe95,0xbe64,0xbe33,0xbe03
+ .word 0xbdd2,0xbda2,0xbd71,0xbd41,0xbd10,0xbce0,0xbcb0,0xbc80
+ .word 0xbc4f,0xbc1f,0xbbef,0xbbbf,0xbb8f,0xbb5f,0xbb2f,0xbaff
+ .word 0xbacf,0xba9f,0xba6f,0xba40,0xba10,0xb9e0,0xb9b1,0xb981
+ .word 0xb951,0xb922,0xb8f2,0xb8c3,0xb894,0xb864,0xb835,0xb806
+ .word 0xb7d6,0xb7a7,0xb778,0xb749,0xb71a,0xb6eb,0xb6bc,0xb68d
+ .word 0xb65e,0xb62f,0xb600,0xb5d1,0xb5a2,0xb574,0xb545,0xb516
+ .word 0xb4e8,0xb4b9,0xb48a,0xb45c,0xb42e,0xb3ff,0xb3d1,0xb3a2
+ .word 0xb374,0xb346,0xb318,0xb2e9,0xb2bb,0xb28d,0xb25f,0xb231
+ .word 0xb203,0xb1d5,0xb1a7,0xb179,0xb14b,0xb11d,0xb0f0,0xb0c2
+ .word 0xb094,0xb067,0xb039,0xb00b,0xafde,0xafb0,0xaf83,0xaf55
+ .word 0xaf28,0xaefb,0xaecd,0xaea0,0xae73,0xae45,0xae18,0xadeb
+ .word 0xadbe,0xad91,0xad64,0xad37,0xad0a,0xacdd,0xacb0,0xac83
+ .word 0xac57,0xac2a,0xabfd,0xabd0,0xaba4,0xab77,0xab4a,0xab1e
+ .word 0xaaf1,0xaac5,0xaa98,0xaa6c,0xaa40,0xaa13,0xa9e7,0xa9bb
+ .word 0xa98e,0xa962,0xa936,0xa90a,0xa8de,0xa8b2,0xa886,0xa85a
+ .word 0xa82e,0xa802,0xa7d6,0xa7aa,0xa77e,0xa753,0xa727,0xa6fb
+ .word 0xa6d0,0xa6a4,0xa678,0xa64d,0xa621,0xa5f6,0xa5ca,0xa59f
+ .word 0xa574,0xa548,0xa51d,0xa4f2,0xa4c6,0xa49b,0xa470,0xa445
+ .word 0xa41a,0xa3ef,0xa3c4,0xa399,0xa36e,0xa343,0xa318,0xa2ed
+ .word 0xa2c2,0xa297,0xa26d,0xa242,0xa217,0xa1ed,0xa1c2,0xa197
+ .word 0xa16d,0xa142,0xa118,0xa0ed,0xa0c3,0xa098,0xa06e,0xa044
+ .word 0xa01a,0x9fef,0x9fc5,0x9f9b,0x9f71,0x9f47,0x9f1c,0x9ef2
+ .word 0x9ec8,0x9e9e,0x9e74,0x9e4b,0x9e21,0x9df7,0x9dcd,0x9da3
+ .word 0x9d79,0x9d50,0x9d26,0x9cfc,0x9cd3,0x9ca9,0x9c80,0x9c56
+ .word 0x9c2d,0x9c03,0x9bda,0x9bb0,0x9b87,0x9b5e,0x9b34,0x9b0b
+ .word 0x9ae2,0x9ab9,0x9a8f,0x9a66,0x9a3d,0x9a14,0x99eb,0x99c2
+ .word 0x9999,0x9970,0x9947,0x991e,0x98f6,0x98cd,0x98a4,0x987b
+ .word 0x9852,0x982a,0x9801,0x97d8,0x97b0,0x9787,0x975f,0x9736
+ .word 0x970e,0x96e5,0x96bd,0x9695,0x966c,0x9644,0x961c,0x95f3
+ .word 0x95cb,0x95a3,0x957b,0x9553,0x952b,0x9503,0x94db,0x94b3
+ .word 0x948b,0x9463,0x943b,0x9413,0x93eb,0x93c3,0x939b,0x9374
+ .word 0x934c,0x9324,0x92fd,0x92d5,0x92ad,0x9286,0x925e,0x9237
+ .word 0x920f,0x91e8,0x91c0,0x9199,0x9172,0x914a,0x9123,0x90fc
+ .word 0x90d4,0x90ad,0x9086,0x905f,0x9038,0x9011,0x8fea,0x8fc3
+ .word 0x8f9c,0x8f75,0x8f4e,0x8f27,0x8f00,0x8ed9,0x8eb2,0x8e8b
+ .word 0x8e65,0x8e3e,0x8e17,0x8df1,0x8dca,0x8da3,0x8d7d,0x8d56
+ .word 0x8d30,0x8d09,0x8ce3,0x8cbc,0x8c96,0x8c6f,0x8c49,0x8c23
+ .word 0x8bfc,0x8bd6,0x8bb0,0x8b8a,0x8b64,0x8b3d,0x8b17,0x8af1
+ .word 0x8acb,0x8aa5,0x8a7f,0x8a59,0x8a33,0x8a0d,0x89e7,0x89c1
+ .word 0x899c,0x8976,0x8950,0x892a,0x8904,0x88df,0x88b9,0x8893
+ .word 0x886e,0x8848,0x8823,0x87fd,0x87d8,0x87b2,0x878d,0x8767
+ .word 0x8742,0x871d,0x86f7,0x86d2,0x86ad,0x8687,0x8662,0x863d
+ .word 0x8618,0x85f3,0x85ce,0x85a9,0x8583,0x855e,0x8539,0x8514
+ .word 0x84f0,0x84cb,0x84a6,0x8481,0x845c,0x8437,0x8412,0x83ee
+ .word 0x83c9,0x83a4,0x8380,0x835b,0x8336,0x8312,0x82ed,0x82c9
+ .word 0x82a4,0x8280,0x825b,0x8237,0x8212,0x81ee,0x81ca,0x81a5
+ .word 0x8181,0x815d,0x8138,0x8114,0x80f0,0x80cc,0x80a8,0x8084
+ .word 0x8060,0x803c,0x8018,0x7ff4,0x7fd0,0x7fac,0x7f88,0x7f64
+ .word 0x7f40,0x7f1c,0x7ef8,0x7ed4,0x7eb1,0x7e8d,0x7e69,0x7e45
+ .word 0x7e22,0x7dfe,0x7ddb,0x7db7,0x7d93,0x7d70,0x7d4c,0x7d29
+ .word 0x7d05,0x7ce2,0x7cbf,0x7c9b,0x7c78,0x7c55,0x7c31,0x7c0e
+ .word 0x7beb,0x7bc7,0x7ba4,0x7b81,0x7b5e,0x7b3b,0x7b18,0x7af5
+ .word 0x7ad2,0x7aaf,0x7a8c,0x7a69,0x7a46,0x7a23,0x7a00,0x79dd
+ .word 0x79ba,0x7997,0x7975,0x7952,0x792f,0x790c,0x78ea,0x78c7
+ .word 0x78a4,0x7882,0x785f,0x783c,0x781a,0x77f7,0x77d5,0x77b2
+ .word 0x7790,0x776e,0x774b,0x7729,0x7706,0x76e4,0x76c2,0x76a0
+ .word 0x767d,0x765b,0x7639,0x7617,0x75f5,0x75d2,0x75b0,0x758e
+ .word 0x756c,0x754a,0x7528,0x7506,0x74e4,0x74c2,0x74a0,0x747e
+ .word 0x745d,0x743b,0x7419,0x73f7,0x73d5,0x73b4,0x7392,0x7370
+ .word 0x734f,0x732d,0x730b,0x72ea,0x72c8,0x72a7,0x7285,0x7264
+ .word 0x7242,0x7221,0x71ff,0x71de,0x71bc,0x719b,0x717a,0x7158
+ .word 0x7137,0x7116,0x70f5,0x70d3,0x70b2,0x7091,0x7070,0x704f
+ .word 0x702e,0x700c,0x6feb,0x6fca,0x6fa9,0x6f88,0x6f67,0x6f46
+ .word 0x6f26,0x6f05,0x6ee4,0x6ec3,0x6ea2,0x6e81,0x6e60,0x6e40
+ .word 0x6e1f,0x6dfe,0x6dde,0x6dbd,0x6d9c,0x6d7c,0x6d5b,0x6d3a
+ .word 0x6d1a,0x6cf9,0x6cd9,0x6cb8,0x6c98,0x6c77,0x6c57,0x6c37
+ .word 0x6c16,0x6bf6,0x6bd6,0x6bb5,0x6b95,0x6b75,0x6b54,0x6b34
+ .word 0x6b14,0x6af4,0x6ad4,0x6ab4,0x6a94,0x6a73,0x6a53,0x6a33
+ .word 0x6a13,0x69f3,0x69d3,0x69b3,0x6993,0x6974,0x6954,0x6934
+ .word 0x6914,0x68f4,0x68d4,0x68b5,0x6895,0x6875,0x6855,0x6836
+ .word 0x6816,0x67f6,0x67d7,0x67b7,0x6798,0x6778,0x6758,0x6739
+ .word 0x6719,0x66fa,0x66db,0x66bb,0x669c,0x667c,0x665d,0x663e
+ .word 0x661e,0x65ff,0x65e0,0x65c0,0x65a1,0x6582,0x6563,0x6544
+ .word 0x6524,0x6505,0x64e6,0x64c7,0x64a8,0x6489,0x646a,0x644b
+ .word 0x642c,0x640d,0x63ee,0x63cf,0x63b0,0x6391,0x6373,0x6354
+ .word 0x6335,0x6316,0x62f7,0x62d9,0x62ba,0x629b,0x627c,0x625e
+ .word 0x623f,0x6221,0x6202,0x61e3,0x61c5,0x61a6,0x6188,0x6169
+ .word 0x614b,0x612c,0x610e,0x60ef,0x60d1,0x60b3,0x6094,0x6076
+ .word 0x6058,0x6039,0x601b,0x5ffd,0x5fdf,0x5fc0,0x5fa2,0x5f84
+ .word 0x5f66,0x5f48,0x5f2a,0x5f0b,0x5eed,0x5ecf,0x5eb1,0x5e93
+ .word 0x5e75,0x5e57,0x5e39,0x5e1b,0x5dfd,0x5de0,0x5dc2,0x5da4
+ .word 0x5d86,0x5d68,0x5d4a,0x5d2d,0x5d0f,0x5cf1,0x5cd3,0x5cb6
+ .word 0x5c98,0x5c7a,0x5c5d,0x5c3f,0x5c21,0x5c04,0x5be6,0x5bc9
+ .word 0x5bab,0x5b8e,0x5b70,0x5b53,0x5b35,0x5b18,0x5afb,0x5add
+ .word 0x5ac0,0x5aa2,0x5a85,0x5a68,0x5a4b,0x5a2d,0x5a10,0x59f3
+ .word 0x59d6,0x59b8,0x599b,0x597e,0x5961,0x5944,0x5927,0x590a
+ .word 0x58ed,0x58d0,0x58b3,0x5896,0x5879,0x585c,0x583f,0x5822
+ .word 0x5805,0x57e8,0x57cb,0x57ae,0x5791,0x5775,0x5758,0x573b
+ .word 0x571e,0x5702,0x56e5,0x56c8,0x56ac,0x568f,0x5672,0x5656
+ .word 0x5639,0x561c,0x5600,0x55e3,0x55c7,0x55aa,0x558e,0x5571
+ .word 0x5555,0x5538,0x551c,0x5500,0x54e3,0x54c7,0x54aa,0x548e
+ .word 0x5472,0x5456,0x5439,0x541d,0x5401,0x53e5,0x53c8,0x53ac
+ .word 0x5390,0x5374,0x5358,0x533c,0x5320,0x5304,0x52e8,0x52cb
+ .word 0x52af,0x5293,0x5277,0x525c,0x5240,0x5224,0x5208,0x51ec
+ .word 0x51d0,0x51b4,0x5198,0x517c,0x5161,0x5145,0x5129,0x510d
+ .word 0x50f2,0x50d6,0x50ba,0x509f,0x5083,0x5067,0x504c,0x5030
+ .word 0x5015,0x4ff9,0x4fdd,0x4fc2,0x4fa6,0x4f8b,0x4f6f,0x4f54
+ .word 0x4f38,0x4f1d,0x4f02,0x4ee6,0x4ecb,0x4eb0,0x4e94,0x4e79
+ .word 0x4e5e,0x4e42,0x4e27,0x4e0c,0x4df0,0x4dd5,0x4dba,0x4d9f
+ .word 0x4d84,0x4d69,0x4d4d,0x4d32,0x4d17,0x4cfc,0x4ce1,0x4cc6
+ .word 0x4cab,0x4c90,0x4c75,0x4c5a,0x4c3f,0x4c24,0x4c09,0x4bee
+ .word 0x4bd3,0x4bb9,0x4b9e,0x4b83,0x4b68,0x4b4d,0x4b32,0x4b18
+ .word 0x4afd,0x4ae2,0x4ac7,0x4aad,0x4a92,0x4a77,0x4a5d,0x4a42
+ .word 0x4a27,0x4a0d,0x49f2,0x49d8,0x49bd,0x49a3,0x4988,0x496e
+ .word 0x4953,0x4939,0x491e,0x4904,0x48e9,0x48cf,0x48b5,0x489a
+ .word 0x4880,0x4865,0x484b,0x4831,0x4817,0x47fc,0x47e2,0x47c8
+ .word 0x47ae,0x4793,0x4779,0x475f,0x4745,0x472b,0x4711,0x46f6
+ .word 0x46dc,0x46c2,0x46a8,0x468e,0x4674,0x465a,0x4640,0x4626
+ .word 0x460c,0x45f2,0x45d8,0x45be,0x45a5,0x458b,0x4571,0x4557
+ .word 0x453d,0x4523,0x4509,0x44f0,0x44d6,0x44bc,0x44a2,0x4489
+ .word 0x446f,0x4455,0x443c,0x4422,0x4408,0x43ef,0x43d5,0x43bc
+ .word 0x43a2,0x4388,0x436f,0x4355,0x433c,0x4322,0x4309,0x42ef
+ .word 0x42d6,0x42bc,0x42a3,0x428a,0x4270,0x4257,0x423d,0x4224
+ .word 0x420b,0x41f2,0x41d8,0x41bf,0x41a6,0x418c,0x4173,0x415a
+ .word 0x4141,0x4128,0x410e,0x40f5,0x40dc,0x40c3,0x40aa,0x4091
+ .word 0x4078,0x405f,0x4046,0x402d,0x4014,0x3ffb,0x3fe2,0x3fc9
+ .word 0x3fb0,0x3f97,0x3f7e,0x3f65,0x3f4c,0x3f33,0x3f1a,0x3f01
+ .word 0x3ee8,0x3ed0,0x3eb7,0x3e9e,0x3e85,0x3e6c,0x3e54,0x3e3b
+ .word 0x3e22,0x3e0a,0x3df1,0x3dd8,0x3dc0,0x3da7,0x3d8e,0x3d76
+ .word 0x3d5d,0x3d45,0x3d2c,0x3d13,0x3cfb,0x3ce2,0x3cca,0x3cb1
+ .word 0x3c99,0x3c80,0x3c68,0x3c50,0x3c37,0x3c1f,0x3c06,0x3bee
+ .word 0x3bd6,0x3bbd,0x3ba5,0x3b8d,0x3b74,0x3b5c,0x3b44,0x3b2b
+ .word 0x3b13,0x3afb,0x3ae3,0x3acb,0x3ab2,0x3a9a,0x3a82,0x3a6a
+ .word 0x3a52,0x3a3a,0x3a22,0x3a09,0x39f1,0x39d9,0x39c1,0x39a9
+ .word 0x3991,0x3979,0x3961,0x3949,0x3931,0x3919,0x3901,0x38ea
+ .word 0x38d2,0x38ba,0x38a2,0x388a,0x3872,0x385a,0x3843,0x382b
+ .word 0x3813,0x37fb,0x37e3,0x37cc,0x37b4,0x379c,0x3785,0x376d
+ .word 0x3755,0x373e,0x3726,0x370e,0x36f7,0x36df,0x36c8,0x36b0
+ .word 0x3698,0x3681,0x3669,0x3652,0x363a,0x3623,0x360b,0x35f4
+ .word 0x35dc,0x35c5,0x35ae,0x3596,0x357f,0x3567,0x3550,0x3539
+ .word 0x3521,0x350a,0x34f3,0x34db,0x34c4,0x34ad,0x3496,0x347e
+ .word 0x3467,0x3450,0x3439,0x3422,0x340a,0x33f3,0x33dc,0x33c5
+ .word 0x33ae,0x3397,0x3380,0x3368,0x3351,0x333a,0x3323,0x330c
+ .word 0x32f5,0x32de,0x32c7,0x32b0,0x3299,0x3282,0x326c,0x3255
+ .word 0x323e,0x3227,0x3210,0x31f9,0x31e2,0x31cb,0x31b5,0x319e
+ .word 0x3187,0x3170,0x3159,0x3143,0x312c,0x3115,0x30fe,0x30e8
+ .word 0x30d1,0x30ba,0x30a4,0x308d,0x3076,0x3060,0x3049,0x3033
+ .word 0x301c,0x3005,0x2fef,0x2fd8,0x2fc2,0x2fab,0x2f95,0x2f7e
+ .word 0x2f68,0x2f51,0x2f3b,0x2f24,0x2f0e,0x2ef8,0x2ee1,0x2ecb
+ .word 0x2eb4,0x2e9e,0x2e88,0x2e71,0x2e5b,0x2e45,0x2e2e,0x2e18
+ .word 0x2e02,0x2dec,0x2dd5,0x2dbf,0x2da9,0x2d93,0x2d7c,0x2d66
+ .word 0x2d50,0x2d3a,0x2d24,0x2d0e,0x2cf8,0x2ce1,0x2ccb,0x2cb5
+ .word 0x2c9f,0x2c89,0x2c73,0x2c5d,0x2c47,0x2c31,0x2c1b,0x2c05
+ .word 0x2bef,0x2bd9,0x2bc3,0x2bad,0x2b97,0x2b81,0x2b6c,0x2b56
+ .word 0x2b40,0x2b2a,0x2b14,0x2afe,0x2ae8,0x2ad3,0x2abd,0x2aa7
+ .word 0x2a91,0x2a7c,0x2a66,0x2a50,0x2a3a,0x2a25,0x2a0f,0x29f9
+ .word 0x29e4,0x29ce,0x29b8,0x29a3,0x298d,0x2977,0x2962,0x294c
+ .word 0x2937,0x2921,0x290c,0x28f6,0x28e0,0x28cb,0x28b5,0x28a0
+ .word 0x288b,0x2875,0x2860,0x284a,0x2835,0x281f,0x280a,0x27f5
+ .word 0x27df,0x27ca,0x27b4,0x279f,0x278a,0x2774,0x275f,0x274a
+ .word 0x2735,0x271f,0x270a,0x26f5,0x26e0,0x26ca,0x26b5,0x26a0
+ .word 0x268b,0x2676,0x2660,0x264b,0x2636,0x2621,0x260c,0x25f7
+ .word 0x25e2,0x25cd,0x25b8,0x25a2,0x258d,0x2578,0x2563,0x254e
+ .word 0x2539,0x2524,0x250f,0x24fa,0x24e5,0x24d1,0x24bc,0x24a7
+ .word 0x2492,0x247d,0x2468,0x2453,0x243e,0x2429,0x2415,0x2400
+ .word 0x23eb,0x23d6,0x23c1,0x23ad,0x2398,0x2383,0x236e,0x235a
+ .word 0x2345,0x2330,0x231c,0x2307,0x22f2,0x22dd,0x22c9,0x22b4
+ .word 0x22a0,0x228b,0x2276,0x2262,0x224d,0x2239,0x2224,0x2210
+ .word 0x21fb,0x21e6,0x21d2,0x21bd,0x21a9,0x2194,0x2180,0x216c
+ .word 0x2157,0x2143,0x212e,0x211a,0x2105,0x20f1,0x20dd,0x20c8
+ .word 0x20b4,0x20a0,0x208b,0x2077,0x2063,0x204e,0x203a,0x2026
+ .word 0x2012,0x1ffd,0x1fe9,0x1fd5,0x1fc1,0x1fac,0x1f98,0x1f84
+ .word 0x1f70,0x1f5c,0x1f47,0x1f33,0x1f1f,0x1f0b,0x1ef7,0x1ee3
+ .word 0x1ecf,0x1ebb,0x1ea7,0x1e93,0x1e7f,0x1e6a,0x1e56,0x1e42
+ .word 0x1e2e,0x1e1a,0x1e06,0x1df3,0x1ddf,0x1dcb,0x1db7,0x1da3
+ .word 0x1d8f,0x1d7b,0x1d67,0x1d53,0x1d3f,0x1d2b,0x1d18,0x1d04
+ .word 0x1cf0,0x1cdc,0x1cc8,0x1cb5,0x1ca1,0x1c8d,0x1c79,0x1c65
+ .word 0x1c52,0x1c3e,0x1c2a,0x1c17,0x1c03,0x1bef,0x1bdb,0x1bc8
+ .word 0x1bb4,0x1ba0,0x1b8d,0x1b79,0x1b66,0x1b52,0x1b3e,0x1b2b
+ .word 0x1b17,0x1b04,0x1af0,0x1add,0x1ac9,0x1ab6,0x1aa2,0x1a8f
+ .word 0x1a7b,0x1a68,0x1a54,0x1a41,0x1a2d,0x1a1a,0x1a06,0x19f3
+ .word 0x19e0,0x19cc,0x19b9,0x19a5,0x1992,0x197f,0x196b,0x1958
+ .word 0x1945,0x1931,0x191e,0x190b,0x18f8,0x18e4,0x18d1,0x18be
+ .word 0x18ab,0x1897,0x1884,0x1871,0x185e,0x184b,0x1837,0x1824
+ .word 0x1811,0x17fe,0x17eb,0x17d8,0x17c4,0x17b1,0x179e,0x178b
+ .word 0x1778,0x1765,0x1752,0x173f,0x172c,0x1719,0x1706,0x16f3
+ .word 0x16e0,0x16cd,0x16ba,0x16a7,0x1694,0x1681,0x166e,0x165b
+ .word 0x1648,0x1635,0x1623,0x1610,0x15fd,0x15ea,0x15d7,0x15c4
+ .word 0x15b1,0x159f,0x158c,0x1579,0x1566,0x1553,0x1541,0x152e
+ .word 0x151b,0x1508,0x14f6,0x14e3,0x14d0,0x14bd,0x14ab,0x1498
+ .word 0x1485,0x1473,0x1460,0x144d,0x143b,0x1428,0x1416,0x1403
+ .word 0x13f0,0x13de,0x13cb,0x13b9,0x13a6,0x1394,0x1381,0x136f
+ .word 0x135c,0x1349,0x1337,0x1325,0x1312,0x1300,0x12ed,0x12db
+ .word 0x12c8,0x12b6,0x12a3,0x1291,0x127f,0x126c,0x125a,0x1247
+ .word 0x1235,0x1223,0x1210,0x11fe,0x11ec,0x11d9,0x11c7,0x11b5
+ .word 0x11a3,0x1190,0x117e,0x116c,0x1159,0x1147,0x1135,0x1123
+ .word 0x1111,0x10fe,0x10ec,0x10da,0x10c8,0x10b6,0x10a4,0x1091
+ .word 0x107f,0x106d,0x105b,0x1049,0x1037,0x1025,0x1013,0x1001
+ .word 0x0fef,0x0fdc,0x0fca,0x0fb8,0x0fa6,0x0f94,0x0f82,0x0f70
+ .word 0x0f5e,0x0f4c,0x0f3a,0x0f28,0x0f17,0x0f05,0x0ef3,0x0ee1
+ .word 0x0ecf,0x0ebd,0x0eab,0x0e99,0x0e87,0x0e75,0x0e64,0x0e52
+ .word 0x0e40,0x0e2e,0x0e1c,0x0e0a,0x0df9,0x0de7,0x0dd5,0x0dc3
+ .word 0x0db2,0x0da0,0x0d8e,0x0d7c,0x0d6b,0x0d59,0x0d47,0x0d35
+ .word 0x0d24,0x0d12,0x0d00,0x0cef,0x0cdd,0x0ccb,0x0cba,0x0ca8
+ .word 0x0c97,0x0c85,0x0c73,0x0c62,0x0c50,0x0c3f,0x0c2d,0x0c1c
+ .word 0x0c0a,0x0bf8,0x0be7,0x0bd5,0x0bc4,0x0bb2,0x0ba1,0x0b8f
+ .word 0x0b7e,0x0b6c,0x0b5b,0x0b4a,0x0b38,0x0b27,0x0b15,0x0b04
+ .word 0x0af2,0x0ae1,0x0ad0,0x0abe,0x0aad,0x0a9c,0x0a8a,0x0a79
+ .word 0x0a68,0x0a56,0x0a45,0x0a34,0x0a22,0x0a11,0x0a00,0x09ee
+ .word 0x09dd,0x09cc,0x09bb,0x09a9,0x0998,0x0987,0x0976,0x0965
+ .word 0x0953,0x0942,0x0931,0x0920,0x090f,0x08fe,0x08ec,0x08db
+ .word 0x08ca,0x08b9,0x08a8,0x0897,0x0886,0x0875,0x0864,0x0853
+ .word 0x0842,0x0831,0x081f,0x080e,0x07fd,0x07ec,0x07db,0x07ca
+ .word 0x07b9,0x07a8,0x0798,0x0787,0x0776,0x0765,0x0754,0x0743
+ .word 0x0732,0x0721,0x0710,0x06ff,0x06ee,0x06dd,0x06cd,0x06bc
+ .word 0x06ab,0x069a,0x0689,0x0678,0x0668,0x0657,0x0646,0x0635
+ .word 0x0624,0x0614,0x0603,0x05f2,0x05e1,0x05d1,0x05c0,0x05af
+ .word 0x059e,0x058e,0x057d,0x056c,0x055c,0x054b,0x053a,0x052a
+ .word 0x0519,0x0508,0x04f8,0x04e7,0x04d6,0x04c6,0x04b5,0x04a5
+ .word 0x0494,0x0484,0x0473,0x0462,0x0452,0x0441,0x0431,0x0420
+ .word 0x0410,0x03ff,0x03ef,0x03de,0x03ce,0x03bd,0x03ad,0x039c
+ .word 0x038c,0x037b,0x036b,0x035b,0x034a,0x033a,0x0329,0x0319
+ .word 0x0309,0x02f8,0x02e8,0x02d7,0x02c7,0x02b7,0x02a6,0x0296
+ .word 0x0286,0x0275,0x0265,0x0255,0x0245,0x0234,0x0224,0x0214
+ .word 0x0204,0x01f3,0x01e3,0x01d3,0x01c3,0x01b2,0x01a2,0x0192
+ .word 0x0182,0x0172,0x0161,0x0151,0x0141,0x0131,0x0121,0x0111
+ .word 0x0101,0x00f0,0x00e0,0x00d0,0x00c0,0x00b0,0x00a0,0x0090
+ .word 0x0080,0x0070,0x0060,0x0050,0x0040,0x0030,0x0020,0x0010
+DATAEND()
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/lshift.asm b/rts/gmp/mpn/alpha/lshift.asm
new file mode 100644
index 0000000000..87c46f6fe7
--- /dev/null
+++ b/rts/gmp/mpn/alpha/lshift.asm
@@ -0,0 +1,104 @@
+dnl Alpha mpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+dnl it would take 4 cycles/limb. It should be possible to get down to 3
+dnl cycles/limb since both ldq and stq can be paired with the other used
+dnl instructions. But there are many restrictions in the 21064 pipeline that
+dnl makes it hard, if not impossible, to get down to 3 cycles/limb:
+
+dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+dnl 2. Only aligned instruction pairs can be paired.
+dnl 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ s8addq r18,r17,r17 C make r17 point at end of s1
+ ldq r4,-8(r17) C load first limb
+ subq r17,8,r17
+ subq r31,r19,r7
+ s8addq r18,r16,r16 C make r16 point at end of RES
+ subq r18,1,r18
+ and r18,4-1,r20 C number of limbs in first loop
+ srl r4,r7,r0 C compute function result
+
+ beq r20,$L0
+ subq r18,r20,r18
+
+ ALIGN(8)
+$Loop0:
+ ldq r3,-8(r17)
+ subq r16,8,r16
+ subq r17,8,r17
+ subq r20,1,r20
+ sll r4,r19,r5
+ srl r3,r7,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,0(r16)
+ bne r20,$Loop0
+
+$L0: beq r18,$Lend
+
+ ALIGN(8)
+$Loop: ldq r3,-8(r17)
+ subq r16,32,r16
+ subq r18,4,r18
+ sll r4,r19,r5
+ srl r3,r7,r6
+
+ ldq r4,-16(r17)
+ sll r3,r19,r1
+ bis r5,r6,r8
+ stq r8,24(r16)
+ srl r4,r7,r2
+
+ ldq r3,-24(r17)
+ sll r4,r19,r5
+ bis r1,r2,r8
+ stq r8,16(r16)
+ srl r3,r7,r6
+
+ ldq r4,-32(r17)
+ sll r3,r19,r1
+ bis r5,r6,r8
+ stq r8,8(r16)
+ srl r4,r7,r2
+
+ subq r17,32,r17
+ bis r1,r2,r8
+ stq r8,0(r16)
+
+ bgt r18,$Loop
+
+$Lend: sll r4,r19,r8
+ stq r8,-8(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/mul_1.asm b/rts/gmp/mpn/alpha/mul_1.asm
new file mode 100644
index 0000000000..46b8df34f5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/mul_1.asm
@@ -0,0 +1,71 @@
+dnl Alpha __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ bic r31,r31,r4 C clear cy_limb
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,8(r17) C r2 = s1_limb
+ subq r18,1,r18 C size--
+ stq r3,0(r16)
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,16(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ stq r3,8(r16)
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r16,8,r16 C res_ptr++
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ stq r3,8(r16)
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: stq r3,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/rshift.asm b/rts/gmp/mpn/alpha/rshift.asm
new file mode 100644
index 0000000000..aa25eda54e
--- /dev/null
+++ b/rts/gmp/mpn/alpha/rshift.asm
@@ -0,0 +1,102 @@
+dnl Alpha mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+dnl it would take 4 cycles/limb. It should be possible to get down to 3
+dnl cycles/limb since both ldq and stq can be paired with the other used
+dnl instructions. But there are many restrictions in the 21064 pipeline that
+dnl makes it hard, if not impossible, to get down to 3 cycles/limb:
+
+dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+dnl 2. Only aligned instruction pairs can be paired.
+dnl 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldq r4,0(r17) C load first limb
+ addq r17,8,r17
+ subq r31,r19,r7
+ subq r18,1,r18
+ and r18,4-1,r20 C number of limbs in first loop
+ sll r4,r7,r0 C compute function result
+
+ beq r20,$L0
+ subq r18,r20,r18
+
+ ALIGN(8)
+$Loop0:
+ ldq r3,0(r17)
+ addq r16,8,r16
+ addq r17,8,r17
+ subq r20,1,r20
+ srl r4,r19,r5
+ sll r3,r7,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,-8(r16)
+ bne r20,$Loop0
+
+$L0: beq r18,$Lend
+
+ ALIGN(8)
+$Loop: ldq r3,0(r17)
+ addq r16,32,r16
+ subq r18,4,r18
+ srl r4,r19,r5
+ sll r3,r7,r6
+
+ ldq r4,8(r17)
+ srl r3,r19,r1
+ bis r5,r6,r8
+ stq r8,-32(r16)
+ sll r4,r7,r2
+
+ ldq r3,16(r17)
+ srl r4,r19,r5
+ bis r1,r2,r8
+ stq r8,-24(r16)
+ sll r3,r7,r6
+
+ ldq r4,24(r17)
+ srl r3,r19,r1
+ bis r5,r6,r8
+ stq r8,-16(r16)
+ sll r4,r7,r2
+
+ addq r17,32,r17
+ bis r1,r2,r8
+ stq r8,-8(r16)
+
+ bgt r18,$Loop
+
+$Lend: srl r4,r19,r8
+ stq r8,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/sub_n.asm b/rts/gmp/mpn/alpha/sub_n.asm
new file mode 100644
index 0000000000..718f657141
--- /dev/null
+++ b/rts/gmp/mpn/alpha/sub_n.asm
@@ -0,0 +1,114 @@
+dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ ldq r3,0(r17) C r3 = first s1 limb
+ ldq r4,0(r18) C r4 = first s2 limb
+
+ subq r19,1,r19 C size--
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0 C clear borrow
+ beq r2,$L0 C if multiple of 4 limbs, skip first loop
+
+ subq r19,r2,r19
+
+$Loop0: subq r2,1,r2
+ ldq r5,8(r17) C load next s1 limb early
+ addq r4,r0,r4 C add borrow to s2 limb
+ ldq r6,8(r18) C load next s2 limb early
+ cmpult r4,r0,r1 C r1 = carry out of that add
+ subq r3,r4,r4 C difference
+ cmpult r3,r4,r0 C r0 = borrow from subtract
+ stq r4,0(r16)
+ bis r0,r1,r0 C combine borrows
+
+ addq r17,8,r17
+ addq r18,8,r18
+ bis r5,r5,r3 C move next limbs into place
+ bis r6,r6,r4
+ addq r16,8,r16
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend C no 4-limb groups left
+
+ ALIGN(8)
+$Loop: subq r19,4,r19 C main loop: 4 limbs per iteration
+
+ ldq r5,8(r17) C same borrow scheme as $Loop0, unrolled 4x
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ ldq r3,16(r17)
+ addq r6,r0,r6
+ ldq r4,16(r18)
+ cmpult r6,r0,r1
+ subq r5,r6,r6
+ cmpult r5,r6,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17)
+ addq r4,r0,r4
+ ldq r6,24(r18)
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17)
+ addq r6,r0,r6
+ ldq r4,32(r18)
+ cmpult r6,r0,r1
+ subq r5,r6,r6
+ cmpult r5,r6,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17
+ addq r18,32,r18
+ addq r16,32,r16
+ bne r19,$Loop
+
+$Lend: addq r4,r0,r4 C final limb
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,0(r16)
+ bis r0,r1,r0 C return borrow in r0
+ ret r31,(r26),1
+EPILOGUE(mpn_sub_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/submul_1.asm b/rts/gmp/mpn/alpha/submul_1.asm
new file mode 100644
index 0000000000..caec1a720b
--- /dev/null
+++ b/rts/gmp/mpn/alpha/submul_1.asm
@@ -0,0 +1,87 @@
+dnl Alpha __gmpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ subq r5,r3,r3 C r3 = *res_ptr - prod_low
+ cmpult r5,r3,r4 C r4 = borrow from that subtract
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3 C r3 = *res_ptr - r3
+ cmpult r5,r3,r5 C r5 = borrow from that subtract
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3 C r3 = *res_ptr - r3
+ cmpult r5,r3,r5 C r5 = borrow from that subtract
+ stq r3,0(r16)
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: subq r5,r3,r3 C r3 = *res_ptr - prod_low
+ cmpult r5,r3,r5 C r5 = borrow
+ stq r3,0(r16)
+ addq r0,r5,r0 C cy_limb = prod_high + borrow
+ ret r31,(r26),1
+EPILOGUE(mpn_submul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/udiv_qrnnd.S b/rts/gmp/mpn/alpha/udiv_qrnnd.S
new file mode 100644
index 0000000000..53814bbcb0
--- /dev/null
+++ b/rts/gmp/mpn/alpha/udiv_qrnnd.S
@@ -0,0 +1,151 @@
+ # Alpha 21064 __udiv_qrnnd
+
+ # Copyright (C) 1992, 1994, 1995, 1997, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __gmpn_udiv_qrnnd
+ .ent __gmpn_udiv_qrnnd
+__gmpn_udiv_qrnnd:
+ .frame $30,0,$26,0
+ .prologue 0
+#define cnt $2
+#define tmp $3
+#define rem_ptr $16
+#define n1 $17
+#define n0 $18
+#define d $19
+#define qb $20
+
+ ldiq cnt,16 # 16 iterations x 4 bits each = 64 quotient bits
+ blt d,.Largedivisor # d has its top bit set: take halved-divisor path
+
+.Loop1: cmplt n0,0,tmp # step 1: tmp = top bit of n0
+ addq n1,n1,n1 # shift n1:n0 left one bit
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb # qb = next quotient bit
+ subq n1,d,tmp
+ cmovne qb,tmp,n1 # subtract d when the bit is 1
+ bis n0,qb,n0 # shift qb into the quotient
+ cmplt n0,0,tmp # step 2 (same as step 1)
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp # step 3
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp # step 4
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ subq cnt,1,cnt
+ bgt cnt,.Loop1
+ stq n1,0(rem_ptr) # store remainder
+ bis $31,n0,$0 # return quotient in $0
+ ret $31,($26),1
+
+.Largedivisor:
+ and n0,1,$4 # $4 = low bit of n0, saved for later
+
+ srl n0,1,n0 # shift n1:n0 right one bit
+ sll n1,63,tmp
+ or tmp,n0,n0
+ srl n1,1,n1
+
+ and d,1,$6 # $6 = 1 if d is odd
+ srl d,1,$5
+ addq $5,$6,$5 # $5 = ceil(d/2)
+
+.Loop2: cmplt n0,0,tmp # step 1: shift-and-subtract with divisor $5
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp # step 2
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp # step 3
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp # step 4
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ subq cnt,1,cnt
+ bgt cnt,.Loop2
+
+ addq n1,n1,n1 # undo the initial right shift of the dividend
+ addq $4,n1,n1 # restore the saved low bit
+ bne $6,.LOdd # odd divisor needs a final fixup
+ stq n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+.LOdd:
+ /* q' in n0. r' in n1 */
+ addq n1,n0,n1
+ cmpult n1,n0,tmp # tmp := carry from addq
+ beq tmp,.LLp6
+ addq n0,1,n0 # overflowed: bump quotient, reduce remainder
+ subq n1,d,n1
+.LLp6: cmpult n1,d,tmp # remainder already < d?
+ bne tmp,.LLp7
+ addq n0,1,n0 # one more correction step
+ subq n1,d,n1
+.LLp7:
+ stq n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+ .end __gmpn_udiv_qrnnd
diff --git a/rts/gmp/mpn/alpha/umul.asm b/rts/gmp/mpn/alpha/umul.asm
new file mode 100644
index 0000000000..44428ed5f5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/umul.asm
@@ -0,0 +1,39 @@
+dnl Currently unused.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ .set noreorder
+ .set volatile
+ .set noat
+
+.text
+ .align 3
+ .globl __umul_ppmm
+ .ent __umul_ppmm
+__umul_ppmm:
+__umul_ppmm..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+ mulq $17,$18,$1 # $1 = low 64 bits of $17 * $18
+ umulh $17,$18,$0 # $0 = high 64 bits (function result)
+ stq $1,0($16) # store low half at *$16
+ ret $31,($26),1
+ .end __umul_ppmm
diff --git a/rts/gmp/mpn/alpha/unicos.m4 b/rts/gmp/mpn/alpha/unicos.m4
new file mode 100644
index 0000000000..7ff26c090c
--- /dev/null
+++ b/rts/gmp/mpn/alpha/unicos.m4
@@ -0,0 +1,63 @@
+divert(-1)
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+define(`ASM_START',
+ `.ident dummy')
+
+define(`X',`^X$1')
+define(`FLOAT64',
+ `dnl
+ .psect $1@crud,data
+$1: .t_floating $2
+ .endp')
+
+define(`PROLOGUE',
+ `dnl
+ .stack 192 ; What does this mean? Only Cray knows.
+ .psect $1@code,code,cache
+$1::')
+define(`PROLOGUE_GP', `PROLOGUE($1)') # no GP register setup needed on Unicos
+
+define(`EPILOGUE',
+ `dnl
+ .endp')
+
+define(`DATASTART',
+ `dnl
+ .psect $1@crud,data
+$1:')
+define(`DATAEND',
+ `dnl
+ .endp')
+
+define(`ASM_END',
+ `dnl
+ .end')
+
+define(`unop',`bis r31,r31,r31') ; Unicos assembler lacks unop
+define(`cvttqc',`cvttq/c') # qualified mnemonic spelling for Unicos
+
+define(`ALIGN',`') ; Unicos assembler seems to align using garbage
+
+divert
+
diff --git a/rts/gmp/mpn/arm/add_n.S b/rts/gmp/mpn/arm/add_n.S
new file mode 100644
index 0000000000..fb3f8f703b
--- /dev/null
+++ b/rts/gmp/mpn/arm/add_n.S
@@ -0,0 +1,77 @@
+@ ARM mpn_add_n -- Add two limb vectors of the same length > 0 and store sum in
+@ a third limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define s r0 /* res_ptr */
+#define a r1 /* s1_ptr */
+#define b r2 /* s2_ptr */
+#define n r3 /* size in limbs */
+
+#define sl r10
+#define fp r11
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_add_n
+ .type __gmpn_add_n,%function
+__gmpn_add_n:
+ stmfd sp!, { r8, r9, lr }
+ movs n, n, lsr #1 @ n /= 2; C flag = old bit 0
+ bcc skip1 @ even limb count: no lone first limb
+ ldr ip, [a], #4
+ ldr lr, [b], #4
+ adds ip, ip, lr @ add lone limb; sets the initial carry
+ str ip, [s], #4
+skip1:
+ tst n, #1 @ one pair left over? (tst leaves C intact)
+ beq skip2
+ ldmia a!, { r8, r9 }
+ ldmia b!, { ip, lr }
+ adcs r8, r8, ip @ add pair with carry
+ adcs r9, r9, lr
+ stmia s!, { r8, r9 }
+skip2:
+ bics n, n, #1 @ even number of pairs remaining (C preserved)
+ beq return
+ stmfd sp!, { r4, r5, r6, r7 }
+add_n_loop: @ 4 limbs (2 pairs) per iteration
+ ldmia a!, { r4, r5, r6, r7 }
+ ldmia b!, { r8, r9, ip, lr }
+ adcs r4, r4, r8
+ ldr r8, [s] /* Bring stuff into cache. */
+ adcs r5, r5, r9
+ adcs r6, r6, ip
+ adcs r7, r7, lr
+ stmia s!, { r4, r5, r6, r7 }
+ sub n, n, #2 @ no S suffix: carry preserved
+ teq n, #0 @ teq does not alter C
+ bne add_n_loop
+ ldmfd sp!, { r4, r5, r6, r7 }
+return:
+ adc r0, n, #0 @ n is 0 here: return the final carry
+ ldmfd sp!, { r8, r9, pc }
+end:
+ .size __gmpn_add_n, end - __gmpn_add_n
diff --git a/rts/gmp/mpn/arm/addmul_1.S b/rts/gmp/mpn/arm/addmul_1.S
new file mode 100644
index 0000000000..396fff77a3
--- /dev/null
+++ b/rts/gmp/mpn/arm/addmul_1.S
@@ -0,0 +1,89 @@
+@ ARM mpn_addmul_1 -- Multiply a limb vector with a limb and add the result to a
+@ second limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define p r0 /* res_ptr */
+#define a r1 /* s1_ptr */
+#define n r2 /* size in limbs */
+#define w r3 /* s2_limb (the multiplier) */
+
+#define z r11 /* held at zero throughout */
+
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_addmul_1
+ .type __gmpn_addmul_1,%function
+__gmpn_addmul_1:
+ stmfd sp!, { r8-r11, lr }
+ mov z, #0 @ permanent zero
+ mov ip, #0 @ ip = running carry limb
+ movs n, n, lsr #1 @ n /= 2; C = old bit 0
+ bcc skip1 @ even count: no lone limb
+ ldr lr, [a], #4
+ ldr r9, [p]
+ umlal r9, ip, w, lr @ ip:r9 += w * lr
+ str r9, [p], #4
+skip1:
+ movs n, n, lsr #1 @ n /= 2 again; C = bit 1 of original n
+ bcc skip2
+ ldmia p, { r9, r10 }
+ adds r8, ip, r9 @ carry-in plus res limb
+ adc r9, z, #0 @ r9 = carry of that add
+ ldmia a!, { ip, lr }
+ umlal r8, r9, w, ip
+ adds r9, r9, r10
+ adc ip, z, #0
+ umlal r9, ip, w, lr
+ stmia p!, { r8, r9 }
+skip2:
+ teq n, #0
+ beq return
+ stmfd sp!, { r4-r7 }
+addmul_loop: @ 4 limbs per iteration
+ ldmia p, { r5, r6, r7, r8 } @ four res limbs
+ adds r4, ip, r5 @ fold in carry from previous group
+ adc r5, z, #0
+ ldmia a!, { r9, r10, ip, lr } @ four s1 limbs
+ umlal r4, r5, w, r9
+ adds r5, r5, r6
+ adc r6, z, #0
+ umlal r5, r6, w, r10
+ adds r6, r6, r7
+ adc r7, z, #0
+ umlal r6, r7, w, ip
+ adds r7, r7, r8
+ adc ip, z, #0
+ umlal r7, ip, w, lr
+ subs n, n, #1
+ stmia p!, { r4, r5, r6, r7 }
+ bne addmul_loop
+ ldmfd sp!, { r4-r7 }
+return:
+ mov r0, ip @ return the carry limb
+ ldmfd sp!, { r8-r11, pc }
+end:
+ .size __gmpn_addmul_1, end - __gmpn_addmul_1
diff --git a/rts/gmp/mpn/arm/gmp-mparam.h b/rts/gmp/mpn/arm/gmp-mparam.h
new file mode 100644
index 0000000000..a35b0c7b66
--- /dev/null
+++ b/rts/gmp/mpn/arm/gmp-mparam.h
@@ -0,0 +1,34 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32 /* one limb = one 32-bit word */
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 21 /* limbs; switch point to Karatsuba multiply */
+#endif
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 48 /* limbs; switch point to Karatsuba squaring */
+#endif
diff --git a/rts/gmp/mpn/arm/mul_1.S b/rts/gmp/mpn/arm/mul_1.S
new file mode 100644
index 0000000000..bae526a0f0
--- /dev/null
+++ b/rts/gmp/mpn/arm/mul_1.S
@@ -0,0 +1,81 @@
+@ ARM mpn_mul_1 -- Multiply a limb vector with a limb and store the result
+@ in a second limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define p r0 /* res_ptr */
+#define a r1 /* s1_ptr */
+#define n r2 /* size in limbs */
+#define w r3 /* s2_limb (the multiplier) */
+
+#define sl r10
+#define fp r11
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_mul_1
+ .type __gmpn_mul_1,%function
+__gmpn_mul_1:
+ stmfd sp!, { r8, r9, lr }
+ ands ip, n, #1 @ ip = n & 1; doubles as zero carry when n is even
+ beq skip1
+ ldr lr, [a], #4
+ umull r9, ip, w, lr @ ip:r9 = w * lr
+ str r9, [p], #4
+skip1:
+ tst n, #2
+ beq skip2
+ mov r8, ip @ carry-in from previous step
+ ldmia a!, { ip, lr }
+ mov r9, #0
+ umlal r8, r9, w, ip @ r9:r8 = carry + w * ip
+ mov ip, #0
+ umlal r9, ip, w, lr
+ stmia p!, { r8, r9 }
+skip2:
+ bics n, n, #3 @ remaining limbs, a multiple of 4
+ beq return
+ stmfd sp!, { r6, r7 }
+mul_1_loop: @ 4 limbs per iteration
+ mov r6, ip @ carry from previous group
+ ldmia a!, { r8, r9, ip, lr }
+ ldr r7, [p] /* Bring stuff into cache. */
+ mov r7, #0
+ umlal r6, r7, w, r8
+ mov r8, #0
+ umlal r7, r8, w, r9
+ mov r9, #0
+ umlal r8, r9, w, ip
+ mov ip, #0
+ umlal r9, ip, w, lr
+ subs n, n, #4
+ stmia p!, { r6, r7, r8, r9 }
+ bne mul_1_loop
+ ldmfd sp!, { r6, r7 }
+return:
+ mov r0, ip @ return the carry limb
+ ldmfd sp!, { r8, r9, pc }
+end:
+ .size __gmpn_mul_1, end - __gmpn_mul_1
diff --git a/rts/gmp/mpn/arm/sub_n.S b/rts/gmp/mpn/arm/sub_n.S
new file mode 100644
index 0000000000..856505fe21
--- /dev/null
+++ b/rts/gmp/mpn/arm/sub_n.S
@@ -0,0 +1,79 @@
+@ ARM mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+@ difference in a third limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define d r0 /* res_ptr */
+#define a r1 /* s1_ptr */
+#define b r2 /* s2_ptr */
+#define n r3 /* size in limbs */
+
+#define sl r10
+#define fp r11
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_sub_n
+ .type __gmpn_sub_n,%function
+__gmpn_sub_n:
+ stmfd sp!, { r8, r9, lr }
+ subs ip, ip, ip @ ip = 0; sets C (ARM: C set means no borrow)
+ tst n, #1
+ beq skip1
+ ldr ip, [a], #4
+ ldr lr, [b], #4
+ subs ip, ip, lr @ lone first limb; sets borrow flag
+ str ip, [d], #4
+skip1:
+ tst n, #2 @ tst leaves C intact
+ beq skip2
+ ldmia a!, { r8, r9 }
+ ldmia b!, { ip, lr }
+ sbcs r8, r8, ip @ subtract pair with borrow
+ sbcs r9, r9, lr
+ stmia d!, { r8, r9 }
+skip2:
+ bics n, n, #3 @ remaining limbs, a multiple of 4 (C preserved)
+ beq return
+ stmfd sp!, { r4, r5, r6, r7 }
+sub_n_loop: @ 4 limbs per iteration
+ ldmia a!, { r4, r5, r6, r7 }
+ ldmia b!, { r8, r9, ip, lr }
+ sbcs r4, r4, r8
+ ldr r8, [d] /* Bring stuff into cache. */
+ sbcs r5, r5, r9
+ sbcs r6, r6, ip
+ sbcs r7, r7, lr
+ stmia d!, { r4, r5, r6, r7 }
+ sub n, n, #4 @ no S suffix: C preserved
+ teq n, #0
+ bne sub_n_loop
+ ldmfd sp!, { r4, r5, r6, r7 }
+return:
+ sbc r0, r0, r0 @ r0 = 0 if C set (no borrow), else -1
+ and r0, r0, #1 @ return borrow as 0 or 1
+ ldmfd sp!, { r8, r9, pc }
+end:
+ .size __gmpn_sub_n, end - __gmpn_sub_n
diff --git a/rts/gmp/mpn/asm-defs.m4 b/rts/gmp/mpn/asm-defs.m4
new file mode 100644
index 0000000000..aa2024138b
--- /dev/null
+++ b/rts/gmp/mpn/asm-defs.m4
@@ -0,0 +1,1182 @@
+divert(-1)
+dnl
+dnl m4 macros for gmp assembly code, shared by all CPUs.
+dnl
+dnl These macros are designed for use with any m4 and have been used on
+dnl GNU, FreeBSD, OpenBSD and SysV.
+dnl
+dnl GNU m4 and OpenBSD 2.7 m4 will give filenames and line numbers in error
+dnl messages.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Macros:
+dnl
+dnl Most new m4 specific macros have an "m4_" prefix to emphasise they're
+dnl m4 expansions. But new defining things like deflit() and defreg() are
+dnl named like the builtin define(), and forloop() is named following the
+dnl GNU m4 example on which it's based.
+dnl
+dnl GNU m4 with the -P option uses "m4_" as a prefix for builtins, but that
+dnl option isn't going to be used, so there's no conflict or confusion.
+dnl
+dnl
+dnl Comments in output:
+dnl
+dnl The m4 comment delimiters are left at # and \n, the normal assembler
+dnl commenting for most CPUs. m4 passes comment text through without
+dnl expanding macros in it, which is generally a good thing since it stops
+dnl unexpected expansions and possible resultant errors.
+dnl
+dnl But note that when a quoted string is being read, a # isn't special, so
+dnl apostrophes in comments in quoted strings must be avoided or they'll be
+dnl interpreted as a closing quote mark. But when the quoted text is
+dnl re-read # will still act like a normal comment, suppressing macro
+dnl expansion.
+dnl
+dnl For example,
+dnl
+dnl # apostrophes in comments that're outside quotes are ok
+dnl # and using macro names like PROLOGUE is ok too
+dnl ...
+dnl ifdef(`PIC',`
+dnl # but apostrophes aren't ok inside quotes
+dnl # ^--wrong
+dnl ...
+dnl # though macro names like PROLOGUE are still ok
+dnl ...
+dnl ')
+dnl
+dnl If macro expansion in a comment is wanted, use `#' in the .asm (ie. a
+dnl quoted hash symbol), which will turn into # in the .s but get
+dnl expansions done on that line. This can make the .s more readable to
+dnl humans, but it won't make a blind bit of difference to the assembler.
+dnl
+dnl All the above applies, mutatis mutandis, when changecom() is used to
+dnl select @ ! ; or whatever other commenting.
+dnl
+dnl
+dnl Variations in m4 affecting gmp:
+dnl
+dnl $# - When a macro is called as "foo" with no brackets, BSD m4 sets $#
+dnl to 1, whereas GNU or SysV m4 set it to 0. In all cases though
+dnl "foo()" sets $# to 1. This is worked around in various places.
+dnl
+dnl len() - When "len()" is given an empty argument, BSD m4 evaluates to
+dnl nothing, whereas GNU, SysV, and the new OpenBSD, evaluate to 0.
+dnl See m4_length() below which works around this.
+dnl
+dnl translit() - GNU m4 accepts character ranges like A-Z, and the new
+dnl OpenBSD m4 does under option -g, but basic BSD and SysV don't.
+dnl
+dnl popdef() - in BSD and SysV m4 popdef() takes multiple arguments and
+dnl pops each, but GNU m4 only takes one argument.
+dnl
+dnl push back - BSD m4 has some limits on the amount of text that can be
+dnl pushed back. The limit is reasonably big and so long as macros
+dnl don't gratuitously duplicate big arguments it isn't a problem.
+dnl Normally an error message is given, but sometimes it just hangs.
+dnl
+dnl eval() &,|,^ - GNU and SysV m4 have bitwise operators &,|,^ available,
+dnl but BSD m4 doesn't (contrary to what the man page suggests) and
+dnl instead ^ is exponentiation.
+dnl
+dnl eval() ?: - The C ternary operator "?:" is available in BSD m4, but not
+dnl in SysV or GNU m4 (as of GNU m4 1.4 and betas of 1.5).
+dnl
+dnl eval() -2^31 - BSD m4 has a bug where an eval() resulting in -2^31
+dnl (ie. -2147483648) gives "-(". Using -2147483648 within an
+dnl expression is ok, it just can't be a final result. "-(" will of
+dnl course upset parsing, with all sorts of strange effects.
+dnl
+dnl eval() <<,>> - SysV m4 doesn't support shift operators in eval() (on
+dnl SunOS 5.7 /usr/xpg4/m4 has them but /usr/ccs/m4 doesn't). See
+dnl m4_lshift() and m4_rshift() below for workarounds.
+dnl
+dnl m4wrap() - in BSD m4, m4wrap() replaces any previous m4wrap() string,
+dnl in SysV m4 it appends to it, and in GNU m4 it prepends. See
+dnl m4wrap_prepend() below which brings uniformity to this.
+dnl
+dnl __file__,__line__ - GNU m4 and OpenBSD 2.7 m4 provide these, and
+dnl they're used here to make error messages more informative. GNU m4
+dnl gives an unhelpful "NONE 0" in an m4wrap(), but that's worked
+dnl around.
+dnl
+dnl __file__ quoting - OpenBSD m4, unlike GNU m4, doesn't quote the
+dnl filename in __file__, so care should be taken that no macro has
+dnl the same name as a file, or an unwanted expansion will occur when
+dnl printing an error or warning.
+dnl
+dnl OpenBSD 2.6 m4 - this m4 rejects decimal constants containing an 8 or 9
+dnl in eval(), making it pretty much unusable. This bug is confined
+dnl to version 2.6 (it's not in 2.5, and has been fixed in 2.7).
+dnl
+dnl SunOS /usr/bin/m4 - this m4 lacks a number of desired features,
+dnl including $# and $@, defn(), m4exit(), m4wrap(), pushdef(),
+dnl popdef(). /usr/5bin/m4 is a SysV style m4 which should always be
+dnl available, and "configure" will reject /usr/bin/m4 in favour of
+dnl /usr/5bin/m4 (if necessary).
+dnl
+dnl The sparc code actually has modest m4 requirements currently and
+dnl could manage with /usr/bin/m4, but there's no reason to put our
+dnl macros through contortions when /usr/5bin/m4 is available or GNU
+dnl m4 can be installed.
+
+
+dnl Include guard: a second inclusion would redefine every macro below, so
+dnl fail loudly instead of silently reprocessing.
+ifdef(`__ASM_DEFS_M4_INCLUDED__',
+`m4_error(`asm-defs.m4 already included, dont include it twice
+')m4exit(1)')
+define(`__ASM_DEFS_M4_INCLUDED__')
+
+
+dnl Detect and give a message about the unsuitable OpenBSD 2.6 m4.
+
+dnl OpenBSD 2.6 m4 rejects the digits 8 and 9 in eval() constants, so on
+dnl that m4 eval(89) does not come out as 89 and we abort with advice.
+ifelse(eval(89),89,,
+`errprint(
+`This m4 doesnt accept 8 and/or 9 in constants in eval(), making it unusable.
+This is probably OpenBSD 2.6 m4 (September 1999). Upgrade to OpenBSD 2.7,
+or get a bug fix from the CVS (expr.c rev 1.9), or get GNU m4. Dont forget
+to configure with M4=/wherever/m4 if you install one of these in a directory
+not in $PATH.
+')m4exit(1)')
+
+
+dnl Detect and give a message about the unsuitable SunOS /usr/bin/m4.
+dnl
+dnl Unfortunately this test doesn't work when m4 is run in the normal way
+dnl from mpn/Makefile with "m4 -DOPERATION_foo foo.asm", since the bad m4
+dnl takes "-" in "-D..." to mean read stdin, so it will look like it just
+dnl hangs. But running "m4 asm-defs.m4" to try it out will work.
+dnl
+dnl We'd like to abort immediately on finding a problem, but unfortunately
+dnl the bad m4 doesn't have an m4exit(), nor does an invalid eval() kill
+dnl it. Unexpanded $#'s in some m4_assert_numargs() later on will comment
+dnl out some closing parentheses and kill it with "m4: arg stack overflow".
+
+dnl The double quotes leave a literal $# in the test macro; calling it with
+dnl one argument should then give 1, unless this m4 lacks $# support
+dnl (e.g. SunOS /usr/bin/m4), in which case warn and carry on.
+define(m4_dollarhash_works_test,``$#'')
+ifelse(m4_dollarhash_works_test(x),1,,
+`errprint(
+`This m4 doesnt support $# and cant be used for GMP asm processing.
+If this is on SunOS, ./configure should choose /usr/5bin/m4 if you have that
+or can get it, otherwise install GNU m4. Dont forget to configure with
+M4=/wherever/m4 if you install in a directory not in $PATH.
+')')
+undefine(`m4_dollarhash_works_test')
+
+
+dnl --------------------------------------------------------------------------
+dnl Basic error handling things.
+
+
+dnl Usage: m4_dollarhash_1_if_noparen_p
+dnl
+dnl Expand to 1 if a call "foo" gives $# set to 1 (as opposed to 0 like GNU
+dnl and SysV m4 give).
+
+dnl Define a probe macro, call it without parentheses, record whether $#
+dnl came out as 1 (BSD m4) or 0 (GNU/SysV m4), then discard the probe.
+define(m4_dollarhash_1_if_noparen_test,`$#')
+define(m4_dollarhash_1_if_noparen_p,
+eval(m4_dollarhash_1_if_noparen_test==1))
+undefine(`m4_dollarhash_1_if_noparen_test')
+
+
+dnl Usage: m4wrap_prepend(string)
+dnl
+dnl Prepend the given string to what will be expanded under m4wrap at the
+dnl end of input.
+dnl
+dnl This macro exists to work around variations in m4wrap() behaviour in
+dnl the various m4s (notes at the start of this file). Don't use m4wrap()
+dnl directly since it will interfere with this scheme.
+
+dnl m4wrap_string accumulates the text to run at end of input; each call
+dnl pushes its argument on the front, so the last-registered text runs first.
+dnl m4wrap() itself is invoked exactly once, here.
+define(m4wrap_prepend,
+m4_assert_numargs(1)
+`define(`m4wrap_string',`$1'defn(`m4wrap_string'))')
+
+m4wrap(`m4wrap_string')
+define(m4wrap_string,`')
+
+
+dnl Usage: m4_file_and_line
+dnl
+dnl Expand to the current file and line number, if the GNU m4 extensions
+dnl __file__ and __line__ are available.
+dnl
+dnl In GNU m4 1.4 at the end of input when m4wrap text is expanded,
+dnl __file__ is NONE and __line__ is 0, which is not a helpful thing to
+dnl print. If m4_file_seen() has been called to note the last file seen,
+dnl then that file at a big line number is used, otherwise "end of input"
+dnl is used (although "end of input" won't parse as an error message).
+
+dnl The whole body is quoted so __file__ and __line__ are looked up when a
+dnl message is printed, not when this macro is defined.  The `NONE0' case is
+dnl GNU m4 1.4 expanding m4wrap text at end of input.
+define(m4_file_and_line,
+`ifdef(`__file__',
+`ifelse(__file__`'__line__,`NONE0',
+`ifdef(`m4_file_seen_last',`m4_file_seen_last: 999999: ',`end of input: ')',
+`__file__: __line__: ')')')
+
+
+dnl Usage: m4_errprint_commas(arg,...)
+dnl
+dnl The same as errprint(), but commas are printed between arguments
+dnl instead of spaces.
+
+dnl Prints $1, then recurses on shift($@) with a comma before each
+dnl remaining argument.
+define(m4_errprint_commas,
+`errprint(`$1')dnl
+ifelse(eval($#>1),1,`errprint(`,')m4_errprint_commas(shift($@))')')
+
+
+dnl Usage: m4_error(args...)
+dnl m4_warning(args...)
+dnl
+dnl Print an error message, using m4_errprint_commas, prefixed with the
+dnl current filename and line number (if available). m4_error sets up to
+dnl give an error exit at the end of processing, m4_warning just prints.
+dnl These macros are the recommended way to print errors.
+dnl
+dnl The arguments here should be quoted in the usual way to prevent them
+dnl being expanded when the macro call is read. (m4_error takes care not
+dnl to do any further expansion.)
+dnl
+dnl For example,
+dnl
+dnl m4_error(`some error message
+dnl ')
+dnl
+dnl which prints
+dnl
+dnl foo.asm:123: some error message
+dnl
+dnl or if __file__ and __line__ aren't available
+dnl
+dnl some error message
+dnl
+dnl The "file:line:" format is a basic style, used by gcc and GNU m4, so
+dnl emacs and other editors will recognise it in their normal error message
+dnl parsing.
+
+dnl m4_warning just prints, prefixed with file:line when available.
+define(m4_warning,
+`m4_errprint_commas(m4_file_and_line`'$@)')
+
+dnl m4_error additionally latches m4_error_occurred, which the m4wrap text
+dnl below checks so processing exits non-zero at end of input.
+define(m4_error,
+`define(`m4_error_occurred',1)m4_warning($@)')
+
+define(`m4_error_occurred',0)
+
+dnl This m4wrap_prepend() is first, so it'll be executed last.
+m4wrap_prepend(
+`ifelse(m4_error_occurred,1,
+`m4_error(`Errors occurred during m4 processing
+')m4exit(1)')')
+
+
+dnl Usage: m4_assert_numargs(num)
+dnl
+dnl Put this unquoted on a line on its own at the start of a macro
+dnl definition to add some code to check that num many arguments get passed
+dnl to the macro. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_numargs(2)
+dnl `something `$1' and `$2' blah blah')
+dnl
+dnl Then a call like foo(one,two,three) will provoke an error like
+dnl
+dnl file:10: foo expected 2 arguments, got 3 arguments
+dnl
+dnl Here are some calls and how many arguments they're interpreted as passing.
+dnl
+dnl foo(abc,def) 2
+dnl foo(xyz) 1
+dnl foo() 0
+dnl foo -1
+dnl
+dnl The -1 for no parentheses at all means a macro that's meant to be used
+dnl that way can be checked with m4_assert_numargs(-1). For example,
+dnl
+dnl define(SPECIAL_SUFFIX,
+dnl m4_assert_numargs(-1)
+dnl `ifdef(`FOO',`_foo',`_bar')')
+dnl
+dnl But as an alternative see also deflit() below where parenthesized
+dnl expressions following a macro are passed through to the output.
+dnl
+dnl Note that in BSD m4 there's no way to differentiate calls "foo" and
+dnl "foo()", so in BSD m4 the distinction between the two isn't enforced.
+dnl (In GNU and SysV m4 it can be checked, and is.)
+
+
+dnl m4_assert_numargs is able to check its own arguments by calling
+dnl assert_numargs_internal directly.
+dnl
+dnl m4_doublequote($`'0) expands to ``$0'', whereas ``$`'0'' would expand
+dnl to `$`'0' and do the wrong thing, and likewise for $1. The same is
+dnl done in other assert macros.
+dnl
+dnl $`#' leaves $# in the new macro being defined, and stops # being
+dnl interpreted as a comment character.
+dnl
+dnl `dnl ' means an explicit dnl isn't necessary when m4_assert_numargs is
+dnl used. The space means that if there is a dnl it'll still work.
+
+dnl Usage: m4_doublequote(x) expands to ``x''
+dnl m4_doublequote(x) expands to ``x''; it also checks its own argument
+dnl count by calling the internal checker directly.
+define(m4_doublequote,
+`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))``$1''')
+
+dnl The unquoted first line checks m4_assert_numargs itself; the quoted
+dnl second line is the checking code compiled into the macro being defined
+dnl (note the $`'0 and $`#' forms which survive into that macro).
+define(m4_assert_numargs,
+`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))dnl
+`m4_assert_numargs_internal'(m4_doublequote($`'0),$1,$`#',`len'(m4_doublequote($`'1)))`dnl '')
+
+dnl Called: m4_assert_numargs_internal(`macroname',wantargs,$#,len(`$1'))
+define(m4_assert_numargs_internal,
+`m4_assert_numargs_internal_check(`$1',`$2',m4_numargs_count(`$3',`$4'))')
+
+dnl Called: m4_assert_numargs_internal_check(`macroname',wantargs,gotargs)
+dnl
+dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it
+dnl should be -1.  If wantargs is -1 but gotargs is 0 and the two can't be
+dnl distinguished then it's allowed to pass.
+dnl
+define(m4_assert_numargs_internal_check,
+`ifelse(eval($2 == $3
+ || ($2==-1 && $3==0 && m4_dollarhash_1_if_noparen_p)),0,
+`m4_error(`$1 expected 'm4_Narguments(`$2')`, got 'm4_Narguments(`$3')
+)')')
+
+dnl Called: m4_numargs_count($#,len(`$1'))
+dnl If $#==0 then -1 args, if $#==1 but len(`$1')==0 then 0 args, otherwise
+dnl $# args.
+define(m4_numargs_count,
+`ifelse($1,0, -1,
+`ifelse(eval($1==1 && $2-0==0),1, 0, $1)')')
+
+dnl Usage: m4_Narguments(N)
+dnl "$1 argument" or "$1 arguments" with the plural according to $1.
+define(m4_Narguments,
+`$1 argument`'ifelse(`$1',1,,s)')
+
+
+dnl --------------------------------------------------------------------------
+dnl Additional error checking things.
+
+
+dnl Usage: m4_file_seen()
+dnl
+dnl Record __file__ for the benefit of m4_file_and_line in m4wrap text.
+dnl The basic __file__ macro comes out quoted, like `foo.asm', and
+dnl m4_file_seen_last is defined like that too.
+dnl
+dnl This only needs to be used with something that could generate an error
+dnl message in m4wrap text. The x86 PROLOGUE is the only such at the
+dnl moment (at end of input its m4wrap checks for missing EPILOGUE). A few
+dnl include()s can easily trick this scheme, but you'd expect an EPILOGUE
+dnl in the same file as the PROLOGUE.
+
+dnl Record the current __file__ (already quoted by GNU m4) so m4wrap-time
+dnl error messages can name it; skipped when __file__ is NONE.
+define(m4_file_seen,
+m4_assert_numargs(0)
+`ifelse(__file__,`NONE',,
+`define(`m4_file_seen_last',m4_doublequote(__file__))')')
+
+
+dnl Usage: m4_assert_onearg()
+dnl
+dnl Put this, unquoted, at the start of a macro definition to add some code
+dnl to check that one argument is passed to the macro, but with that
+dnl argument allowed to be empty. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_onearg()
+dnl `blah blah $1 blah blah')
+dnl
+dnl Calls "foo(xyz)" or "foo()" are accepted. A call "foo(xyz,abc)" fails.
+dnl A call "foo" fails too, but BSD m4 can't detect this case (GNU and SysV
+dnl m4 can).
+
+dnl As with m4_assert_numargs, the $`'0 / $`#' forms compile the check into
+dnl the macro being defined; the check accepts exactly one (possibly empty)
+dnl argument.
+define(m4_assert_onearg,
+m4_assert_numargs(0)
+`m4_assert_onearg_internal'(m4_doublequote($`'0),$`#')`dnl ')
+
+dnl Called: m4_assert_onearg(`macroname',$#)
+define(m4_assert_onearg_internal,
+`ifelse($2,1,,
+`m4_error(`$1 expected 1 argument, got 'm4_Narguments(`$2')
+)')')
+
+
+dnl Usage: m4_assert_numargs_range(low,high)
+dnl
+dnl Put this, unquoted, at the start of a macro definition to add some code
+dnl to check that between low and high many arguments get passed to the
+dnl macro. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_numargs_range(3,5)
+dnl `mandatory $1 $2 $3 optional $4 $5 end')
+dnl
+dnl See m4_assert_numargs() for more info.
+
+dnl The doubly-quoted body is the range check compiled into the macro being
+dnl defined; it runs with that macro's own $# and $1.
+define(m4_assert_numargs_range,
+m4_assert_numargs(2)
+``m4_assert_numargs_range_internal'(m4_doublequote($`'0),$1,$2,$`#',`len'(m4_doublequote($`'1)))`dnl '')
+
+dnl Called: m4_assert_numargs_range_internal(`name',low,high,$#,len(`$1'))
+define(m4_assert_numargs_range_internal,
+m4_assert_numargs(5)
+`m4_assert_numargs_range_check(`$1',`$2',`$3',m4_numargs_count(`$4',`$5'))')
+
+dnl Called: m4_assert_numargs_range_check(`name',low,high,gotargs)
+dnl
+dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it
+dnl should be -1.  To ensure a `high' of -1 works, a fudge is applied to
+dnl gotargs if it's 0 and the 0 and -1 cases can't be distinguished.
+dnl
+define(m4_assert_numargs_range_check,
+m4_assert_numargs(4)
+`ifelse(eval($2 <= $4 &&
+ ($4 - ($4==0 && m4_dollarhash_1_if_noparen_p) <= $3)),0,
+`m4_error(`$1 expected $2 to $3 arguments, got 'm4_Narguments(`$4')
+)')')
+
+
+dnl Usage: m4_assert_defined(symbol)
+dnl
+dnl Put this unquoted on a line of its own at the start of a macro
+dnl definition to add some code to check that the given symbol is defined
+dnl when the macro is used. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_defined(`FOO_PREFIX')
+dnl `FOO_PREFIX whatever')
+dnl
+dnl This is a convenient way to check that the user or ./configure or
+dnl whatever has defined the things needed by a macro, as opposed to
+dnl silently generating garbage.
+
+dnl The doubly-quoted body defers the ifdef test to each use of the macro
+dnl being defined, so `symbol' is checked at use time, not definition time.
+define(m4_assert_defined,
+m4_assert_numargs(1)
+``m4_assert_defined_internal'(m4_doublequote($`'0),``$1'')`dnl '')
+
+dnl Called: m4_assert_defined_internal(`macroname',`define_required')
+define(m4_assert_defined_internal,
+m4_assert_numargs(2)
+`ifdef(`$2',,
+`m4_error(`$1 needs $2 defined
+')')')
+
+
+dnl Usage: m4_not_for_expansion(`SYMBOL')
+dnl define_not_for_expansion(`SYMBOL')
+dnl
+dnl m4_not_for_expansion turns SYMBOL, if defined, into something which
+dnl will give an error if expanded. For example,
+dnl
+dnl m4_not_for_expansion(`PIC')
+dnl
+dnl define_not_for_expansion is the same, but always makes a definition.
+dnl
+dnl These are for symbols that should be tested with ifdef(`FOO',...)
+dnl rather than be expanded as such. They guard against accidentally
+dnl omitting the quotes, as in ifdef(FOO,...). Note though that they only
+dnl catch this when FOO is defined, so be sure to test code both with and
+dnl without each definition.
+
+dnl Redefine SYMBOL (if defined) so that expanding it reports an error;
+dnl ifdef(`SYMBOL',...) still works as before.
+define(m4_not_for_expansion,
+m4_assert_numargs(1)
+`ifdef(`$1',`define_not_for_expansion(`$1')')')
+
+dnl Refuses to munge a symbol carrying a non-empty value, since that value
+dnl would be lost.
+define(define_not_for_expansion,
+m4_assert_numargs(1)
+`ifelse(defn(`$1'),,,
+`m4_error(``$1' has a non-empty value, maybe it shouldnt be munged with m4_not_for_expansion()
+')')dnl
+define(`$1',`m4_not_for_expansion_internal(`$1')')')
+
+define(m4_not_for_expansion_internal,
+`m4_error(``$1' is not meant to be expanded, perhaps you mean `ifdef(`$1',...)'
+')')
+
+
+dnl --------------------------------------------------------------------------
+dnl Various generic m4 things.
+
+
+dnl Usage: m4_ifdef_anyof_p(`symbol',...)
+dnl
+dnl Expand to 1 if any of the symbols in the argument list are defined, or
+dnl to 0 if not.
+
+dnl Recurses through the argument list; the first test also terminates on a
+dnl trailing empty argument so the shift($@) recursion bottoms out cleanly.
+define(m4_ifdef_anyof_p,
+`ifelse(eval($#<=1 && m4_length(`$1')==0),1, 0,
+`ifdef(`$1', 1,
+`m4_ifdef_anyof_p(shift($@))')')')
+
+
+dnl Usage: m4_length(string)
+dnl
+dnl Determine the length of a string.  This is the same as len(), but
+dnl always expands to a number, working around the BSD len() which
+dnl evaluates to nothing given an empty argument.
+
+dnl The -0 turns BSD's empty len() result into 0.
+define(m4_length,
+m4_assert_onearg()
+`eval(len(`$1')-0)')
+
+
+dnl Usage: m4_stringequal_p(x,y)
+dnl
+dnl Expand to 1 or 0 according as strings x and y are equal or not.
+
+define(m4_stringequal_p,
+`ifelse(`$1',`$2',1,0)')
+
+
+dnl Usage: m4_incr_or_decr(n,last)
+dnl
+dnl Do an incr(n) or decr(n), whichever is in the direction of "last".
+dnl Both n and last must be numbers of course.
+
+define(m4_incr_or_decr,
+m4_assert_numargs(2)
+`ifelse(eval($1<$2),1,incr($1),decr($1))')
+
+
+dnl Usage: forloop(i, first, last, statement)
+dnl
+dnl Based on GNU m4 examples/forloop.m4, but extended.
+dnl
+dnl statement is expanded repeatedly, with i successively defined as
+dnl
+dnl first, first+1, ..., last-1, last
+dnl
+dnl Or if first > last, then it's
+dnl
+dnl first, first-1, ..., last+1, last
+dnl
+dnl If first == last, then one expansion is done.
+dnl
+dnl A pushdef/popdef of i is done to preserve any previous definition (or
+dnl lack of definition). first and last are eval()ed and so can be
+dnl expressions.
+dnl
+dnl forloop_first is defined to 1 on the first iteration, 0 on the rest.
+dnl forloop_last is defined to 1 on the last iteration, 0 on the others.
+dnl Nested forloops are allowed, in which case forloop_first and
+dnl forloop_last apply to the innermost loop that's open.
+dnl
+dnl A simple example,
+dnl
+dnl forloop(i, 1, 2*2+1, `dnl
+dnl iteration number i ... ifelse(forloop_first,1,FIRST)
+dnl ')
+
+
+dnl "i" and "statement" are carefully quoted, but "first" and "last" are
+dnl just plain numbers once eval()ed.
+
+dnl pushdef/popdef preserve any outer definitions of the loop variable and
+dnl of forloop_first/forloop_last, which is what makes nesting work.
+define(`forloop',
+m4_assert_numargs(4)
+`pushdef(`$1',eval(`$2'))dnl
+pushdef(`forloop_first',1)dnl
+pushdef(`forloop_last',0)dnl
+forloop_internal(`$1',eval(`$3'),`$4')`'dnl
+popdef(`forloop_first')dnl
+popdef(`forloop_last')dnl
+popdef(`$1')')
+
+dnl Called: forloop_internal(`var',last,statement)
+dnl Expands the statement once per step, walking var towards last via
+dnl m4_incr_or_decr, so ascending and descending loops both work.
+define(`forloop_internal',
+m4_assert_numargs(3)
+`ifelse($1,$2,
+`define(`forloop_last',1)$3',
+`$3`'dnl
+define(`forloop_first',0)dnl
+define(`$1',m4_incr_or_decr($1,$2))dnl
+forloop_internal(`$1',$2,`$3')')')
+
+
+dnl Usage: m4_toupper(x)
+dnl m4_tolower(x)
+dnl
+dnl Convert the argument string to upper or lower case, respectively.
+dnl Only one argument accepted.
+dnl
+dnl BSD m4 doesn't take ranges like a-z in translit(), so the full alphabet
+dnl is written out.
+
+dnl Full alphabets written out because BSD m4 translit() lacks a-z ranges.
+define(m4_alphabet_lower, `abcdefghijklmnopqrstuvwxyz')
+define(m4_alphabet_upper, `ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+
+define(m4_toupper,
+m4_assert_onearg()
+`translit(`$1', m4_alphabet_lower, m4_alphabet_upper)')
+
+define(m4_tolower,
+m4_assert_onearg()
+`translit(`$1', m4_alphabet_upper, m4_alphabet_lower)')
+
+
+dnl Usage: m4_empty_if_zero(x)
+dnl
+dnl Evaluate to x, or to nothing if x is 0. x is eval()ed and so can be an
+dnl expression.
+dnl
+dnl This is useful for x86 addressing mode displacements since forms like
+dnl (%ebx) are one byte shorter than 0(%ebx). A macro `foo' for use as
+dnl foo(%ebx) could be defined with the following so it'll be empty if the
+dnl expression comes out zero.
+dnl
+dnl deflit(`foo', `m4_empty_if_zero(a+b*4-c)')
+dnl
+dnl Naturally this shouldn't be done if, say, a computed jump depends on
+dnl the code being a particular size.
+
+dnl eval()s its argument; expands to nothing when the result is 0, else to
+dnl the evaluated number.
+define(m4_empty_if_zero,
+m4_assert_onearg()
+`ifelse(eval($1),0,,eval($1))')
+
+
+dnl Usage: m4_log2(x)
+dnl
+dnl Calculate a logarithm to base 2.
+dnl x must be an integral power of 2, between 2**0 and 2**30.
+dnl x is eval()ed, so it can be an expression.
+dnl An error results if x is invalid.
+dnl
+dnl 2**31 isn't supported, because an unsigned 2147483648 is out of range
+dnl of a 32-bit signed int. Also, the bug in BSD m4 where an eval()
+dnl resulting in 2147483648 (or -2147483648 as the case may be) gives `-('
+dnl means tests like eval(1<<31==(x)) would be necessary, but that then
+dnl gives an unattractive explosion of eval() error messages if x isn't
+dnl numeric.
+
+dnl Walks n and 2**n upward until 2**n matches the target, erroring out at
+dnl n=30 (see the 2**31 note above).
+define(m4_log2,
+m4_assert_numargs(1)
+`m4_log2_internal(0,1,eval(`$1'))')
+
+dnl Called: m4_log2_internal(n,2**n,target)
+define(m4_log2_internal,
+m4_assert_numargs(3)
+`ifelse($2,$3,$1,
+`ifelse($1,30,
+`m4_error(`m4_log2() argument too big or not a power of two: $3
+')',
+`m4_log2_internal(incr($1),eval(2*$2),$3)')')')
+
+
+dnl Usage: m4_div2_towards_zero
+dnl
+dnl m4 division is probably whatever a C signed division is, and C doesn't
+dnl specify what rounding gets used on negatives, so this expression forces
+dnl a rounding towards zero.
+
+dnl Adds 1 to odd negative values before the /2 so the rounding is always
+dnl towards zero regardless of what the underlying m4 division does.
+define(m4_div2_towards_zero,
+m4_assert_numargs(1)
+`eval((($1) + ((($1)<0) & ($1))) / 2)')
+
+
+dnl Usage: m4_lshift(n,count)
+dnl        m4_rshift(n,count)
+dnl
+dnl Calculate n shifted left or right by count many bits.  Both n and count
+dnl are eval()ed and so can be expressions.
+dnl
+dnl Negative counts are allowed and mean a shift in the opposite direction.
+dnl Negative n is allowed and right shifts will be arithmetic (meaning
+dnl divide by 2**count, rounding towards zero, also meaning the sign bit is
+dnl duplicated).
+dnl
+dnl Use these macros instead of << and >> in eval() since the basic ccs
+dnl SysV m4 doesn't have those operators.
+
+define(m4_rshift,
+m4_assert_numargs(2)
+`m4_lshift(`$1',-(`$2'))')
+
+define(m4_lshift,
+m4_assert_numargs(2)
+`m4_lshift_internal(eval(`$1'),eval(`$2'))')
+
+dnl Shifts one bit per recursion: doubling for positive counts, dividing
+dnl towards zero for negative counts.
+define(m4_lshift_internal,
+m4_assert_numargs(2)
+`ifelse(eval($2-0==0),1,$1,
+`ifelse(eval($2>0),1,
+`m4_lshift_internal(eval($1*2),decr($2))',
+`m4_lshift_internal(m4_div2_towards_zero($1),incr($2))')')')
+
+
+dnl Usage: deflit(name,value)
+dnl
+dnl Like define(), but "name" expands like a literal, rather than taking
+dnl arguments. For example "name(%eax)" expands to "value(%eax)".
+dnl
+dnl Limitations:
+dnl
+dnl $ characters in the value part must have quotes to stop them looking
+dnl like macro parameters. For example, deflit(reg,`123+$`'4+567'). See
+dnl defreg() below for handling simple register definitions like $7 etc.
+dnl
+dnl "name()" is turned into "name", unfortunately. In GNU and SysV m4 an
+dnl error is generated when this happens, but in BSD m4 it will happen
+dnl silently. The problem is that in BSD m4 $# is 1 in both "name" or
+dnl "name()", so there's no way to differentiate them. Because we want
+dnl plain "name" to turn into plain "value", we end up with "name()"
+dnl turning into plain "value" too.
+dnl
+dnl "name(foo)" will lose any whitespace after commas in "foo", for example
+dnl "disp(%eax, %ecx)" would become "128(%eax,%ecx)".
+dnl
+dnl These parentheses oddities shouldn't matter in assembler text, but if
+dnl they do the suggested workaround is to write "name ()" or "name (foo)"
+dnl to stop the parentheses looking like a macro argument list. If a space
+dnl isn't acceptable in the output, then write "name`'()" or "name`'(foo)".
+dnl The `' is stripped when read, but again stops the parentheses looking
+dnl like parameters.
+
+dnl Quoting for deflit_emptyargcheck is similar to m4_assert_numargs. The
+dnl stuff in the ifelse gives a $#, $1 and $@ evaluated in the new macro
+dnl created, not in deflit.
+dnl Quoting for deflit_emptyargcheck is similar to m4_assert_numargs.  The
+dnl stuff in the ifelse gives a $#, $1 and $@ evaluated in the new macro
+dnl created, not in deflit; the trailing ($'`@) re-emits any parenthesized
+dnl arguments after the literal value.
+define(deflit,
+m4_assert_numargs(2)
+`define(`$1',
+`deflit_emptyargcheck'(``$1'',$`#',m4_doublequote($`'1))`dnl
+$2`'dnl
+ifelse(eval($'`#>1 || m4_length('m4_doublequote($`'1)`)!=0),1,($'`@))')')
+
+dnl Called: deflit_emptyargcheck(macroname,$#,`$1')
+dnl Runs inside each deflit-created macro; complains when such a macro is
+dnl invoked as "name()" with a distinguishable empty argument list, since
+dnl the parentheses would be silently dropped from the output.
+dnl Bug fix: the message previously pointed at "asm-incl.m4", but this file
+dnl is asm-defs.m4.
+define(deflit_emptyargcheck,
+`ifelse(eval($2==1 && !m4_dollarhash_1_if_noparen_p && m4_length(`$3')==0),1,
+`m4_error(`dont use a deflit as $1() because it loses the brackets (see deflit in asm-defs.m4 for more information)
+')')')
+
+
+dnl Usage: m4_assert(`expr')
+dnl
+dnl Test a compile-time requirement with an m4 expression. The expression
+dnl should be quoted, and will be eval()ed and expected to be non-zero.
+dnl For example,
+dnl
+dnl m4_assert(`FOO*2+6 < 14')
+
+dnl eval()s the quoted expression at use time and reports it verbatim in
+dnl the failure message.
+define(m4_assert,
+m4_assert_numargs(1)
+`ifelse(eval($1),1,,
+`m4_error(`assertion failed: $1
+')')')
+
+
+dnl --------------------------------------------------------------------------
+dnl Various assembler things, not specific to any particular CPU.
+dnl
+
+
+dnl Usage: include_mpn(`filename')
+dnl
+dnl Like include(), but adds a path to the mpn source directory. For
+dnl example,
+dnl
+dnl include_mpn(`sparc64/addmul_1h.asm')
+
+dnl Requires CONFIG_TOP_SRCDIR (from config.m4) and prefixes it, plus
+dnl /mpn/, onto the given filename before include()ing it.
+define(include_mpn,
+m4_assert_numargs(1)
+m4_assert_defined(`CONFIG_TOP_SRCDIR')
+`include(CONFIG_TOP_SRCDIR`/mpn/$1')')
+
+
+dnl Usage: C comment ...
+dnl
+dnl "C" works like a FORTRAN-style comment character. This can be used for
+dnl comments to the right of assembly instructions, where just dnl would
+dnl remove the linefeed, and concatenate adjacent lines.
+dnl
+dnl "C" and/or "dnl" are useful when an assembler doesn't support comments,
+dnl or where different assemblers for a particular CPU have different
+dnl comment styles. The intermediate ".s" files will end up with no
+dnl comments, just code.
+dnl
+dnl Using "C" is not intended to cause offence to anyone who doesn't like
+dnl FORTRAN; but if that happens it's an unexpected bonus.
+
+dnl C expands to a newline followed by dnl, so everything after it on the
+dnl source line is discarded while the line break itself is kept.
+define(C, `
+dnl')
+
+
+dnl Various possible defines passed from the Makefile that are to be tested
+dnl with ifdef() rather than be expanded.
+
+dnl Each of these is an ifdef-style flag; munging it with
+dnl m4_not_for_expansion makes an accidental bare expansion an error.
+m4_not_for_expansion(`PIC')
+
+dnl aors_n
+m4_not_for_expansion(`OPERATION_add_n')
+m4_not_for_expansion(`OPERATION_sub_n')
+
+dnl aorsmul_n
+m4_not_for_expansion(`OPERATION_addmul_1')
+m4_not_for_expansion(`OPERATION_submul_1')
+
+dnl logops_n
+m4_not_for_expansion(`OPERATION_and_n')
+m4_not_for_expansion(`OPERATION_andn_n')
+m4_not_for_expansion(`OPERATION_nand_n')
+m4_not_for_expansion(`OPERATION_ior_n')
+m4_not_for_expansion(`OPERATION_iorn_n')
+m4_not_for_expansion(`OPERATION_nior_n')
+m4_not_for_expansion(`OPERATION_xor_n')
+m4_not_for_expansion(`OPERATION_xnor_n')
+
+dnl popham
+m4_not_for_expansion(`OPERATION_popcount')
+m4_not_for_expansion(`OPERATION_hamdist')
+
+
+dnl Usage: m4_config_gmp_mparam(`symbol')
+dnl
+dnl Check that `symbol' is defined. If it isn't, issue an error and
+dnl terminate immediately. The error message explains that the symbol
+dnl should be in config.m4, copied from gmp-mparam.h.
+dnl
+dnl Processing is terminated immediately since missing something like
+dnl KARATSUBA_SQR_THRESHOLD can lead to infinite loops with endless error
+dnl messages.
+
+dnl m4exit(1) rather than m4_error alone, since a missing threshold symbol
+dnl can otherwise produce endless cascading errors (see comment above).
+define(m4_config_gmp_mparam,
+m4_assert_numargs(1)
+`ifdef(`$1',,
+`m4_error(`$1 is not defined.
+	"configure" should have extracted this from gmp-mparam.h and put it
+	in config.m4, but somehow this has failed.
+')m4exit(1)')')
+
+
+dnl Usage: defreg(name,reg)
+dnl
+dnl Give a name to a $ style register. For example,
+dnl
+dnl defreg(foo,$12)
+dnl
+dnl defreg() inserts an extra pair of quotes after the $ so that it's not
+dnl interpreted as an m4 macro parameter, ie. foo is actually $`'12. m4
+dnl strips those quotes when foo is expanded.
+dnl
+dnl deflit() is used to make the new definition, so it will expand
+dnl literally even if followed by parentheses ie. foo(99) will become
+dnl $12(99). (But there's nowhere that would be used is there?)
+dnl
+dnl When making further definitions from existing defreg() macros, remember
+dnl to use defreg() again to protect the $ in the new definitions too. For
+dnl example,
+dnl
+dnl defreg(a0,$4)
+dnl defreg(a1,$5)
+dnl ...
+dnl
+dnl defreg(PARAM_DST,a0)
+dnl
+dnl This is only because a0 is expanding at the time the PARAM_DST
+dnl definition is made, leaving a literal $4 that must be re-quoted. On
+dnl the other hand in something like the following ra is only expanded when
+dnl ret is used and its $`'31 protection will have its desired effect at
+dnl that time.
+dnl
+dnl defreg(ra,$31)
+dnl ...
+dnl define(ret,`j ra')
+dnl
+dnl Note that only $n forms are meant to be used here, and something like
+dnl 128($30) doesn't get protected and will come out wrong.
+
+dnl Splits `$2' into its first character (the $) and the rest, with a
+dnl quoted `' between them so the $ is not read as a macro parameter.
+define(defreg,
+m4_assert_numargs(2)
+`deflit(`$1',
+substr(`$2',0,1)``''substr(`$2',1))')
+
+
+dnl Usage: m4_instruction_wrapper(num)
+dnl
+dnl Put this, unquoted, on a line on its own, at the start of a macro
+dnl that's a wrapper around an assembler instruction. It adds code to give
+dnl a descriptive error message if the macro is invoked without arguments.
+dnl
+dnl For example, suppose jmp needs to be wrapped,
+dnl
+dnl define(jmp,
+dnl m4_instruction_wrapper()
+dnl m4_assert_numargs(1)
+dnl `.byte 0x42
+dnl .long $1
+dnl nop')
+dnl
+dnl The point of m4_instruction_wrapper is to get a better error message
+dnl than m4_assert_numargs would give if jmp is accidentally used as plain
+dnl "jmp foo" instead of the intended "jmp( foo)". "jmp()" with no
+dnl argument also provokes the error message.
+dnl
+dnl m4_instruction_wrapper should only be used with wrapped instructions
+dnl that take arguments, since obviously something meant to be used as
+dnl plain "ret", say, doesn't want to give an error when used that way.
+
+define(m4_instruction_wrapper,
+m4_assert_numargs(0)
+``m4_instruction_wrapper_internal'(m4_doublequote($`'0),dnl
+m4_doublequote(ifdef(`__file__',__file__,`the m4 sources')),dnl
+$`#',m4_doublequote($`'1))`dnl'')
+
+dnl Called: m4_instruction_wrapper_internal($0,`filename',$#,$1)
+define(m4_instruction_wrapper_internal,
+`ifelse(eval($3<=1 && m4_length(`$4')==0),1,
+`m4_error(`$1 is a macro replacing that instruction and needs arguments, see $2 for details
+')')')
+
+
+dnl Usage: UNROLL_LOG2, UNROLL_MASK, UNROLL_BYTES
+dnl CHUNK_LOG2, CHUNK_MASK, CHUNK_BYTES
+dnl
+dnl When code supports a variable amount of loop unrolling, the convention
+dnl is to define UNROLL_COUNT to the number of limbs processed per loop.
+dnl When testing code this can be varied to see how much the loop overhead
+dnl is costing. For example,
+dnl
+dnl deflit(UNROLL_COUNT, 32)
+dnl
+dnl If the forloop() generating the unrolled loop has a pattern processing
+dnl more than one limb, the convention is to express this with CHUNK_COUNT.
+dnl For example,
+dnl
+dnl deflit(CHUNK_COUNT, 2)
+dnl
+dnl The LOG2, MASK and BYTES definitions below are derived from these COUNT
+dnl definitions. If COUNT is redefined, the LOG2, MASK and BYTES follow
+dnl the new definition automatically.
+dnl
+dnl LOG2 is the log base 2 of COUNT. MASK is COUNT-1, which can be used as
+dnl a bit mask. BYTES is BYTES_PER_MP_LIMB*COUNT, the number of bytes
+dnl processed in each unrolled loop.
+dnl
+dnl BYTES_PER_MP_LIMB is defined in a CPU specific m4 include file. It
+dnl exists only so the BYTES definitions here can be common to all CPUs.
+dnl In the actual code for a given CPU, an explicit 4 or 8 may as well be
+dnl used because the code is only for a particular CPU, it doesn't need to
+dnl be general.
+dnl
+dnl Note that none of these macros do anything except give conventional
+dnl names to commonly used things. You still have to write your own
+dnl expressions for a forloop() and the resulting address displacements.
+dnl Something like the following would be typical for 4 bytes per limb.
+dnl
+dnl forloop(`i',0,UNROLL_COUNT-1,`
+dnl deflit(`disp',eval(i*4))
+dnl ...
+dnl ')
+dnl
+dnl Or when using CHUNK_COUNT,
+dnl
+dnl forloop(`i',0,UNROLL_COUNT/CHUNK_COUNT-1,`
+dnl deflit(`disp0',eval(i*CHUNK_COUNT*4))
+dnl deflit(`disp1',eval(disp0+4))
+dnl ...
+dnl ')
+dnl
+dnl Clearly `i' can be run starting from 1, or from high to low or whatever
+dnl best suits.
+
+deflit(UNROLL_LOG2,
+m4_assert_defined(`UNROLL_COUNT')
+`m4_log2(UNROLL_COUNT)')
+
+deflit(UNROLL_MASK,
+m4_assert_defined(`UNROLL_COUNT')
+`eval(UNROLL_COUNT-1)')
+
+deflit(UNROLL_BYTES,
+m4_assert_defined(`UNROLL_COUNT')
+m4_assert_defined(`BYTES_PER_MP_LIMB')
+`eval(UNROLL_COUNT * BYTES_PER_MP_LIMB)')
+
+deflit(CHUNK_LOG2,
+m4_assert_defined(`CHUNK_COUNT')
+`m4_log2(CHUNK_COUNT)')
+
+deflit(CHUNK_MASK,
+m4_assert_defined(`CHUNK_COUNT')
+`eval(CHUNK_COUNT-1)')
+
+deflit(CHUNK_BYTES,
+m4_assert_defined(`CHUNK_COUNT')
+m4_assert_defined(`BYTES_PER_MP_LIMB')
+`eval(CHUNK_COUNT * BYTES_PER_MP_LIMB)')
+
+
+dnl Usage: MPN(name)
+dnl
+dnl Add MPN_PREFIX to a name.
+dnl MPN_PREFIX defaults to "__gmpn_" if not defined.
+
+ifdef(`MPN_PREFIX',,
+`define(`MPN_PREFIX',`__gmpn_')')
+
+define(MPN,
+m4_assert_numargs(1)
+`MPN_PREFIX`'$1')
+
+
+dnl Usage: mpn_add_n, etc
+dnl
+dnl Convenience definitions using MPN(), like the #defines in gmp.h. Each
+dnl function that might be implemented in assembler is here.
+
+define(define_mpn,
+m4_assert_numargs(1)
+`define(`mpn_$1',`MPN(`$1')')')
+
+define_mpn(add)
+define_mpn(add_1)
+define_mpn(add_n)
+define_mpn(add_nc)
+define_mpn(addmul_1)
+define_mpn(addmul_1c)
+define_mpn(addsub_n)
+define_mpn(addsub_nc)
+define_mpn(and_n)
+define_mpn(andn_n)
+define_mpn(bdivmod)
+define_mpn(cmp)
+define_mpn(com_n)
+define_mpn(copyd)
+define_mpn(copyi)
+define_mpn(divexact_by3c)
+define_mpn(divrem)
+define_mpn(divrem_1)
+define_mpn(divrem_1c)
+define_mpn(divrem_2)
+define_mpn(divrem_classic)
+define_mpn(divrem_newton)
+define_mpn(dump)
+define_mpn(gcd)
+define_mpn(gcd_1)
+define_mpn(gcdext)
+define_mpn(get_str)
+define_mpn(hamdist)
+define_mpn(invert_limb)
+define_mpn(ior_n)
+define_mpn(iorn_n)
+define_mpn(kara_mul_n)
+define_mpn(kara_sqr_n)
+define_mpn(lshift)
+define_mpn(lshiftc)
+define_mpn(mod_1)
+define_mpn(mod_1c)
+define_mpn(mul)
+define_mpn(mul_1)
+define_mpn(mul_1c)
+define_mpn(mul_basecase)
+define_mpn(mul_n)
+define_mpn(perfect_square_p)
+define_mpn(popcount)
+define_mpn(preinv_mod_1)
+define_mpn(nand_n)
+define_mpn(nior_n)
+define_mpn(random)
+define_mpn(random2)
+define_mpn(rshift)
+define_mpn(rshiftc)
+define_mpn(scan0)
+define_mpn(scan1)
+define_mpn(set_str)
+define_mpn(sqr_basecase)
+define_mpn(sub_n)
+define_mpn(sqrtrem)
+define_mpn(sub)
+define_mpn(sub_1)
+define_mpn(sub_n)
+define_mpn(sub_nc)
+define_mpn(submul_1)
+define_mpn(submul_1c)
+define_mpn(toom3_mul_n)
+define_mpn(toom3_sqr_n)
+define_mpn(umul_ppmm)
+define_mpn(udiv_qrnnd)
+define_mpn(xnor_n)
+define_mpn(xor_n)
+
+define(`ASM_START',
+ `')
+
+define(`PROLOGUE',
+ `
+ TEXT
+ ALIGN(4)
+ GLOBL GSYM_PREFIX`$1'
+ TYPE(GSYM_PREFIX`$1',`function')
+GSYM_PREFIX`$1':')
+
+define(`EPILOGUE',
+ `
+ SIZE(GSYM_PREFIX`$1',.-GSYM_PREFIX`$1')')
+
+dnl LSYM_PREFIX might be L$, so defn() must be used to quote it or the L
+dnl will expand as the L macro, an infinite recursion.
+define(`L',`defn(`LSYM_PREFIX')$1')
+
+define(`INT32',
+ `
+ ALIGN(4)
+$1:
+ W32 $2
+ ')
+
+define(`INT64',
+ `
+ ALIGN(8)
+$1:
+ W32 $2
+ W32 $3
+ ')
+
+
+dnl Usage: ALIGN(bytes)
+dnl
+dnl Emit a ".align" directive. The alignment is specified in bytes, and
+dnl will normally need to be a power of 2. The actual ".align" generated
+dnl is either bytes or logarithmic according to what ./configure detects.
+dnl
+dnl ALIGN_FILL_0x90, if defined and equal to "yes", means a ", 0x90" should
+dnl be appended (this is for x86).
+
+define(ALIGN,
+m4_assert_numargs(1)
+m4_assert_defined(`ALIGN_LOGARITHMIC')
+`.align ifelse(ALIGN_LOGARITHMIC,yes,`m4_log2($1)',`eval($1)')dnl
+ifelse(ALIGN_FILL_0x90,yes,`, 0x90')')
+
+
+dnl Usage: MULFUNC_PROLOGUE(function function...)
+dnl
+dnl A dummy macro which is grepped for by ./configure to know what
+dnl functions a multi-function file is providing. Use this if there aren't
+dnl explicit PROLOGUE()s for each possible function.
+dnl
+dnl Multiple MULFUNC_PROLOGUEs can be used, or just one with the function
+dnl names separated by spaces.
+
+define(`MULFUNC_PROLOGUE',
+m4_assert_numargs(1)
+`')
+
+
+divert`'dnl
diff --git a/rts/gmp/mpn/clipper/add_n.s b/rts/gmp/mpn/clipper/add_n.s
new file mode 100644
index 0000000000..538a1caed0
--- /dev/null
+++ b/rts/gmp/mpn/clipper/add_n.s
@@ -0,0 +1,48 @@
+; Clipper __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+.text
+ .align 16
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ subq $8,sp
+ storw r6,(sp)
+ loadw 12(sp),r2
+ loadw 16(sp),r3
+ loadq $0,r6 ; clear carry-save register
+
+.Loop: loadw (r1),r4
+ loadw (r2),r5
+ addwc r6,r6 ; restore carry from r6
+ addwc r5,r4
+ storw r4,(r0)
+ subwc r6,r6 ; save carry in r6
+ addq $4,r0
+ addq $4,r1
+ addq $4,r2
+ subq $1,r3
+ brne .Loop
+
+ negw r6,r0
+ loadw (sp),r6
+ addq $8,sp
+ ret sp
diff --git a/rts/gmp/mpn/clipper/mul_1.s b/rts/gmp/mpn/clipper/mul_1.s
new file mode 100644
index 0000000000..c0c756488c
--- /dev/null
+++ b/rts/gmp/mpn/clipper/mul_1.s
@@ -0,0 +1,47 @@
+; Clipper __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+.text
+ .align 16
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ subq $8,sp
+ storw r6,(sp)
+ loadw 12(sp),r2
+ loadw 16(sp),r3
+ loadq $0,r6 ; clear carry limb
+
+.Loop: loadw (r1),r4
+ mulwux r3,r4
+ addw r6,r4 ; add old carry limb into low product limb
+ loadq $0,r6
+ addwc r5,r6 ; propagate cy into high product limb
+ storw r4,(r0)
+ addq $4,r0
+ addq $4,r1
+ subq $1,r2
+ brne .Loop
+
+ movw r6,r0
+ loadw 0(sp),r6
+ addq $8,sp
+ ret sp
diff --git a/rts/gmp/mpn/clipper/sub_n.s b/rts/gmp/mpn/clipper/sub_n.s
new file mode 100644
index 0000000000..44d8797289
--- /dev/null
+++ b/rts/gmp/mpn/clipper/sub_n.s
@@ -0,0 +1,48 @@
+; Clipper __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+.text
+ .align 16
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ subq $8,sp
+ storw r6,(sp)
+ loadw 12(sp),r2
+ loadw 16(sp),r3
+ loadq $0,r6 ; clear carry-save register
+
+.Loop: loadw (r1),r4
+ loadw (r2),r5
+ addwc r6,r6 ; restore carry from r6
+ subwc r5,r4
+ storw r4,(r0)
+ subwc r6,r6 ; save carry in r6
+ addq $4,r0
+ addq $4,r1
+ addq $4,r2
+ subq $1,r3
+ brne .Loop
+
+ negw r6,r0
+ loadw (sp),r6
+ addq $8,sp
+ ret sp
diff --git a/rts/gmp/mpn/cray/README b/rts/gmp/mpn/cray/README
new file mode 100644
index 0000000000..8195c67e21
--- /dev/null
+++ b/rts/gmp/mpn/cray/README
@@ -0,0 +1,14 @@
+The (poorly optimized) code in this directory was originally written for a
+j90 system, but finished on a c90. It should work on all Cray vector
+computers. For the T3E and T3D systems, the `alpha' subdirectory at the
+same level as the directory containing this file, is much better.
+
+* `+' seems to be faster than `|' when combining carries.
+
+* It is possible that the best multiply performance would be achived by
+ storing only 24 bits per element, and using lazy carry propagation. Before
+ calling i24mult, full carry propagation would be needed.
+
+* Supply tasking versions of the C loops.
+
+
diff --git a/rts/gmp/mpn/cray/add_n.c b/rts/gmp/mpn/cray/add_n.c
new file mode 100644
index 0000000000..1fdb394993
--- /dev/null
+++ b/rts/gmp/mpn/cray/add_n.c
@@ -0,0 +1,96 @@
+/* mpn_add_n -- Add two limb vectors of equal, non-zero length.
+ For Cray vector processors.
+
+ Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_add_n (c, a, b, n)
+ mp_ptr c;
+ mp_srcptr a, b;
+ mp_size_t n;
+{
+ mp_size_t i;
+ mp_size_t nm1 = n - 1;
+ int more_carries = 0;
+ int carry_out;
+
+ /* For small operands the non-vector code is faster. */
+ if (n < 16)
+ goto sequential;
+
+ if (a == c || b == c)
+ {
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ if (c == a)
+ {
+ /* allocate temp space for a */
+ mp_ptr ax = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ MPN_COPY (ax, a, n);
+ a = (mp_srcptr) ax;
+ }
+ if (c == b)
+ {
+ /* allocate temp space for b */
+ mp_ptr bx = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ MPN_COPY (bx, b, n);
+ b = (mp_srcptr) bx;
+ }
+ carry_out = mpn_add_n (c, a, b, n);
+ TMP_FREE (marker);
+ return carry_out;
+ }
+
+ carry_out = a[nm1] + b[nm1] < a[nm1];
+
+#pragma _CRI ivdep /* Cray PVP systems */
+ for (i = nm1; i > 0; i--)
+ {
+ int cy_in;
+ cy_in = a[i - 1] + b[i - 1] < a[i - 1];
+ c[i] = a[i] + b[i] + cy_in;
+ more_carries += c[i] < cy_in;
+ }
+ c[0] = a[0] + b[0];
+
+ if (more_carries)
+ {
+ /* This won't vectorize, but we should come here rarely. */
+ int cy;
+ sequential:
+ cy = 0;
+ for (i = 0; i < n; i++)
+ {
+ mp_limb_t ai, ci, t;
+ ai = a[i];
+ t = b[i] + cy;
+ cy = t < cy;
+ ci = ai + t;
+ cy += ci < ai;
+ c[i] = ci;
+ }
+ carry_out = cy;
+ }
+
+ return carry_out;
+}
diff --git a/rts/gmp/mpn/cray/addmul_1.c b/rts/gmp/mpn/cray/addmul_1.c
new file mode 100644
index 0000000000..031b4e8e8d
--- /dev/null
+++ b/rts/gmp/mpn/cray/addmul_1.c
@@ -0,0 +1,46 @@
+/* mpn_addmul_1 for Cray PVP.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+ mp_ptr p0, p1, tp;
+ mp_limb_t cy_limb;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ tp = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+ GMPN_MULWW (p1, p0, up, &n, &limb);
+ cy_limb = mpn_add_n (tp, rp, p0, n);
+ rp[0] = tp[0];
+ cy_limb += mpn_add_n (rp + 1, tp + 1, p1, n - 1);
+ cy_limb += p1[n - 1];
+
+ TMP_FREE (marker);
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/cray/gmp-mparam.h b/rts/gmp/mpn/cray/gmp-mparam.h
new file mode 100644
index 0000000000..14f7b8e05b
--- /dev/null
+++ b/rts/gmp/mpn/cray/gmp-mparam.h
@@ -0,0 +1,27 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 64
+#define BITS_PER_SHORTINT 32
+#define BITS_PER_CHAR 8
diff --git a/rts/gmp/mpn/cray/mul_1.c b/rts/gmp/mpn/cray/mul_1.c
new file mode 100644
index 0000000000..0c8750b4ac
--- /dev/null
+++ b/rts/gmp/mpn/cray/mul_1.c
@@ -0,0 +1,44 @@
+/* mpn_mul_1 for Cray PVP.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+ mp_ptr p0, p1;
+ mp_limb_t cy_limb;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+ GMPN_MULWW (p1, p0, up, &n, &limb);
+ rp[0] = p0[0];
+ cy_limb = mpn_add_n (rp + 1, p0 + 1, p1, n - 1);
+ cy_limb += p1[n - 1];
+
+ TMP_FREE (marker);
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/cray/mulww.f b/rts/gmp/mpn/cray/mulww.f
new file mode 100644
index 0000000000..99507c1e44
--- /dev/null
+++ b/rts/gmp/mpn/cray/mulww.f
@@ -0,0 +1,54 @@
+c Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+c Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+c This file is part of the GNU MP Library.
+
+c The GNU MP Library is free software; you can redistribute it and/or
+c modify it under the terms of the GNU Lesser General Public License as
+c published by the Free Software Foundation; either version 2.1 of the
+c License, or (at your option) any later version.
+
+c The GNU MP Library is distributed in the hope that it will be useful,
+c but WITHOUT ANY WARRANTY; without even the implied warranty of
+c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+c Lesser General Public License for more details.
+
+c You should have received a copy of the GNU Lesser General Public
+c License along with the GNU MP Library; see the file COPYING.LIB. If
+c not, write to the Free Software Foundation, Inc., 59 Temple Place -
+c Suite 330, Boston, MA 02111-1307, USA.
+
+c p1[] = hi(a[]*s); the upper limbs of each product
+c p0[] = low(a[]*s); the corresponding lower limbs
+c n is number of limbs in the vectors
+
+ subroutine gmpn_mulww(p1,p0,a,n,s)
+ integer*8 p1(0:*),p0(0:*),a(0:*),s
+ integer n
+
+ integer*8 a0,a1,a2,s0,s1,s2,c
+ integer*8 ai,t0,t1,t2,t3,t4
+
+ s0 = shiftl(and(s,4194303),24)
+ s1 = shiftl(and(shiftr(s,22),4194303),24)
+ s2 = shiftl(and(shiftr(s,44),4194303),24)
+
+ do i = 0,n-1
+ ai = a(i)
+ a0 = shiftl(and(ai,4194303),24)
+ a1 = shiftl(and(shiftr(ai,22),4194303),24)
+ a2 = shiftl(and(shiftr(ai,44),4194303),24)
+
+ t0 = i24mult(a0,s0)
+ t1 = i24mult(a0,s1)+i24mult(a1,s0)
+ t2 = i24mult(a0,s2)+i24mult(a1,s1)+i24mult(a2,s0)
+ t3 = i24mult(a1,s2)+i24mult(a2,s1)
+ t4 = i24mult(a2,s2)
+
+ p0(i)=shiftl(t2,44)+shiftl(t1,22)+t0
+ c=shiftr(shiftr(t0,22)+and(t1,4398046511103)+
+ $ shiftl(and(t2,1048575),22),42)
+ p1(i)=shiftl(t4,24)+shiftl(t3,2)+shiftr(t2,20)+shiftr(t1,42)+c
+ end do
+ end
diff --git a/rts/gmp/mpn/cray/mulww.s b/rts/gmp/mpn/cray/mulww.s
new file mode 100644
index 0000000000..890cdcf94d
--- /dev/null
+++ b/rts/gmp/mpn/cray/mulww.s
@@ -0,0 +1,245 @@
+* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+* Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+* This file is generated from mulww.f in this same directory.
+
+* This file is part of the GNU MP Library.
+
+* The GNU MP Library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public License as
+* published by the Free Software Foundation; either version 2.1 of the
+* License, or (at your option) any later version.
+
+* The GNU MP Library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+
+* You should have received a copy of the GNU Lesser General Public
+* License along with the GNU MP Library; see the file COPYING.LIB. If
+* not, write to the Free Software Foundation, Inc., 59 Temple Place -
+* Suite 330, Boston, MA 02111-1307, USA.
+
+ IDENT GMPN_MULWW
+**********************************************
+* Assemble with Cal Version 2.0 *
+* *
+* Generated by CFT77 6.0.4.19 *
+* on 06/27/00 at 04:34:13 *
+* *
+**********************************************
+* ALLOW UNDERSCORES IN IDENTIFIERS
+ EDIT OFF
+ FORMAT NEW
+@DATA SECTION DATA,CM
+@DATA = W.*
+ CON O'0000000000040000000000
+ CON O'0435152404713723252514 ;GMPN_MUL 1
+ CON O'0535270000000000000000 ;WW 1
+ CON O'0000000000000001200012 ;trbk tbl 1
+ VWD 32/0,32/P.GMPN_MULWW ;trbk tbl 1
+ CON O'0014003000000000001416 ;trbk tbl 1
+ CON O'0000000000000000000011 ;trbk tbl 1
+ CON O'0000000000000000000215 ;trbk tbl 1
+ BSSZ 1 ;trbk tbl 1
+@CODE SECTION CODE
+@CODE = P.*
+L3 = P.* ; 1
+ A0 A6 ;arg base 1
+ A5 6 ;num Darg 1
+ B03,A5 0,A0 ;load DAs 1
+ A0 A1+A2 ; 1
+ A5 1 ;num Ts 1
+ 0,A0 T00,A5 ; 1
+ B02 A2 ;new base 1
+ B66 A3 ;stk top 1
+ B01 A6 ;arg base 1
+ A7 P.L4 ;ofrn rtn 1
+ B00 A7 ;return 1
+ A6 @DATA ; 1
+ J $STKOFEN ;$STKOFEN 1
+GMPN_MULWW = P.* ; 1
+ A0 @DATA+3 ;(trbk) 1
+ B77 A0 ;(trbk) 1
+ A1 13 ;num Bs 1
+ A0 B66 ;stk top 1
+ A2 B66 ;stk tmp 1
+ A4 B67 ;stk limt 1
+ 0,A0 B77,A1 ; 1
+ A7 782 ;stk size 1
+ A3 A2+A7 ; 1
+ A0 A4-A3 ; 1
+ JAM L3 ;overflow 1
+ A0 A6 ;arg base 1
+ A5 6 ;num Darg 1
+ B03,A5 0,A0 ;load DAs 1
+ A0 A1+A2 ; 1
+ A5 1 ;num Ts 1
+ 0,A0 T00,A5 ; 1
+ B02 A2 ;new base 1
+ B66 A3 ;new top 1
+ B01 A6 ;arg base 1
+L4 = P.* ;ofrn rtn 1
+ A7 B07 ;regs 14
+ S7 0,A7 ; 14
+ A6 B10 ;regs 9
+ S6 0,A6 ; 9
+ S5 1 ; 14
+ S4 <22 ; 9
+ S7 S7-S5 ; 14
+ S5 #S7 ; 14
+ T00 S6 ;regs 10
+ S6 S6>22 ; 10
+ S7 T00 ;regs 11
+ S7 S7>44 ; 11
+ S3 T00 ;regs 9
+ S3 S3&S4 ; 9
+ S6 S6&S4 ; 10
+ S7 S7&S4 ; 11
+ S3 S3<24 ; 9
+ S6 S6<24 ; 10
+ S7 S7<24 ; 11
+ S0 S5 ;regs 14
+ S4 S5 ;regs 14
+ S1 S6 ;regs 14
+ S2 S3 ;regs 14
+ S3 S7 ;regs 14
+ JSP L5 ; 14
+L6 = P.* ; 14
+ S7 -S4 ; 14
+ A2 S7 ;regs 14
+ VL A2 ;regs 14
+ A3 B06 ;s_bt_sp 14
+ A5 B05 ;s_bt_sp 14
+ A4 B04 ;s_bt_sp 14
+ A1 VL ; 14
+ A2 S4 ;regs 14
+L7 = P.* ; 14
+ A0 A3 ;regs 15
+ VL A1 ;regs 15
+ V7 ,A0,1 ; 15
+ B11 A5 ;s_bt_sp 15
+ A7 22 ; 17
+ B12 A4 ;s_bt_sp 17
+ V6 V7>A7 ; 17
+ B13 A3 ;s_bt_sp 17
+ S7 <22 ; 17
+ A3 B02 ;s_bt_sp 17
+ V5 S7&V6 ; 17
+ A6 24 ; 17
+ V4 V5<A6 ; 17
+ V3 S1*FV4 ; 22
+ V2 S7&V7 ; 16
+ V1 V2<A6 ; 16
+ V0 S3*FV1 ; 22
+ V6 V0+V3 ; 22
+ A5 44 ; 18
+ V5 V7>A5 ; 18
+ V2 S1*FV1 ; 21
+ V3 S7&V5 ; 18
+ A0 14 ; 34
+ B77 A0 ;regs 34
+ A4 B77 ;regs 34
+ A0 A4+A3 ; 34
+ ,A0,1 V2 ;v_ld_str 34
+ V0 V3<A6 ; 18
+ V7 S2*FV1 ; 20
+ A4 142 ; 34
+ A0 A4+A3 ; 34
+ ,A0,1 V7 ;v_ld_str 34
+ V5 V7>A7 ; 28
+ V2 S2*FV0 ; 22
+ V3 V6+V2 ; 22
+ S7 <20 ; 28
+ V1 S7&V3 ; 28
+ A4 270 ; 34
+ A0 A4+A3 ; 34
+ ,A0,1 V0 ;v_ld_str 34
+ A4 14 ; 34
+ A0 A4+A3 ; 34
+ V7 ,A0,1 ;v_ld_str 34
+ V6 V1<A7 ; 28
+ V2 S2*FV4 ; 21
+ V0 V7+V2 ; 21
+ S7 <42 ; 28
+ V1 S7&V0 ; 28
+ A4 398 ; 34
+ A0 A4+A3 ; 34
+ ,A0,1 V0 ;v_ld_str 34
+ V7 S3*FV4 ; 23
+ V2 V5+V1 ; 28
+ V0 V3<A5 ; 26
+ A5 526 ; 34
+ A0 A5+A3 ; 34
+ ,A0,1 V0 ;v_ld_str 34
+ A5 270 ; 34
+ A0 A5+A3 ; 34
+ V4 ,A0,1 ;v_ld_str 34
+ V5 V2+V6 ; 28
+ A5 20 ; 32
+ V1 V3>A5 ; 32
+ V0 S1*FV4 ; 23
+ A5 654 ; 34
+ A0 A5+A3 ; 34
+ ,A0,1 V1 ;v_ld_str 34
+ V6 V7+V0 ; 23
+ A5 2 ; 32
+ V2 V6<A5 ; 32
+ V3 S3*FV4 ; 24
+ A5 142 ; 34
+ A0 A5+A3 ; 34
+ V1 ,A0,1 ;v_ld_str 34
+ A5 526 ; 34
+ A0 A5+A3 ; 34
+ V7 ,A0,1 ;v_ld_str 34
+ V0 V1+V7 ; 26
+ V6 V3<A6 ; 32
+ V4 V6+V2 ; 32
+ A6 42 ; 28
+ V7 V5>A6 ; 28
+ A5 654 ; 34
+ CPW ;cmr_vrsp 34
+ A0 A5+A3 ; 34
+ V1 ,A0,1 ;v_ld_str 34
+ A5 398 ; 34
+ A0 A5+A3 ; 34
+ V3 ,A0,1 ;v_ld_str 34
+ V6 V4+V1 ; 32
+ V2 V3>A6 ; 32
+ V5 V6+V2 ; 32
+ A6 B12 ;s_bt_sp 32
+ V4 V3<A7 ; 26
+ A7 B13 ;regs 34
+ A3 A7+A1 ; 34
+ A7 B11 ;regs 34
+ A5 A7+A1 ; 34
+ A4 A6+A1 ; 34
+ A7 A2+A1 ; 34
+ A0 A2+A1 ; 34
+ A2 128 ; 34
+ B13 A0 ;s_bt_sp 34
+ V1 V0+V4 ; 26
+ A0 B11 ;regs 31
+ ,A0,1 V1 ; 31
+ V6 V5+V7 ; 33
+ A0 A6 ;regs 33
+ ,A0,1 V6 ; 33
+ A0 B13 ;regs 34
+ A1 A2 ;regs 34
+ A2 A7 ;regs 34
+ JAN L7 ; 34
+L8 = P.* ; 34
+L5 = P.* ; 34
+ S1 0 ; 35
+ A0 B02 ; 35
+ A2 B02 ; 35
+ A1 13 ;num Bs 35
+ B66 A0 ; 35
+ B77,A1 0,A0 ; 35
+ A0 A2+A1 ; 35
+ A1 1 ;num Ts 35
+ T00,A1 0,A0 ; 35
+ J B00 ; 35
+ EXT $STKOFEN:p
+ ENTRY GMPN_MULWW
+ END
diff --git a/rts/gmp/mpn/cray/sub_n.c b/rts/gmp/mpn/cray/sub_n.c
new file mode 100644
index 0000000000..902e07a727
--- /dev/null
+++ b/rts/gmp/mpn/cray/sub_n.c
@@ -0,0 +1,97 @@
+/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
+ For Cray vector processors.
+
+ Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_sub_n (c, a, b, n)
+ mp_ptr c;
+ mp_srcptr a, b;
+ mp_size_t n;
+{
+ mp_size_t i;
+ mp_size_t nm1 = n - 1;
+ int more_carries = 0;
+ int carry_out;
+
+ /* For small operands the non-vector code is faster. */
+ if (n < 16)
+ goto sequential;
+
+ if (a == c || b == c)
+ {
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ if (c == a)
+ {
+ /* allocate temp space for a */
+ mp_ptr ax = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ MPN_COPY (ax, a, n);
+ a = (mp_srcptr) ax;
+ }
+ if (c == b)
+ {
+ /* allocate temp space for b */
+ mp_ptr bx = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ MPN_COPY (bx, b, n);
+ b = (mp_srcptr) bx;
+ }
+ carry_out = mpn_sub_n (c, a, b, n);
+ TMP_FREE (marker);
+ return carry_out;
+ }
+
+ carry_out = a[nm1] < b[nm1];
+
+#pragma _CRI ivdep /* Cray PVP systems */
+ for (i = nm1; i > 0; i--)
+ {
+ int cy_in; mp_limb_t t;
+ cy_in = a[i - 1] < b[i - 1];
+ t = a[i] - b[i];
+ more_carries += t < cy_in;
+ c[i] = t - cy_in;
+ }
+ c[0] = a[0] - b[0];
+
+ if (more_carries)
+ {
+ /* This won't vectorize, but we should come here rarely. */
+ int cy;
+ sequential:
+ cy = 0;
+ for (i = 0; i < n; i++)
+ {
+ mp_limb_t ai, ci, t;
+ ai = a[i];
+ t = b[i] + cy;
+ cy = t < cy;
+ ci = ai - t;
+ cy += ci > ai;
+ c[i] = ci;
+ }
+ carry_out = cy;
+ }
+
+ return carry_out;
+}
diff --git a/rts/gmp/mpn/cray/submul_1.c b/rts/gmp/mpn/cray/submul_1.c
new file mode 100644
index 0000000000..4d2fb13c62
--- /dev/null
+++ b/rts/gmp/mpn/cray/submul_1.c
@@ -0,0 +1,46 @@
+/* mpn_submul_1 for Cray PVP.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+ mp_ptr p0, p1, tp;
+ mp_limb_t cy_limb;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ tp = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+ GMPN_MULWW (p1, p0, up, &n, &limb);
+ cy_limb = mpn_sub_n (tp, rp, p0, n);
+ rp[0] = tp[0];
+ cy_limb += mpn_sub_n (rp + 1, tp + 1, p1, n - 1);
+ cy_limb += p1[n - 1];
+
+ TMP_FREE (marker);
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/add_n.c b/rts/gmp/mpn/generic/add_n.c
new file mode 100644
index 0000000000..5fcb7e4835
--- /dev/null
+++ b/rts/gmp/mpn/generic/add_n.c
@@ -0,0 +1,62 @@
+/* mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+#if __STDC__
+mpn_add_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
+#else
+mpn_add_n (res_ptr, s1_ptr, s2_ptr, size)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_srcptr s2_ptr;
+ mp_size_t size;
+#endif
+{
+ register mp_limb_t x, y, cy;
+ register mp_size_t j;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ s2_ptr -= j;
+ res_ptr -= j;
+
+ cy = 0;
+ do
+ {
+ y = s2_ptr[j];
+ x = s1_ptr[j];
+ y += cy; /* add previous carry to one addend */
+ cy = (y < cy); /* get out carry from that addition */
+ y = x + y; /* add other addend */
+ cy = (y < x) + cy; /* get out carry from that add, combine */
+ res_ptr[j] = y;
+ }
+ while (++j != 0);
+
+ return cy;
+}
diff --git a/rts/gmp/mpn/generic/addmul_1.c b/rts/gmp/mpn/generic/addmul_1.c
new file mode 100644
index 0000000000..746ae31307
--- /dev/null
+++ b/rts/gmp/mpn/generic/addmul_1.c
@@ -0,0 +1,65 @@
+/* mpn_addmul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
+ by S2_LIMB, add the S1_SIZE least significant limbs of the product to the
+ limb vector pointed to by RES_PTR. Return the most significant limb of
+ the product, adjusted for carry-out from the addition.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_addmul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ mp_size_t s1_size;
+ register mp_limb_t s2_limb;
+{
+ register mp_limb_t cy_limb;
+ register mp_size_t j;
+ register mp_limb_t prod_high, prod_low;
+ register mp_limb_t x;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -s1_size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ res_ptr -= j;
+ s1_ptr -= j;
+
+ cy_limb = 0;
+ do
+ {
+ umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb) + prod_high;
+
+ x = res_ptr[j];
+ prod_low = x + prod_low;
+ cy_limb += (prod_low < x);
+ res_ptr[j] = prod_low;
+ }
+ while (++j != 0);
+
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/addsub_n.c b/rts/gmp/mpn/generic/addsub_n.c
new file mode 100644
index 0000000000..c9bab3ef60
--- /dev/null
+++ b/rts/gmp/mpn/generic/addsub_n.c
@@ -0,0 +1,167 @@
+/* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifndef L1_CACHE_SIZE
+#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */
+#endif
+
+#define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6)
+
+
/* mpn_addsub_n -- compute sum and difference in one pass.
     r1[] = s1[] + s2[]
     r2[] = s1[] - s2[]
   All operands have n limbs.
   In-place operations allowed (r1/r2 may coincide with s1/s2).
   Returns 2*(add carry-out) + (subtract borrow-out), a value in 0..3.  */
mp_limb_t
#if __STDC__
mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
#else
mpn_addsub_n (r1p, r2p, s1p, s2p, n)
     mp_ptr r1p, r2p;
     mp_srcptr s1p, s2p;
     mp_size_t n;
#endif
{
  mp_limb_t acyn, acyo;		/* carry for add */
  mp_limb_t scyn, scyo;		/* carry for subtract */
  mp_size_t off;		/* offset in operands */
  mp_size_t this_n;		/* size of current chunk */

  /* We alternatingly add and subtract in chunks that fit into the (L1)
     cache.  Since the chunks are several hundred limbs, the function call
     overhead is insignificant, but we get much better locality. */

  /* We have three variants of the inner loop; the proper loop is chosen
     depending on whether r1 or r2 are the same operand as s1 or s2.
     NOTE(review): the "#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n"
     conditions presumably mean: use the carry-in entry point when a native
     one exists, or when no native mpn_add_n exists either (the generic
     mpn_add_nc being available then) -- confirm against configure.  */

  if (r1p != s1p && r1p != s2p)
    {
      /* r1 is not identical to either input operand.  We can therefore write
	 to r1 directly, without using temporary storage. */
      acyo = 0;
      scyo = 0;
      for (off = 0; off < n; off += PART_SIZE)
	{
	  this_n = MIN (n - off, PART_SIZE);
#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
#else
	  /* No carry-in entry point: add the chunks, then propagate the
	     incoming carry with a separate mpn_add_1 pass. */
	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
#endif
#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
#else
	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
#endif
	}
    }
  else if (r2p != s1p && r2p != s2p)
    {
      /* r2 is not identical to either input operand.  We can therefore write
	 to r2 directly, without using temporary storage.  Subtract first,
	 since r1 aliases an input here. */
      acyo = 0;
      scyo = 0;
      for (off = 0; off < n; off += PART_SIZE)
	{
	  this_n = MIN (n - off, PART_SIZE);
#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
#else
	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
#endif
#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
#else
	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
#endif
	}
    }
  else
    {
      /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2==s2 or vice versa)
	 Need temporary storage: form each sum chunk in tp so both inputs
	 survive until the subtraction has read them. */
      mp_limb_t tp[PART_SIZE];
      acyo = 0;
      scyo = 0;
      for (off = 0; off < n; off += PART_SIZE)
	{
	  this_n = MIN (n - off, PART_SIZE);
#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
	  acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
#else
	  acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
	  acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
#endif
#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
#else
	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
#endif
	  MPN_COPY (r1p + off, tp, this_n);
	}
    }

  /* Pack both out-carries: bit 1 = add carry, bit 0 = subtract borrow. */
  return 2 * acyo + scyo;
}
+
+#ifdef MAIN
+#include <stdlib.h>
+#include <stdio.h>
+#include "timing.h"
+
+long cputime ();
+
+int
+main (int argc, char **argv)
+{
+ mp_ptr r1p, r2p, s1p, s2p;
+ double t;
+ mp_size_t n;
+
+ n = strtol (argv[1], 0, 0);
+
+ r1p = malloc (n * BYTES_PER_MP_LIMB);
+ r2p = malloc (n * BYTES_PER_MP_LIMB);
+ s1p = malloc (n * BYTES_PER_MP_LIMB);
+ s2p = malloc (n * BYTES_PER_MP_LIMB);
+ TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
+ printf (" separate add and sub: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n));
+ printf ("combined addsub separate variables: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
+ printf (" combined addsub r1 overlap: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
+ printf (" combined addsub r2 overlap: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n));
+ printf (" combined addsub in-place: %.3f\n", t);
+
+ return 0;
+}
+#endif
diff --git a/rts/gmp/mpn/generic/bdivmod.c b/rts/gmp/mpn/generic/bdivmod.c
new file mode 100644
index 0000000000..c4bcb414e6
--- /dev/null
+++ b/rts/gmp/mpn/generic/bdivmod.c
@@ -0,0 +1,120 @@
+/* mpn/bdivmod.c: mpn_bdivmod for computing U/V mod 2^d.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* q_high = mpn_bdivmod (qp, up, usize, vp, vsize, d).
+
+ Puts the low d/BITS_PER_MP_LIMB limbs of Q = U / V mod 2^d at qp, and
+ returns the high d%BITS_PER_MP_LIMB bits of Q as the result.
+
+ Also, U - Q * V mod 2^(usize*BITS_PER_MP_LIMB) is placed at up. Since the
+ low d/BITS_PER_MP_LIMB limbs of this difference are zero, the code allows
+ the limb vectors at qp to overwrite the low limbs at up, provided qp <= up.
+
+ Preconditions:
+ 1. V is odd.
+ 2. usize * BITS_PER_MP_LIMB >= d.
+ 3. If Q and U overlap, qp <= up.
+
+ Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
+
+ Funding for this work has been partially provided by Conselho Nacional
+ de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
+ 301314194-2, and was done while I was a visiting researcher in the Instituto
+ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
+
+ References:
+ T. Jebelean, An algorithm for exact division, Journal of Symbolic
+ Computation, v. 15, 1993, pp. 169-180.
+
+ K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
+ Mathematical Software, v. 21 (March), 1995, pp. 111-122. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
/* See the file header above for the full contract: computes the low d bits
   of Q = U/V mod 2^d (V odd) and leaves U - Q*V mod 2^(usize*BITS...) at up. */
mp_limb_t
#if __STDC__
mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize,
	     mp_srcptr vp, mp_size_t vsize, unsigned long int d)
#else
mpn_bdivmod (qp, up, usize, vp, vsize, d)
     mp_ptr qp;
     mp_ptr up;
     mp_size_t usize;
     mp_srcptr vp;
     mp_size_t vsize;
     unsigned long int d;
#endif
{
  mp_limb_t v_inv;

  /* 1/V mod 2^BITS_PER_MP_LIMB.  Exists because V is odd (precondition 1). */
  modlimb_invert (v_inv, vp[0]);

  /* Fast code for two cases previously used by the accel part of mpn_gcd.
     (Could probably remove this now it's inlined there.) */
  if (usize == 2 && vsize == 2 &&
      (d == BITS_PER_MP_LIMB || d == 2*BITS_PER_MP_LIMB))
    {
      mp_limb_t hi, lo;
      /* q is chosen so that U - q*V has a zero low limb. */
      mp_limb_t q = up[0] * v_inv;
      umul_ppmm (hi, lo, q, vp[0]);
      up[0] = 0, up[1] -= hi + q*vp[1], qp[0] = q;
      if (d == 2*BITS_PER_MP_LIMB)
	q = up[1] * v_inv, up[1] = 0, qp[1] = q;
      return 0;
    }

  /* Main loop: produce one full quotient limb per iteration while at least
     BITS_PER_MP_LIMB quotient bits remain wanted. */
  while (d >= BITS_PER_MP_LIMB)
    {
      /* q cancels the current low limb of U after subtracting q*V. */
      mp_limb_t q = up[0] * v_inv;
      mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
      if (usize > vsize)
	mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
      d -= BITS_PER_MP_LIMB;
      up += 1, usize -= 1;	/* low limb of U is now zero; drop it */
      *qp++ = q;
    }

  /* Final partial limb: only the low d bits of the quotient are wanted;
     they are returned rather than stored at qp. */
  if (d)
    {
      mp_limb_t b;
      mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1<<d) - 1);
      if (q <= 1)
	{
	  if (q == 0)
	    return 0;		/* nothing to subtract */
	  else
	    b = mpn_sub_n (up, up, vp, MIN (usize, vsize));	/* q == 1 */
	}
      else
	b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);

      if (usize > vsize)
	mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
      return q;
    }

  return 0;
}
diff --git a/rts/gmp/mpn/generic/bz_divrem_n.c b/rts/gmp/mpn/generic/bz_divrem_n.c
new file mode 100644
index 0000000000..d234b22af5
--- /dev/null
+++ b/rts/gmp/mpn/generic/bz_divrem_n.c
@@ -0,0 +1,153 @@
+/* mpn_bz_divrem_n and auxiliary routines.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE
+ INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+ IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A
+ FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+Contributed by Paul Zimmermann.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/*
+[1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler,
+ Technical report MPI-I-98-1-022, october 1998.
+ http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz
+*/
+
+static mp_limb_t mpn_bz_div_3_halves_by_2
+ _PROTO ((mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n));
+
+
+/* mpn_bz_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than
+ div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way,
+ i.e. n/2 >= KARATSUBA_MUL_THRESHOLD */
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD)
+#endif
+
+#if 0
/* Disabled (see the surrounding #if 0): sketch of handling a non-zero qxn
   by zero-padding the dividend.  NOTE(review): relies on an implicit int
   return type and falls off the end without a return value when qxn == 0;
   both would need fixing before this could ever be enabled.  */
static
unused_mpn_divrem (qp, qxn, np, nn, dp, dn)
     mp_ptr qp;
     mp_size_t qxn;
     mp_ptr np;
     mp_size_t nn;
     mp_srcptr dp;
     mp_size_t dn;
{
  /* This might be useful: */
  if (qxn != 0)
    {
      mp_limb_t c;
      /* alloca: stack scratch, released on return */
      mp_ptr tp = alloca ((nn + qxn) * BYTES_PER_MP_LIMB);
      MPN_COPY (tp + qxn - nn, np, nn);
      MPN_ZERO (tp, qxn);
      c = mpn_divrem (qp, 0L, tp, nn + qxn, dp, dn);
      /* Maybe copy proper part of tp to np?  Documentation is unclear about
	 the returned np value when qxn != 0 */
      return c;
    }
}
+#endif
+
+
/* mpn_bz_divrem_n - Implements algorithm of page 8 in [1]: divides (np,2n)
   by (dp,n) and puts the quotient in (qp,n), the remainder in (np,n).
   Returns most significant limb of the quotient, which is 0 or 1.
   Requires that the most significant bit of the divisor is set.  */

mp_limb_t
#if __STDC__
mpn_bz_divrem_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
#else
mpn_bz_divrem_n (qp, np, dp, n)
     mp_ptr qp;
     mp_ptr np;
     mp_srcptr dp;
     mp_size_t n;
#endif
{
  mp_limb_t qhl, cc;

  if (n % 2 != 0)
    {
      /* Odd n: recurse on the high 2(n-1) limbs divided by the high n-1
	 divisor limbs, then correct for the ignored low divisor limb dp[0]
	 and for a possible quotient high limb. */
      qhl = mpn_bz_divrem_n (qp + 1, np + 2, dp + 1, n - 1);
      cc = mpn_submul_1 (np + 1, qp + 1, n - 1, dp[0]);
      cc = mpn_sub_1 (np + n, np + n, 1, cc);
      if (qhl) cc += mpn_sub_1 (np + n, np + n, 1, dp[0]);
      /* While the remainder went negative, decrement the quotient and add
	 the divisor back. */
      while (cc)
	{
	  qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, (mp_limb_t) 1);
	  cc -= mpn_add_n (np + 1, np + 1, dp, n);
	}
      /* Finally develop the low quotient limb with a schoolbook step. */
      qhl += mpn_add_1 (qp + 1, qp + 1, n - 1,
			mpn_sb_divrem_mn (qp, np, n + 1, dp, n));
    }
  else
    {
      /* Even n: two "3 halves by 2" recursive steps, high half first. */
      mp_size_t n2 = n/2;
      qhl = mpn_bz_div_3_halves_by_2 (qp + n2, np + n2, dp, n2);
      qhl += mpn_add_1 (qp + n2, qp + n2, n2,
			mpn_bz_div_3_halves_by_2 (qp, np, dp, n2));
    }
  return qhl;
}
+
+
/* divides (np, 3n) by (dp, 2n) and puts the quotient in (qp, n),
   the remainder in (np, 2n) */

static mp_limb_t
#if __STDC__
mpn_bz_div_3_halves_by_2 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
#else
mpn_bz_div_3_halves_by_2 (qp, np, dp, n)
     mp_ptr qp;
     mp_ptr np;
     mp_srcptr dp;
     mp_size_t n;
#endif
{
  mp_size_t twon = n + n;
  mp_limb_t qhl, cc;
  mp_ptr tmp;
  TMP_DECL (marker);

  TMP_MARK (marker);
  /* Divide the high 2n limbs of np by the high n limbs of dp, giving an
     n-limb quotient candidate (possibly one or two too large). */
  if (n < BZ_THRESHOLD)
    qhl = mpn_sb_divrem_mn (qp, np + n, twon, dp + n, n);
  else
    qhl = mpn_bz_divrem_n (qp, np + n, dp + n, n);
  /* Subtract q * (low n limbs of dp) to form the true remainder. */
  tmp = (mp_ptr) TMP_ALLOC (twon * BYTES_PER_MP_LIMB);
  mpn_mul_n (tmp, qp, dp, n);
  cc = mpn_sub_n (np, np, tmp, twon);
  TMP_FREE (marker);
  if (qhl) cc += mpn_sub_n (np + n, np + n, dp, n);
  /* Remainder went negative: decrement q and add the divisor back. */
  while (cc)
    {
      qhl -= mpn_sub_1 (qp, qp, n, (mp_limb_t) 1);
      cc -= mpn_add_n (np, np, dp, twon);
    }
  return qhl;
}
diff --git a/rts/gmp/mpn/generic/cmp.c b/rts/gmp/mpn/generic/cmp.c
new file mode 100644
index 0000000000..8e9792f54e
--- /dev/null
+++ b/rts/gmp/mpn/generic/cmp.c
@@ -0,0 +1,56 @@
+/* mpn_cmp -- Compare two low-level natural-number integers.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE.
+ There are no restrictions on the relative sizes of
+ the two arguments.
+ Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2. */
+
+int
+#if __STDC__
+mpn_cmp (mp_srcptr op1_ptr, mp_srcptr op2_ptr, mp_size_t size)
+#else
+mpn_cmp (op1_ptr, op2_ptr, size)
+ mp_srcptr op1_ptr;
+ mp_srcptr op2_ptr;
+ mp_size_t size;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t op1_word, op2_word;
+
+ for (i = size - 1; i >= 0; i--)
+ {
+ op1_word = op1_ptr[i];
+ op2_word = op2_ptr[i];
+ if (op1_word != op2_word)
+ goto diff;
+ }
+ return 0;
+ diff:
+ /* This can *not* be simplified to
+ op2_word - op2_word
+ since that expression might give signed overflow. */
+ return (op1_word > op2_word) ? 1 : -1;
+}
diff --git a/rts/gmp/mpn/generic/diveby3.c b/rts/gmp/mpn/generic/diveby3.c
new file mode 100644
index 0000000000..a2fb552bfa
--- /dev/null
+++ b/rts/gmp/mpn/generic/diveby3.c
@@ -0,0 +1,77 @@
+/* mpn_divexact_by3 -- mpn division by 3, expecting no remainder. */
+
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
+ 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
+#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1)
+
+
+/* The "c += ..."s are adding the high limb of 3*l to c. That high limb
+ will be 0, 1 or 2. Doing two separate "+="s seems to turn out better
+ code on gcc (as of 2.95.2 at least).
+
+ When a subtraction of a 0,1,2 carry value causes a borrow, that leaves a
+ limb value of either 0xFF...FF or 0xFF...FE and the multiply by INVERSE_3
+ gives 0x55...55 or 0xAA...AA respectively, producing a further borrow of
+ only 0 or 1 respectively. Hence the carry out of each stage and for the
+ return value is always only 0, 1 or 2. */
+
+mp_limb_t
+#if __STDC__
+mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t c)
+#else
+mpn_divexact_by3c (dst, src, size, c)
+ mp_ptr dst;
+ mp_srcptr src;
+ mp_size_t size;
+ mp_limb_t c;
+#endif
+{
+ mp_size_t i;
+
+ ASSERT (size >= 1);
+
+ i = 0;
+ do
+ {
+ mp_limb_t l, s;
+
+ s = src[i];
+ l = s - c;
+ c = (l > s);
+
+ l *= INVERSE_3;
+ dst[i] = l;
+
+ c += (l > MP_LIMB_T_MAX/3);
+ c += (l > (MP_LIMB_T_MAX/3)*2);
+ }
+ while (++i < size);
+
+ return c;
+}
diff --git a/rts/gmp/mpn/generic/divrem.c b/rts/gmp/mpn/generic/divrem.c
new file mode 100644
index 0000000000..30673e76d9
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem.c
@@ -0,0 +1,101 @@
+/* mpn_divrem -- Divide natural numbers, producing both remainder and
+ quotient. This is now just a middle layer for calling the new
+ internal mpn_tdiv_qr.
+
+Copyright (C) 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
/* mpn_divrem -- divide (np,nn) by (dp,dn): dispatch to the one- and
   two-limb special cases, otherwise to mpn_tdiv_qr.  Writes nn-dn+qxn
   quotient limbs at qp (the low qxn being "fraction" limbs) and returns
   the most significant quotient limb.  The dn-limb remainder overwrites
   the low limbs of np.  */
mp_limb_t
#if __STDC__
mpn_divrem (mp_ptr qp, mp_size_t qxn,
	    mp_ptr np, mp_size_t nn,
	    mp_srcptr dp, mp_size_t dn)
#else
mpn_divrem (qp, qxn, np, nn, dp, dn)
     mp_ptr qp;
     mp_size_t qxn;
     mp_ptr np;
     mp_size_t nn;
     mp_srcptr dp;
     mp_size_t dn;
#endif
{
  if (dn == 1)
    {
      mp_limb_t ret;
      mp_ptr q2p;
      mp_size_t qn;
      TMP_DECL (marker);

      TMP_MARK (marker);
      /* mpn_divrem_1 develops nn+qxn quotient limbs; build them in scratch,
	 then copy all but the top limb to qp and return the top limb. */
      q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);

      np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]);
      qn = nn + qxn - 1;
      MPN_COPY (qp, q2p, qn);
      ret = q2p[qn];

      TMP_FREE (marker);
      return ret;
    }
  else if (dn == 2)
    {
      return mpn_divrem_2 (qp, qxn, np, nn, dp);
    }
  else
    {
      mp_ptr rp, q2p;
      mp_limb_t qhl;
      mp_size_t qn;
      TMP_DECL (marker);

      TMP_MARK (marker);
      if (qxn != 0)
	{
	  /* Fraction limbs requested: zero-extend the dividend at the low
	     end by qxn limbs, then do one full-size division. */
	  mp_ptr n2p;
	  n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
	  MPN_ZERO (n2p, qxn);
	  MPN_COPY (n2p + qxn, np, nn);
	  q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB);
	  rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
	  mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn);
	  MPN_COPY (np, rp, dn);
	  qn = nn - dn + qxn;
	  MPN_COPY (qp, q2p, qn);
	  qhl = q2p[qn];
	}
      else
	{
	  q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB);
	  rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
	  mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn);
	  MPN_COPY (np, rp, dn);	/* overwrite np area with remainder */
	  qn = nn - dn;
	  MPN_COPY (qp, q2p, qn);
	  qhl = q2p[qn];
	}
      TMP_FREE (marker);
      return qhl;
    }
}
diff --git a/rts/gmp/mpn/generic/divrem_1.c b/rts/gmp/mpn/generic/divrem_1.c
new file mode 100644
index 0000000000..e93f241c9d
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem_1.c
@@ -0,0 +1,248 @@
+/* mpn_divrem_1(quot_ptr, qsize, dividend_ptr, dividend_size, divisor_limb) --
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+ QUOT_PTR and DIVIDEND_PTR might point to the same limb.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+
+/* __gmpn_divmod_1_internal(quot_ptr,dividend_ptr,dividend_size,divisor_limb)
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+ QUOT_PTR and DIVIDEND_PTR might point to the same limb. */
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
/* Core of mpn_divrem_1: divide (dividend_ptr,dividend_size) by divisor_limb,
   write dividend_size quotient limbs at quot_ptr, and return the single-limb
   remainder.  Chooses between a pre-inverted multiply-based inner loop and
   plain udiv_qrnnd, depending on their configured relative speeds.  */
static mp_limb_t
#if __STDC__
__gmpn_divmod_1_internal (mp_ptr quot_ptr,
			  mp_srcptr dividend_ptr, mp_size_t dividend_size,
			  mp_limb_t divisor_limb)
#else
__gmpn_divmod_1_internal (quot_ptr, dividend_ptr, dividend_size, divisor_limb)
     mp_ptr quot_ptr;
     mp_srcptr dividend_ptr;
     mp_size_t dividend_size;
     mp_limb_t divisor_limb;
#endif
{
  mp_size_t i;
  mp_limb_t n1, n0, r;
  int dummy;	/* presumably a scratch output for some udiv macro
		   variants -- unused in the code visible here */

  /* ??? Should this be handled at all?  Rely on callers?  */
  if (dividend_size == 0)
    return 0;

  /* If multiplication is much faster than division, and the
     dividend is large, pre-invert the divisor, and use
     only multiplications in the inner loop.  */

  /* This test should be read:
       Does it ever help to use udiv_qrnnd_preinv?
	 && Does what we save compensate for the inversion overhead?  */
  if (UDIV_TIME > (2 * UMUL_TIME + 6)
      && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME)
    {
      int normalization_steps;

      count_leading_zeros (normalization_steps, divisor_limb);
      if (normalization_steps != 0)
	{
	  /* Un-normalized divisor: shift it (and implicitly the dividend)
	     left so its top bit is set, as udiv_qrnnd_preinv requires. */
	  mp_limb_t divisor_limb_inverted;

	  divisor_limb <<= normalization_steps;
	  invert_limb (divisor_limb_inverted, divisor_limb);

	  n1 = dividend_ptr[dividend_size - 1];
	  r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);

	  /* Possible optimization:
	     if (r == 0
	     && divisor_limb > ((n1 << normalization_steps)
			     | (dividend_ptr[dividend_size - 2] >> ...)))
	     ...one division less... */

	  /* Feed the shifted dividend limb pairs through the pre-inverted
	     division; n1 carries the previous (unshifted) limb along. */
	  for (i = dividend_size - 2; i >= 0; i--)
	    {
	      n0 = dividend_ptr[i];
	      udiv_qrnnd_preinv (quot_ptr[i + 1], r, r,
				 ((n1 << normalization_steps)
				  | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
				 divisor_limb, divisor_limb_inverted);
	      n1 = n0;
	    }
	  udiv_qrnnd_preinv (quot_ptr[0], r, r,
			     n1 << normalization_steps,
			     divisor_limb, divisor_limb_inverted);
	  /* Undo the normalization shift on the remainder. */
	  return r >> normalization_steps;
	}
      else
	{
	  /* Divisor already normalized; no shifting needed. */
	  mp_limb_t divisor_limb_inverted;

	  invert_limb (divisor_limb_inverted, divisor_limb);

	  i = dividend_size - 1;
	  r = dividend_ptr[i];

	  /* The top limb becomes the initial remainder if it is smaller
	     than the divisor (top quotient limb 0); otherwise start from
	     r = 0 and let the loop handle it. */
	  if (r >= divisor_limb)
	    r = 0;
	  else
	    {
	      quot_ptr[i] = 0;
	      i--;
	    }

	  for (; i >= 0; i--)
	    {
	      n0 = dividend_ptr[i];
	      udiv_qrnnd_preinv (quot_ptr[i], r, r,
				 n0, divisor_limb, divisor_limb_inverted);
	    }
	  return r;
	}
    }
  else
    {
      /* Plain udiv_qrnnd path. */
      if (UDIV_NEEDS_NORMALIZATION)
	{
	  int normalization_steps;

	  count_leading_zeros (normalization_steps, divisor_limb);
	  if (normalization_steps != 0)
	    {
	      divisor_limb <<= normalization_steps;

	      n1 = dividend_ptr[dividend_size - 1];
	      r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);

	      /* Possible optimization:
		 if (r == 0
		 && divisor_limb > ((n1 << normalization_steps)
				 | (dividend_ptr[dividend_size - 2] >> ...)))
		 ...one division less... */

	      for (i = dividend_size - 2; i >= 0; i--)
		{
		  n0 = dividend_ptr[i];
		  udiv_qrnnd (quot_ptr[i + 1], r, r,
			      ((n1 << normalization_steps)
			       | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
			      divisor_limb);
		  n1 = n0;
		}
	      udiv_qrnnd (quot_ptr[0], r, r,
			  n1 << normalization_steps,
			  divisor_limb);
	      return r >> normalization_steps;
	    }
	}
      /* No normalization needed, either because udiv_qrnnd doesn't require
	 it, or because DIVISOR_LIMB is already normalized.  */

      i = dividend_size - 1;
      r = dividend_ptr[i];

      if (r >= divisor_limb)
	r = 0;
      else
	{
	  quot_ptr[i] = 0;
	  i--;
	}

      for (; i >= 0; i--)
	{
	  n0 = dividend_ptr[i];
	  udiv_qrnnd (quot_ptr[i], r, r, n0, divisor_limb);
	}
      return r;
    }
}
+
+
+
/* mpn_divrem_1 -- divide (np,nn) by the single limb d.  The nn integer
   quotient limbs go to qp+qxn; qxn additional "fraction" limbs (as if the
   dividend were extended with qxn zero limbs) go to qp.  Returns the
   single-limb remainder.  */
mp_limb_t
#if __STDC__
mpn_divrem_1 (mp_ptr qp, mp_size_t qxn,
	      mp_srcptr np, mp_size_t nn,
	      mp_limb_t d)
#else
mpn_divrem_1 (qp, qxn, np, nn, d)
     mp_ptr qp;
     mp_size_t qxn;
     mp_srcptr np;
     mp_size_t nn;
     mp_limb_t d;
#endif
{
  mp_limb_t rlimb;
  mp_size_t i;

  /* Develop integer part of quotient. */
  rlimb = __gmpn_divmod_1_internal (qp + qxn, np, nn, d);

  /* Develop fraction part of quotient.  This is not as fast as it should;
     the preinvert stuff from __gmpn_divmod_1_internal ought to be used here
     too. */
  if (UDIV_NEEDS_NORMALIZATION)
    {
      int normalization_steps;

      count_leading_zeros (normalization_steps, d);
      if (normalization_steps != 0)
	{
	  /* Shift divisor and remainder up to satisfy udiv_qrnnd's
	     normalized-divisor requirement; shift the remainder back
	     down before returning. */
	  d <<= normalization_steps;
	  rlimb <<= normalization_steps;

	  for (i = qxn - 1; i >= 0; i--)
	    udiv_qrnnd (qp[i], rlimb, rlimb, 0, d);

	  return rlimb >> normalization_steps;
	}
      else
	/* fall out */
	;
    }

  /* Divisor already normalized (or normalization not required): each step
     divides (rlimb,0) by d, producing one fraction limb. */
  for (i = qxn - 1; i >= 0; i--)
    udiv_qrnnd (qp[i], rlimb, rlimb, 0, d);

  return rlimb;
}
diff --git a/rts/gmp/mpn/generic/divrem_2.c b/rts/gmp/mpn/generic/divrem_2.c
new file mode 100644
index 0000000000..0bc31ae2e7
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem_2.c
@@ -0,0 +1,151 @@
+/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and
+ quotient. The divisor is two limbs.
+
+ THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP
+ RELEASE.
+
+
+Copyright (C) 1993, 1994, 1995, 1996, 1999, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Divide num (NP/NSIZE) by den (DP/2) and write
+ the NSIZE-2 least significant quotient limbs at QP
+ and the 2 long remainder at NP. If QEXTRA_LIMBS is
+ non-zero, generate that many fraction bits and append them after the
+ other quotient limbs.
+ Return the most significant limb of the quotient, this is always 0 or 1.
+
+ Preconditions:
+ 0. NSIZE >= 2.
+ 1. The most significant bit of the divisor must be set.
+ 2. QP must either not overlap with the input operands at all, or
+ QP + 2 >= NP must hold true. (This means that it's
+ possible to put the quotient in the high part of NUM, right after the
+ remainder in NUM.)
+ 3. NSIZE >= 2, even if QEXTRA_LIMBS is non-zero. */
+
+/* Divide {np,nsize} by the normalized two-limb divisor {dp,2}. The nsize-2
+ quotient limbs plus qxn fraction limbs go to qp; the two-limb remainder is
+ left in the low limbs of np. Returns the most significant quotient limb,
+ which is always 0 or 1 (see the precondition comment above). */
+mp_limb_t
+#if __STDC__
+mpn_divrem_2 (mp_ptr qp, mp_size_t qxn,
+ mp_ptr np, mp_size_t nsize,
+ mp_srcptr dp)
+#else
+mpn_divrem_2 (qp, qxn, np, nsize, dp)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_ptr np;
+ mp_size_t nsize;
+ mp_srcptr dp;
+#endif
+{
+ mp_limb_t most_significant_q_limb = 0;
+ mp_size_t i;
+ mp_limb_t n1, n0, n2;
+ mp_limb_t d1, d0;
+ mp_limb_t d1inv;
+ int have_preinv;
+
+ /* Point np at the two most significant dividend limbs. */
+ np += nsize - 2;
+ d1 = dp[1];
+ d0 = dp[0];
+ n1 = np[1];
+ n0 = np[0];
+
+ /* If the top of N is >= D, the quotient's top limb is 1; reduce once. */
+ if (n1 >= d1 && (n1 > d1 || n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ most_significant_q_limb = 1;
+ }
+
+ /* If multiplication is much faster than division, preinvert the most
+ significant divisor limb before entering the loop. */
+ if (UDIV_TIME > 2 * UMUL_TIME + 6)
+ {
+ have_preinv = 0;
+ if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - 2) > UDIV_TIME)
+ {
+ invert_limb (d1inv, d1);
+ have_preinv = 1;
+ }
+ }
+
+ for (i = qxn + nsize - 2 - 1; i >= 0; i--)
+ {
+ mp_limb_t q;
+ mp_limb_t r;
+
+ if (i >= qxn)
+ np--; /* integer part: step down to the next dividend limb */
+ else
+ np[0] = 0; /* fraction part: feed zero limbs into the division */
+
+ if (n1 == d1)
+ {
+ /* Q should be either 111..111 or 111..110. Need special treatment
+ of this rare case as normal division would give overflow. */
+ q = ~(mp_limb_t) 0;
+
+ r = n0 + d1;
+ if (r < d1) /* Carry in the addition? */
+ {
+ add_ssaaaa (n1, n0, r - d0, np[0], 0, d0);
+ qp[i] = q;
+ continue;
+ }
+ n1 = d0 - (d0 != 0);
+ n0 = -d0;
+ }
+ else
+ {
+ /* Estimate q from the high divisor limb, then compute the
+ correction term q*d0 for the adjustment test below. */
+ if (UDIV_TIME > 2 * UMUL_TIME + 6 && have_preinv)
+ udiv_qrnnd_preinv (q, r, n1, n0, d1, d1inv);
+ else
+ udiv_qrnnd (q, r, n1, n0, d1);
+ umul_ppmm (n1, n0, d0, q);
+ }
+
+ n2 = np[0];
+
+ q_test:
+ if (n1 > r || (n1 == r && n0 > n2))
+ {
+ /* The estimated Q was too large. */
+ q--;
+
+ sub_ddmmss (n1, n0, n1, n0, 0, d0);
+ r += d1;
+ if (r >= d1) /* If not carry, test Q again. */
+ goto q_test;
+ }
+
+ qp[i] = q;
+ sub_ddmmss (n1, n0, r, n2, n1, n0);
+ }
+ /* Store the two-limb remainder back into the dividend area. */
+ np[1] = n1;
+ np[0] = n0;
+
+ return most_significant_q_limb;
+}
diff --git a/rts/gmp/mpn/generic/dump.c b/rts/gmp/mpn/generic/dump.c
new file mode 100644
index 0000000000..66f375c74b
--- /dev/null
+++ b/rts/gmp/mpn/generic/dump.c
@@ -0,0 +1,76 @@
+/* THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO
+ CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS
+ FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Debug aid: print {ptr,size} to stdout in hexadecimal, most significant
+ limb first, followed by a newline. High zero limbs are stripped first;
+ a zero value prints as "0". */
+void
+#if __STDC__
+mpn_dump (mp_srcptr ptr, mp_size_t size)
+#else
+mpn_dump (ptr, size)
+ mp_srcptr ptr;
+ mp_size_t size;
+#endif
+{
+ MPN_NORMALIZE (ptr, size);
+
+ if (size == 0)
+ printf ("0\n");
+ else
+ {
+ size--;
+ /* A limb may be wider than a long; in that case each limb is printed
+ as two long-sized halves. */
+ if (BYTES_PER_MP_LIMB > sizeof (long))
+ {
+ if ((ptr[size] >> BITS_PER_MP_LIMB/2) != 0)
+ {
+ printf ("%lX",
+ (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2));
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%lX", (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%lX", ptr[size]);
+
+ /* Remaining limbs are zero-padded to full width. */
+ while (size)
+ {
+ size--;
+ if (BYTES_PER_MP_LIMB > sizeof (long))
+ {
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2));
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%0*lX", (int) (2 * BYTES_PER_MP_LIMB), ptr[size]);
+ }
+ printf ("\n");
+ }
+}
diff --git a/rts/gmp/mpn/generic/gcd.c b/rts/gmp/mpn/generic/gcd.c
new file mode 100644
index 0000000000..059e219a06
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcd.c
@@ -0,0 +1,414 @@
+/* mpn/gcd.c: mpn_gcd for gcd of two odd integers.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Integer greatest common divisor of two unsigned integers, using
+ the accelerated algorithm (see reference below).
+
+ mp_size_t mpn_gcd (up, usize, vp, vsize).
+
+ Preconditions [U = (up, usize) and V = (vp, vsize)]:
+
+ 1. V is odd.
+ 2. numbits(U) >= numbits(V).
+
+ Both U and V are destroyed by the operation. The result is left at vp,
+ and its size is returned.
+
+ Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
+
+ Funding for this work has been partially provided by Conselho Nacional
+ de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
+ 301314194-2, and was done while I was a visiting researcher in the Instituto
+ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
+
+ Refer to
+ K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
+ Mathematical Software, v. 21 (March), 1995, pp. 111-122. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
+ algorithm is used, otherwise the binary algorithm is used. This may be
+ adjusted for different architectures. */
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
+#endif
+
+/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
+ algorithm reduces using the bmod operation. Otherwise, the k-ary reduction
+ is used. 0 <= BMOD_THRESHOLD < BITS_PER_MP_LIMB. */
+enum
+ {
+ BMOD_THRESHOLD = BITS_PER_MP_LIMB/2
+ };
+
+
+/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
+ Both U and V must be odd. */
+static __gmp_inline mp_size_t
+#if __STDC__
+gcd_2 (mp_ptr vp, mp_srcptr up)
+#else
+gcd_2 (vp, up)
+ mp_ptr vp;
+ mp_srcptr up;
+#endif
+{
+ mp_limb_t u0, u1, v0, v1;
+ mp_size_t vsize;
+
+ u0 = up[0], u1 = up[1], v0 = vp[0], v1 = vp[1];
+
+ while (u1 != v1 && u0 != v0)
+ {
+ unsigned long int r;
+ /* Binary GCD step: subtract the smaller from the larger, then shift
+ out the trailing zeros (the difference of two odd values is even,
+ so r >= 1). */
+ if (u1 > v1)
+ {
+ u1 -= v1 + (u0 < v0), u0 -= v0;
+ count_trailing_zeros (r, u0);
+ u0 = u1 << (BITS_PER_MP_LIMB - r) | u0 >> r;
+ u1 >>= r;
+ }
+ else /* u1 < v1. */
+ {
+ v1 -= u1 + (v0 < u0), v0 -= u0;
+ count_trailing_zeros (r, v0);
+ v0 = v1 << (BITS_PER_MP_LIMB - r) | v0 >> r;
+ v1 >>= r;
+ }
+ }
+
+ vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0);
+
+ /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */
+ if (u1 == v1 && u0 == v0)
+ return vsize;
+
+ /* The loop left one matching limb; the surviving difference limb is the
+ single-limb cofactor for the final gcd. */
+ v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0;
+ vp[0] = mpn_gcd_1 (vp, vsize, v0);
+
+ return 1;
+}
+
+/* The function find_a finds 0 < N < 2^BITS_PER_MP_LIMB such that there exists
+ 0 < |D| < 2^BITS_PER_MP_LIMB, and N == D * C mod 2^(2*BITS_PER_MP_LIMB).
+ In the reference article, D was computed along with N, but it is better to
+ compute D separately as D <-- N / C mod 2^(BITS_PER_MP_LIMB + 1), treating
+ the result as a twos' complement signed integer.
+
+ Initialize N1 to C mod 2^(2*BITS_PER_MP_LIMB). According to the reference
+ article, N2 should be initialized to 2^(2*BITS_PER_MP_LIMB), but we use
+ 2^(2*BITS_PER_MP_LIMB) - N1 to start the calculations within double
+ precision. If N2 > N1 initially, the first iteration of the while loop
+ will swap them. In all other situations, N1 >= N2 is maintained. */
+
+static
+#if ! defined (__i386__)
+__gmp_inline /* don't inline this for the x86 */
+#endif
+mp_limb_t
+#if __STDC__
+find_a (mp_srcptr cp)
+#else
+find_a (cp)
+ mp_srcptr cp;
+#endif
+{
+ unsigned long int leading_zero_bits = 0;
+
+ mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^BITS_PER_MP_LIMB + n1_l. */
+ mp_limb_t n1_h = cp[1];
+
+ mp_limb_t n2_l = -n1_l; /* N2 == n2_h * 2^BITS_PER_MP_LIMB + n2_l. */
+ mp_limb_t n2_h = ~n1_h;
+
+ /* Main loop. */
+ while (n2_h) /* While N2 >= 2^BITS_PER_MP_LIMB. */
+ {
+ /* N1 <-- N1 % N2. */
+ if ((MP_LIMB_T_HIGHBIT >> leading_zero_bits & n2_h) == 0)
+ {
+ unsigned long int i;
+ /* Align N2 with N1 (shift left), then reduce N1 by shift-and-
+ subtract while shifting N2 back down. */
+ count_leading_zeros (i, n2_h);
+ i -= leading_zero_bits, leading_zero_bits += i;
+ n2_h = n2_h<<i | n2_l>>(BITS_PER_MP_LIMB - i), n2_l <<= i;
+ do
+ {
+ if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
+ n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
+ n2_l = n2_l>>1 | n2_h<<(BITS_PER_MP_LIMB - 1), n2_h >>= 1;
+ i -= 1;
+ }
+ while (i);
+ }
+ if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
+ n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
+
+ /* Swap so that N1 >= N2 again for the next round (see the comment
+ above the function). */
+ MP_LIMB_T_SWAP (n1_h, n2_h);
+ MP_LIMB_T_SWAP (n1_l, n2_l);
+ }
+
+ /* N2 < 2^BITS_PER_MP_LIMB: its low limb is the N we are looking for. */
+ return n2_l;
+}
+
+/* mpn_gcd: see the file header comment for preconditions (V odd,
+ numbits(U) >= numbits(V)). Both U and V are destroyed; the result is
+ left at vp and copied to gp, and its size is returned. */
+mp_size_t
+#if __STDC__
+mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcd (gp, up, usize, vp, vsize)
+ mp_ptr gp;
+ mp_ptr up;
+ mp_size_t usize;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+{
+ mp_ptr orig_vp = vp;
+ mp_size_t orig_vsize = vsize;
+ int binary_gcd_ctr; /* Number of times binary gcd will execute. */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD.
+ Two EXTRA limbs for U and V are required for kary reduction. */
+ if (vsize >= GCD_ACCEL_THRESHOLD)
+ {
+ unsigned long int vbitsize, d;
+ mp_ptr orig_up = up;
+ mp_size_t orig_usize = usize;
+ /* Working copy of U, with the two extra limbs needed by the k-ary
+ reduction steps below. */
+ mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB);
+
+ MPN_COPY (anchor_up, orig_up, usize);
+ up = anchor_up;
+
+ /* d = bitsize(U) - bitsize(V) + 1, the bit count for bmod reduction. */
+ count_leading_zeros (d, up[usize-1]);
+ d = usize * BITS_PER_MP_LIMB - d;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize + 1;
+
+ /* Use bmod reduction to quickly discover whether V divides U. */
+ up[usize++] = 0; /* Insert leading zero. */
+ mpn_bdivmod (up, up, usize, vp, vsize, d);
+
+ /* Now skip U/V mod 2^d and any low zero limbs. */
+ d /= BITS_PER_MP_LIMB, up += d, usize -= d;
+ while (usize != 0 && up[0] == 0)
+ up++, usize--;
+
+ if (usize == 0) /* GCD == ORIG_V. */
+ goto done;
+
+ vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB);
+ MPN_COPY (vp, orig_vp, vsize);
+
+ do /* Main loop. */
+ {
+ /* mpn_com_n can't be used here because anchor_up and up may
+ partially overlap */
+ if (up[usize-1] & MP_LIMB_T_HIGHBIT) /* U < 0; take twos' compl. */
+ {
+ mp_size_t i;
+ anchor_up[0] = -up[0];
+ for (i = 1; i < usize; i++)
+ anchor_up[i] = ~up[i];
+ up = anchor_up;
+ }
+
+ MPN_NORMALIZE_NOT_ZERO (up, usize);
+
+ if ((up[0] & 1) == 0) /* Result even; remove twos. */
+ {
+ unsigned int r;
+ count_trailing_zeros (r, up[0]);
+ mpn_rshift (anchor_up, up, usize, r);
+ usize -= (anchor_up[usize-1] == 0);
+ }
+ else if (anchor_up != up)
+ MPN_COPY_INCR (anchor_up, up, usize);
+
+ MPN_PTR_SWAP (anchor_up,usize, vp,vsize);
+ up = anchor_up;
+
+ if (vsize <= 2) /* Kary can't handle < 2 limbs and */
+ break; /* isn't efficient for == 2 limbs. */
+
+ d = vbitsize;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize + 1;
+
+ if (d > BMOD_THRESHOLD) /* Bmod reduction. */
+ {
+ up[usize++] = 0;
+ mpn_bdivmod (up, up, usize, vp, vsize, d);
+ d /= BITS_PER_MP_LIMB, up += d, usize -= d;
+ }
+ else /* Kary reduction. */
+ {
+ mp_limb_t bp[2], cp[2];
+
+ /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB). */
+ {
+ mp_limb_t u_inv, hi, lo;
+ modlimb_invert (u_inv, up[0]);
+ cp[0] = vp[0] * u_inv;
+ umul_ppmm (hi, lo, cp[0], up[0]);
+ cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv;
+ }
+
+ /* U <-- find_a (C) * U. */
+ up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
+ usize++;
+
+ /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1).
+ bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and
+ bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2
+
+ Like V/U above, but simplified because only the low bit of
+ bp[1] is wanted. */
+ {
+ mp_limb_t v_inv, hi, lo;
+ modlimb_invert (v_inv, vp[0]);
+ bp[0] = up[0] * v_inv;
+ umul_ppmm (hi, lo, bp[0], vp[0]);
+ bp[1] = (up[1] + hi + (bp[0]&vp[1])) & 1;
+ }
+
+ up[usize++] = 0;
+ if (bp[1]) /* B < 0: U <-- U + (-B) * V. */
+ {
+ mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0]);
+ mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
+ }
+ else /* B >= 0: U <-- U - B * V. */
+ {
+ mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]);
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
+ }
+
+ up += 2, usize -= 2; /* At least two low limbs are zero. */
+ }
+
+ /* Must remove low zero limbs before complementing. */
+ while (usize != 0 && up[0] == 0)
+ up++, usize--;
+ }
+ while (usize);
+
+ /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */
+ up = orig_up, usize = orig_usize;
+ binary_gcd_ctr = 2;
+ }
+ else
+ binary_gcd_ctr = 1;
+
+ /* Finish up with the binary algorithm. Executes once or twice. */
+ for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize)
+ {
+ if (usize > 2) /* First make U close to V in size. */
+ {
+ unsigned long int vbitsize, d;
+ count_leading_zeros (d, up[usize-1]);
+ d = usize * BITS_PER_MP_LIMB - d;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize - 1;
+ if (d != -(unsigned long int)1 && d > 2)
+ {
+ mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */
+ d /= (unsigned long int)BITS_PER_MP_LIMB, up += d, usize -= d;
+ }
+ }
+
+ /* Start binary GCD. */
+ do
+ {
+ mp_size_t zeros;
+
+ /* Make sure U is odd. */
+ MPN_NORMALIZE (up, usize);
+ while (up[0] == 0)
+ up += 1, usize -= 1;
+ if ((up[0] & 1) == 0)
+ {
+ unsigned int r;
+ count_trailing_zeros (r, up[0]);
+ mpn_rshift (up, up, usize, r);
+ usize -= (up[usize-1] == 0);
+ }
+
+ /* Keep usize >= vsize. */
+ if (usize < vsize)
+ MPN_PTR_SWAP (up, usize, vp, vsize);
+
+ if (usize <= 2) /* Double precision. */
+ {
+ if (vsize == 1)
+ vp[0] = mpn_gcd_1 (up, usize, vp[0]);
+ else
+ vsize = gcd_2 (vp, up);
+ break; /* Binary GCD done. */
+ }
+
+ /* Count number of low zero limbs of U - V. */
+ for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; )
+ continue;
+
+ /* If U < V, swap U and V; in any case, subtract V from U. */
+ if (zeros == vsize) /* Subtract done. */
+ up += zeros, usize -= zeros;
+ else if (usize == vsize)
+ {
+ mp_size_t size = vsize;
+ do
+ size--;
+ while (up[size] == vp[size]);
+ if (up[size] < vp[size]) /* usize == vsize. */
+ MP_PTR_SWAP (up, vp);
+ up += zeros, usize = size + 1 - zeros;
+ mpn_sub_n (up, up, vp + zeros, usize);
+ }
+ else
+ {
+ mp_size_t size = vsize - zeros;
+ up += zeros, usize -= zeros;
+ if (mpn_sub_n (up, up, vp + zeros, size))
+ {
+ while (up[size] == 0) /* Propagate borrow. */
+ up[size++] = -(mp_limb_t)1;
+ up[size] -= 1;
+ }
+ }
+ }
+ while (usize); /* End binary GCD. */
+ }
+
+done:
+ if (vp != gp)
+ MPN_COPY (gp, vp, vsize);
+ TMP_FREE (marker);
+ return vsize;
+}
diff --git a/rts/gmp/mpn/generic/gcd_1.c b/rts/gmp/mpn/generic/gcd_1.c
new file mode 100644
index 0000000000..1832636636
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcd_1.c
@@ -0,0 +1,77 @@
+/* mpn_gcd_1 --
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Does not work for U == 0 or V == 0. It would be tough to make it work for
+ V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t. */
+
+/* Return gcd({up,size}, vlimb). Reduces U mod V first, then runs a
+ single-limb binary GCD. */
+mp_limb_t
+#if __STDC__
+mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb)
+#else
+mpn_gcd_1 (up, size, vlimb)
+ mp_srcptr up;
+ mp_size_t size;
+ mp_limb_t vlimb;
+#endif
+{
+ mp_limb_t ulimb;
+ unsigned long int u_low_zero_bits, v_low_zero_bits;
+
+ if (size > 1)
+ {
+ /* Reduce U modulo V; if V divides U evenly, V is the gcd. */
+ ulimb = mpn_mod_1 (up, size, vlimb);
+ if (ulimb == 0)
+ return vlimb;
+ }
+ else
+ ulimb = up[0];
+
+ /* Need to eliminate low zero bits. */
+ count_trailing_zeros (u_low_zero_bits, ulimb);
+ ulimb >>= u_low_zero_bits;
+
+ count_trailing_zeros (v_low_zero_bits, vlimb);
+ vlimb >>= v_low_zero_bits;
+
+ /* Binary GCD on two odd values: subtract the smaller from the larger
+ and strip the trailing zeros of the (even) difference. */
+ while (ulimb != vlimb)
+ {
+ if (ulimb > vlimb)
+ {
+ ulimb -= vlimb;
+ do
+ ulimb >>= 1;
+ while ((ulimb & 1) == 0);
+ }
+ else /* vlimb > ulimb. */
+ {
+ vlimb -= ulimb;
+ do
+ vlimb >>= 1;
+ while ((vlimb & 1) == 0);
+ }
+ }
+
+ /* Restore the common factors of two shared by both inputs. */
+ return ulimb << MIN (u_low_zero_bits, v_low_zero_bits);
+}
diff --git a/rts/gmp/mpn/generic/gcdext.c b/rts/gmp/mpn/generic/gcdext.c
new file mode 100644
index 0000000000..fe22d779a6
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcdext.c
@@ -0,0 +1,700 @@
+/* mpn_gcdext -- Extended Greatest Common Divisor.
+
+Copyright (C) 1996, 1998, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 17
+#endif
+
+#ifndef EXTEND
+#define EXTEND 1
+#endif
+
+#if STAT
+int arr[BITS_PER_MP_LIMB];
+#endif
+
+
+/* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE)
+
+ Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the
+ greatest common divisor at GP (unless it is 0), and the first cofactor at
+ SP. Write the size of the cofactor through the pointer SSIZE. Return the
+ size of the value at GP. Note that SP might be a negative number; this is
+ denoted by storing the negative of the size through SSIZE.
+
+ {UP,USIZE} and {VP,VSIZE} are both clobbered.
+
+ The space allocation for all four areas needs to be USIZE+1.
+
+ Preconditions: 1) U >= V.
+ 2) V > 0. */
+
+/* We use Lehmer's algorithm. The idea is to extract the most significant
+ bits of the operands, and compute the continued fraction for them. We then
+ apply the gathered cofactors to the full operands.
+
+ Idea 1: After we have performed a full division, don't shift operands back,
+ but instead account for the extra factors-of-2 thus introduced.
+ Idea 2: Simple generalization to use divide-and-conquer would give us an
+ algorithm that runs faster than O(n^2).
+ Idea 3: The input numbers need less space as the computation progresses,
+ while the s0 and s1 variables need more space. To save memory, we
+ could make them share space, and have the latter variables grow
+ into the former.
+ Idea 4: We should not do double-limb arithmetic from the start. Instead,
+ do things in single-limb arithmetic until the quotients differ,
+ and then switch to double-limb arithmetic. */
+
+
+/* Division optimized for small quotients. If the quotient is more than one limb,
+ store 1 in *qh and return 0. */
+/* Divides the two-limb value {n1,n0} by the two-limb value {d1,d0} by
+ shift-and-subtract; used by the Lehmer steps in mpn_gcdext below. */
+static mp_limb_t
+#if __STDC__
+div2 (mp_limb_t *qh, mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0)
+#else
+div2 (qh, n1, n0, d1, d0)
+ mp_limb_t *qh;
+ mp_limb_t n1;
+ mp_limb_t n0;
+ mp_limb_t d1;
+ mp_limb_t d0;
+#endif
+{
+ /* A one-limb divisor means the quotient may exceed one limb: give up. */
+ if (d1 == 0)
+ {
+ *qh = 1;
+ return 0;
+ }
+
+ if ((mp_limb_signed_t) n1 < 0)
+ {
+ /* N has its top bit set: align D's top bit with N's before the
+ shift-subtract loop (the n1 >= d1 comparison used below would not
+ terminate the alignment loop here). */
+ mp_limb_t q;
+ int cnt;
+ for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++)
+ {
+ d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
+ d0 = d0 << 1;
+ }
+
+ q = 0;
+ while (cnt)
+ {
+ q <<= 1;
+ if (n1 > d1 || (n1 == d1 && n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ q |= 1;
+ }
+ d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
+ d1 = d1 >> 1;
+ cnt--;
+ }
+
+ *qh = 0;
+ return q;
+ }
+ else
+ {
+ /* Normal case: shift D up until it exceeds N, then develop the
+ quotient one bit per step while shifting D back down. */
+ mp_limb_t q;
+ int cnt;
+ for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++)
+ {
+ d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
+ d0 = d0 << 1;
+ }
+
+ q = 0;
+ while (cnt)
+ {
+ d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
+ d1 = d1 >> 1;
+ q <<= 1;
+ if (n1 > d1 || (n1 == d1 && n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ q |= 1;
+ }
+ cnt--;
+ }
+
+ *qh = 0;
+ return q;
+ }
+}
+
+/* When EXTEND is set this compiles as mpn_gcdext (gcd plus first cofactor,
+ contract in the block comment near the top of the file); with EXTEND == 0
+ the same body compiles as a plain mpn_gcd. Uses Lehmer's algorithm (see
+ the algorithm notes above): cofactors A,B,C,D are gathered from the
+ leading limb(s) and then applied to the full operands. */
+mp_size_t
+#if EXTEND
+#if __STDC__
+mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
+ mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcdext (gp, s0p, s0size, up, size, vp, vsize)
+ mp_ptr gp;
+ mp_ptr s0p;
+ mp_size_t *s0size;
+ mp_ptr up;
+ mp_size_t size;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+#else
+#if __STDC__
+mpn_gcd (mp_ptr gp,
+ mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcd (gp, up, size, vp, vsize)
+ mp_ptr gp;
+ mp_ptr up;
+ mp_size_t size;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+#endif
+{
+ mp_limb_t A, B, C, D;
+ int cnt;
+ mp_ptr tp, wp;
+#if RECORD
+ mp_limb_t max = 0;
+#endif
+#if EXTEND
+ mp_ptr s1p;
+ mp_ptr orig_s0p = s0p;
+ mp_size_t ssize;
+ int sign = 1;
+#endif
+ int use_double_flag;
+ TMP_DECL (mark);
+
+ TMP_MARK (mark);
+
+ /* For large operands, extract two leading limbs (not one) when gathering
+ the cofactors — see the double/single branches in the main loop. */
+ use_double_flag = (size >= GCDEXT_THRESHOLD);
+
+ tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+ wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+#if EXTEND
+ s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+
+ MPN_ZERO (s0p, size);
+ MPN_ZERO (s1p, size);
+
+ /* Cofactor recurrence starts from (s0, s1) = (1, 0). */
+ s0p[0] = 1;
+ s1p[0] = 0;
+ ssize = 1;
+#endif
+
+ /* If U is longer than V, do one full division first so both operands
+ have length vsize. */
+ if (size > vsize)
+ {
+ /* Normalize V (and shift up U the same amount). */
+ count_leading_zeros (cnt, vp[vsize - 1]);
+ if (cnt != 0)
+ {
+ mp_limb_t cy;
+ mpn_lshift (vp, vp, vsize, cnt);
+ cy = mpn_lshift (up, up, size, cnt);
+ up[size] = cy;
+ size += cy != 0;
+ }
+
+ mpn_divmod (up + vsize, up, size, vp, vsize);
+#if EXTEND
+ /* This is really what it boils down to in this case... */
+ s0p[0] = 0;
+ s1p[0] = 1;
+ sign = -sign;
+#endif
+ size = vsize;
+ if (cnt != 0)
+ {
+ mpn_rshift (up, up, size, cnt);
+ mpn_rshift (vp, vp, size, cnt);
+ }
+ MP_PTR_SWAP (up, vp);
+ }
+
+ /* Main reduction loop; runs until V fits in a single limb. */
+ for (;;)
+ {
+ mp_limb_t asign;
+ /* Figure out exact size of V. */
+ vsize = size;
+ MPN_NORMALIZE (vp, vsize);
+ if (vsize <= 1)
+ break;
+
+ if (use_double_flag)
+ {
+ mp_limb_t uh, vh, ul, vl;
+ /* Let UH,UL be the most significant limbs of U, and let VH,VL be
+ the corresponding bits from V. */
+ uh = up[size - 1];
+ vh = vp[size - 1];
+ ul = up[size - 2];
+ vl = vp[size - 2];
+ count_leading_zeros (cnt, uh);
+ if (cnt != 0)
+ {
+ uh = (uh << cnt) | (ul >> (BITS_PER_MP_LIMB - cnt));
+ vh = (vh << cnt) | (vl >> (BITS_PER_MP_LIMB - cnt));
+ vl <<= cnt;
+ ul <<= cnt;
+ if (size >= 3)
+ {
+ ul |= (up[size - 3] >> (BITS_PER_MP_LIMB - cnt));
+ vl |= (vp[size - 3] >> (BITS_PER_MP_LIMB - cnt));
+ }
+ }
+
+ A = 1;
+ B = 0;
+ C = 0;
+ D = 1;
+
+ asign = 0;
+ /* Gather cofactors with two-limb precision; quit as soon as the
+ two quotient estimates disagree or a quotient overflows. */
+ for (;;)
+ {
+ mp_limb_t T;
+ mp_limb_t qh, q1, q2;
+ mp_limb_t nh, nl, dh, dl;
+ mp_limb_t t1, t0;
+ mp_limb_t Th, Tl;
+
+ sub_ddmmss (dh, dl, vh, vl, 0, C);
+ if ((dl | dh) == 0)
+ break;
+ add_ssaaaa (nh, nl, uh, ul, 0, A);
+ q1 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ add_ssaaaa (dh, dl, vh, vl, 0, D);
+ if ((dl | dh) == 0)
+ break;
+ sub_ddmmss (nh, nl, uh, ul, 0, B);
+ q2 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ if (q1 != q2)
+ break;
+
+ asign = ~asign;
+
+ T = A + q1 * C;
+ A = C;
+ C = T;
+ T = B + q1 * D;
+ B = D;
+ D = T;
+ umul_ppmm (t1, t0, q1, vl);
+ t1 += q1 * vh;
+ sub_ddmmss (Th, Tl, uh, ul, t1, t0);
+ uh = vh, ul = vl;
+ vh = Th, vl = Tl;
+
+ add_ssaaaa (dh, dl, vh, vl, 0, C);
+ sub_ddmmss (nh, nl, uh, ul, 0, A);
+ q1 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ sub_ddmmss (dh, dl, vh, vl, 0, D);
+ if ((dl | dh) == 0)
+ break;
+ add_ssaaaa (nh, nl, uh, ul, 0, B);
+ q2 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ if (q1 != q2)
+ break;
+
+ asign = ~asign;
+
+ T = A + q1 * C;
+ A = C;
+ C = T;
+ T = B + q1 * D;
+ B = D;
+ D = T;
+ umul_ppmm (t1, t0, q1, vl);
+ t1 += q1 * vh;
+ sub_ddmmss (Th, Tl, uh, ul, t1, t0);
+ uh = vh, ul = vl;
+ vh = Th, vl = Tl;
+ }
+#if EXTEND
+ if (asign)
+ sign = -sign;
+#endif
+ }
+ else /* Same, but using single-limb calculations. */
+ {
+ mp_limb_t uh, vh;
+ /* Make UH be the most significant limb of U, and make VH be
+ corresponding bits from V. */
+ uh = up[size - 1];
+ vh = vp[size - 1];
+ count_leading_zeros (cnt, uh);
+ if (cnt != 0)
+ {
+ uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt));
+ vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt));
+ }
+
+ A = 1;
+ B = 0;
+ C = 0;
+ D = 1;
+
+ asign = 0;
+ for (;;)
+ {
+ mp_limb_t q, T;
+ if (vh - C == 0 || vh + D == 0)
+ break;
+
+ q = (uh + A) / (vh - C);
+ if (q != (uh - B) / (vh + D))
+ break;
+
+ asign = ~asign;
+
+ T = A + q * C;
+ A = C;
+ C = T;
+ T = B + q * D;
+ B = D;
+ D = T;
+ T = uh - q * vh;
+ uh = vh;
+ vh = T;
+
+ if (vh - D == 0)
+ break;
+
+ q = (uh - A) / (vh + C);
+ if (q != (uh + B) / (vh - D))
+ break;
+
+ asign = ~asign;
+
+ T = A + q * C;
+ A = C;
+ C = T;
+ T = B + q * D;
+ B = D;
+ D = T;
+ T = uh - q * vh;
+ uh = vh;
+ vh = T;
+ }
+#if EXTEND
+ if (asign)
+ sign = -sign;
+#endif
+ }
+
+#if RECORD
+ max = MAX (A, max); max = MAX (B, max);
+ max = MAX (C, max); max = MAX (D, max);
+#endif
+
+ if (B == 0)
+ {
+ mp_limb_t qh;
+ mp_size_t i;
+ /* This is quite rare. I.e., optimize something else! */
+
+ /* No Lehmer progress was made: fall back to one full-precision
+ division step (and the matching cofactor update). */
+ /* Normalize V (and shift up U the same amount). */
+ count_leading_zeros (cnt, vp[vsize - 1]);
+ if (cnt != 0)
+ {
+ mp_limb_t cy;
+ mpn_lshift (vp, vp, vsize, cnt);
+ cy = mpn_lshift (up, up, size, cnt);
+ up[size] = cy;
+ size += cy != 0;
+ }
+
+ qh = mpn_divmod (up + vsize, up, size, vp, vsize);
+#if EXTEND
+ MPN_COPY (tp, s0p, ssize);
+ {
+ mp_size_t qsize;
+
+ qsize = size - vsize; /* size of stored quotient from division */
+ if (ssize < qsize)
+ {
+ MPN_ZERO (tp + ssize, qsize - ssize);
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < ssize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, up + vsize, qsize, s1p[i]);
+ tp[qsize + i] = cy;
+ }
+ if (qh != 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
+ if (cy != 0)
+ abort ();
+ }
+ }
+ else
+ {
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < qsize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]);
+ tp[ssize + i] = cy;
+ }
+ if (qh != 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
+ if (cy != 0)
+ {
+ tp[qsize + ssize] = cy;
+ s1p[qsize + ssize] = 0;
+ ssize++;
+ }
+ }
+ }
+ ssize += qsize;
+ ssize -= tp[ssize - 1] == 0;
+ }
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#endif
+ size = vsize;
+ if (cnt != 0)
+ {
+ mpn_rshift (up, up, size, cnt);
+ mpn_rshift (vp, vp, size, cnt);
+ }
+ MP_PTR_SWAP (up, vp);
+ }
+ else
+ {
+#if EXTEND
+ mp_size_t tsize, wsize;
+#endif
+ /* T = U*A + V*B
+ W = U*C + V*D
+ U = T
+ V = W */
+
+#if STAT
+ { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x);
+ arr[BITS_PER_MP_LIMB - cnt]++; }
+#endif
+ if (A == 0)
+ {
+ /* B == 1 and C == 1 (D is arbitrary) */
+ mp_limb_t cy;
+ MPN_COPY (tp, vp, size);
+ MPN_COPY (wp, up, size);
+ mpn_submul_1 (wp, vp, size, D);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ MPN_COPY (tp, s1p, ssize);
+ tsize = ssize;
+ tp[ssize] = 0; /* must zero since wp might spill below */
+ MPN_COPY (wp, s0p, ssize);
+ cy = mpn_addmul_1 (wp, s1p, ssize, D);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ else
+ {
+ /* asign selects which of the two matrix orientations keeps all
+ intermediate differences non-negative. */
+ if (asign)
+ {
+ mp_limb_t cy;
+ mpn_mul_1 (tp, vp, size, B);
+ mpn_submul_1 (tp, up, size, A);
+ mpn_mul_1 (wp, up, size, C);
+ mpn_submul_1 (wp, vp, size, D);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ cy = mpn_mul_1 (tp, s1p, ssize, B);
+ cy += mpn_addmul_1 (tp, s0p, ssize, A);
+ tp[ssize] = cy;
+ tsize = ssize + (cy != 0);
+ cy = mpn_mul_1 (wp, s0p, ssize, C);
+ cy += mpn_addmul_1 (wp, s1p, ssize, D);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ else
+ {
+ mp_limb_t cy;
+ mpn_mul_1 (tp, up, size, A);
+ mpn_submul_1 (tp, vp, size, B);
+ mpn_mul_1 (wp, vp, size, D);
+ mpn_submul_1 (wp, up, size, C);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ cy = mpn_mul_1 (tp, s0p, ssize, A);
+ cy += mpn_addmul_1 (tp, s1p, ssize, B);
+ tp[ssize] = cy;
+ tsize = ssize + (cy != 0);
+ cy = mpn_mul_1 (wp, s1p, ssize, D);
+ cy += mpn_addmul_1 (wp, s0p, ssize, C);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ }
+
+ size -= up[size - 1] == 0;
+ }
+ }
+
+#if RECORD
+ printf ("max: %lx\n", max);
+#endif
+
+#if STAT
+ {int i; for (i = 0; i < BITS_PER_MP_LIMB; i++) printf ("%d:%d\n", i, arr[i]);}
+#endif
+
+ /* V now has at most one limb: finish with single-limb Euclid, still
+ updating the s0/s1 cofactors when EXTEND. */
+ if (vsize == 0)
+ {
+ if (gp != up && gp != 0)
+ MPN_COPY (gp, up, size);
+#if EXTEND
+ MPN_NORMALIZE (s0p, ssize);
+ if (orig_s0p != s0p)
+ MPN_COPY (orig_s0p, s0p, ssize);
+ *s0size = sign >= 0 ? ssize : -ssize;
+#endif
+ TMP_FREE (mark);
+ return size;
+ }
+ else
+ {
+ mp_limb_t vl, ul, t;
+#if EXTEND
+ mp_size_t qsize, i;
+#endif
+ vl = vp[0];
+#if EXTEND
+ t = mpn_divmod_1 (wp, up, size, vl);
+
+ MPN_COPY (tp, s0p, ssize);
+
+ qsize = size - (wp[size - 1] == 0); /* size of quotient from division */
+ if (ssize < qsize)
+ {
+ MPN_ZERO (tp + ssize, qsize - ssize);
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < ssize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]);
+ tp[qsize + i] = cy;
+ }
+ }
+ else
+ {
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < qsize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]);
+ tp[ssize + i] = cy;
+ }
+ }
+ ssize += qsize;
+ ssize -= tp[ssize - 1] == 0;
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#else
+ t = mpn_mod_1 (up, size, vl);
+#endif
+ ul = vl;
+ vl = t;
+ while (vl != 0)
+ {
+ mp_limb_t t;
+#if EXTEND
+ mp_limb_t q;
+ q = ul / vl;
+ t = ul - q * vl;
+
+ MPN_COPY (tp, s0p, ssize);
+
+ MPN_ZERO (s1p + ssize, 1); /* zero s1 too */
+
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp, s1p, ssize, q);
+ tp[ssize] = cy;
+ }
+
+ ssize += 1;
+ ssize -= tp[ssize - 1] == 0;
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#else
+ t = ul % vl;
+#endif
+ ul = vl;
+ vl = t;
+ }
+ if (gp != 0)
+ gp[0] = ul;
+#if EXTEND
+ MPN_NORMALIZE (s0p, ssize);
+ if (orig_s0p != s0p)
+ MPN_COPY (orig_s0p, s0p, ssize);
+ *s0size = sign >= 0 ? ssize : -ssize;
+#endif
+ TMP_FREE (mark);
+ return 1;
+ }
+}
diff --git a/rts/gmp/mpn/generic/get_str.c b/rts/gmp/mpn/generic/get_str.c
new file mode 100644
index 0000000000..a713b61825
--- /dev/null
+++ b/rts/gmp/mpn/generic/get_str.c
@@ -0,0 +1,216 @@
+/* mpn_get_str -- Convert a MSIZE long limb vector pointed to by MPTR
+ to a printable string in STR in base BASE.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Convert the limb vector pointed to by MPTR and MSIZE long to a
+ char array, using base BASE for the result array. Store the
+ result in the character array STR. STR must point to an array with
+ space for the largest possible number represented by a MSIZE long
+ limb vector + 1 extra character.
+
+ The result is NOT in Ascii, to convert it to printable format, add
+ '0' or 'A' depending on the base and range.
+
+ Return the number of digits in the result string.
+ This may include some leading zeros.
+
+ The limb vector pointed to by MPTR is clobbered. */
+
+size_t
+#if __STDC__
+mpn_get_str (unsigned char *str, int base, mp_ptr mptr, mp_size_t msize)
+#else
+mpn_get_str (str, base, mptr, msize)
+     unsigned char *str;
+     int base;
+     mp_ptr mptr;
+     mp_size_t msize;
+#endif
+{
+  mp_limb_t big_base;
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+  int normalization_steps;
+#endif
+#if UDIV_TIME > 2 * UMUL_TIME
+  mp_limb_t big_base_inverted;
+#endif
+  unsigned int dig_per_u;
+  mp_size_t out_len;
+  register unsigned char *s;
+
+  /* For a power-of-2 base, big_base is the number of bits per digit;
+     otherwise it is the largest power of the base fitting in one limb
+     (see the __mp_bases table).  */
+  big_base = __mp_bases[base].big_base;
+
+  s = str;
+
+  /* Special case zero, as the code below doesn't handle it. */
+  if (msize == 0)
+    {
+      s[0] = 0;
+      return 1;
+    }
+
+  if ((base & (base - 1)) == 0)
+    {
+      /* The base is a power of 2.  Make conversion from most
+         significant side.  */
+      mp_limb_t n1, n0;
+      register int bits_per_digit = big_base;
+      register int x;
+      register int bit_pos;
+      register int i;
+
+      n1 = mptr[msize - 1];
+      count_leading_zeros (x, n1);
+
+      /* BIT_POS should be R when input ends in least sign. nibble,
+         R + bits_per_digit * n when input ends in n:th least significant
+         nibble. */
+
+      {
+        int bits;
+
+        /* Round the significant bit count up to a whole number of digits,
+           then express the starting bit position relative to the top limb. */
+        bits = BITS_PER_MP_LIMB * msize - x;
+        x = bits % bits_per_digit;
+        if (x != 0)
+          bits += bits_per_digit - x;
+        bit_pos = bits - (msize - 1) * BITS_PER_MP_LIMB;
+      }
+
+      /* Fast loop for bit output. */
+      i = msize - 1;
+      for (;;)
+        {
+          bit_pos -= bits_per_digit;
+          while (bit_pos >= 0)
+            {
+              *s++ = (n1 >> bit_pos) & ((1 << bits_per_digit) - 1);
+              bit_pos -= bits_per_digit;
+            }
+          i--;
+          if (i < 0)
+            break;
+          /* The next digit straddles a limb boundary: combine the low bits
+             of the current limb with the high bits of the next one.  */
+          n0 = (n1 << -bit_pos) & ((1 << bits_per_digit) - 1);
+          n1 = mptr[i];
+          bit_pos += BITS_PER_MP_LIMB;
+          *s++ = n0 | (n1 >> bit_pos);
+        }
+
+      *s = 0;
+
+      return s - str;
+    }
+  else
+    {
+      /* General case.  The base is not a power of 2.  Make conversion
+         from least significant end. */
+
+      /* If udiv_qrnnd only handles divisors with the most significant bit
+         set, prepare BIG_BASE for being a divisor by shifting it to the
+         left exactly enough to set the most significant bit. */
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+      count_leading_zeros (normalization_steps, big_base);
+      big_base <<= normalization_steps;
+#if UDIV_TIME > 2 * UMUL_TIME
+      /* Get the fixed-point approximation to 1/(BIG_BASE << NORMALIZATION_STEPS). */
+      big_base_inverted = __mp_bases[base].big_base_inverted;
+#endif
+#endif
+
+      /* out_len over-estimates (by at most 1) the exact digit count;
+         digits are generated right-to-left from the end of the buffer.  */
+      dig_per_u = __mp_bases[base].chars_per_limb;
+      out_len = ((size_t) msize * BITS_PER_MP_LIMB
+                 * __mp_bases[base].chars_per_bit_exactly) + 1;
+      s += out_len;
+
+      while (msize != 0)
+        {
+          int i;
+          mp_limb_t n0, n1;
+
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+          /* If we shifted BIG_BASE above, shift the dividend too, to get
+             the right quotient.  We need to do this every loop,
+             since the intermediate quotients are OK, but the quotient from
+             one turn in the loop is going to be the dividend in the
+             next turn, and the dividend needs to be up-shifted. */
+          if (normalization_steps != 0)
+            {
+              n0 = mpn_lshift (mptr, mptr, msize, normalization_steps);
+
+              /* If the shifting gave a carry out limb, store it and
+                 increase the length. */
+              if (n0 != 0)
+                {
+                  mptr[msize] = n0;
+                  msize++;
+                }
+            }
+#endif
+
+          /* Divide the number at TP with BIG_BASE to get a quotient and a
+             remainder.  The remainder is our new digit in base BIG_BASE. */
+          i = msize - 1;
+          n1 = mptr[i];
+
+          /* If the top limb is smaller than the divisor it becomes the
+             high half of the first two-limb dividend and the quotient is
+             one limb shorter.  */
+          if (n1 >= big_base)
+            n1 = 0;
+          else
+            {
+              msize--;
+              i--;
+            }
+
+          for (; i >= 0; i--)
+            {
+              n0 = mptr[i];
+#if UDIV_TIME > 2 * UMUL_TIME
+              udiv_qrnnd_preinv (mptr[i], n1, n1, n0, big_base, big_base_inverted);
+#else
+              udiv_qrnnd (mptr[i], n1, n1, n0, big_base);
+#endif
+            }
+
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+          /* If we shifted above (at previous UDIV_NEEDS_NORMALIZATION tests)
+             the remainder will be up-shifted here.  Compensate. */
+          n1 >>= normalization_steps;
+#endif
+
+          /* Convert N1 from BIG_BASE to a string of digits in BASE
+             using single precision operations. */
+          for (i = dig_per_u - 1; i >= 0; i--)
+            {
+              *--s = n1 % base;
+              n1 /= base;
+              if (n1 == 0 && msize == 0)
+                break;
+            }
+        }
+
+      /* Pad with leading zero digits so exactly OUT_LEN digits are returned. */
+      while (s != str)
+        *--s = 0;
+      return out_len;
+    }
+}
diff --git a/rts/gmp/mpn/generic/gmp-mparam.h b/rts/gmp/mpn/generic/gmp-mparam.h
new file mode 100644
index 0000000000..14bcaece83
--- /dev/null
+++ b/rts/gmp/mpn/generic/gmp-mparam.h
@@ -0,0 +1,27 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
diff --git a/rts/gmp/mpn/generic/hamdist.c b/rts/gmp/mpn/generic/hamdist.c
new file mode 100644
index 0000000000..35c10e8450
--- /dev/null
+++ b/rts/gmp/mpn/generic/hamdist.c
@@ -0,0 +1,94 @@
+/* mpn_hamdist --
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#if defined __GNUC__
+/* No processor claiming to be SPARC v9 compliant seem to
+ implement the POPC instruction. Disable pattern for now. */
+#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64
+#define popc_limb(a) \
+ ({ \
+ DItype __res; \
+ asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \
+ __res; \
+ })
+#endif
+#endif
+
+#ifndef popc_limb
+
+/* Cool population count of a mp_limb_t.
+ You have to figure out how this works, I won't tell you! */
+
+static inline unsigned int
+#if __STDC__
+popc_limb (mp_limb_t x)
+#else
+popc_limb (x)
+     mp_limb_t x;
+#endif
+{
+#if BITS_PER_MP_LIMB == 64
+  /* We have to go into some trouble to define these constants.
+     (For mp_limb_t being `long long'.) */
+  mp_limb_t cnst;
+  /* Classic tree reduction: count bits in 2-bit fields... */
+  cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2);
+  x -= (x & cnst) >> 1;
+  /* ...sum adjacent pairs into 4-bit fields... */
+  cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2);
+  x = ((x & ~cnst) >> 2) + (x & cnst);
+  /* ...sum into byte-wide counts... */
+  cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2);
+  x = ((x >> 4) + x) & cnst;
+  /* ...and fold the bytes together; the total fits in 8 bits.  */
+  x = ((x >> 8) + x);
+  x = ((x >> 16) + x);
+  x = ((x >> 32) + x) & 0xff;
+#endif
+#if BITS_PER_MP_LIMB == 32
+  /* Same tree reduction, 32-bit variant.  */
+  x -= (x & 0xaaaaaaaa) >> 1;
+  x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L);
+  x = ((x >> 4) + x) & 0x0f0f0f0fL;
+  x = ((x >> 8) + x);
+  x = ((x >> 16) + x) & 0xff;
+#endif
+  return x;
+}
+#endif
+
+/* Return the Hamming distance between {up,size} and {vp,size}: the total
+   number of bit positions in which the two operands differ.  */
+unsigned long int
+#if __STDC__
+mpn_hamdist (mp_srcptr up, mp_srcptr vp, mp_size_t size)
+#else
+mpn_hamdist (up, vp, size)
+     register mp_srcptr up;
+     register mp_srcptr vp;
+     register mp_size_t size;
+#endif
+{
+  unsigned long int hamdist;
+  mp_size_t i;
+
+  hamdist = 0;
+  /* XOR exposes the differing bits of each limb pair; popc_limb counts them. */
+  for (i = 0; i < size; i++)
+    hamdist += popc_limb (up[i] ^ vp[i]);
+
+  return hamdist;
+}
diff --git a/rts/gmp/mpn/generic/inlines.c b/rts/gmp/mpn/generic/inlines.c
new file mode 100644
index 0000000000..9487e58cf2
--- /dev/null
+++ b/rts/gmp/mpn/generic/inlines.c
@@ -0,0 +1,24 @@
+/*
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#define _FORCE_INLINES
+#define _EXTERN_INLINE /* empty */
+#include "gmp.h"
diff --git a/rts/gmp/mpn/generic/jacbase.c b/rts/gmp/mpn/generic/jacbase.c
new file mode 100644
index 0000000000..dd437f1ac1
--- /dev/null
+++ b/rts/gmp/mpn/generic/jacbase.c
@@ -0,0 +1,136 @@
+/* mpn_jacobi_base -- limb/limb Jacobi symbol with restricted arguments.
+
+ THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO
+ INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+#if COUNT_TRAILING_ZEROS_TIME <= 7
+/* If count_trailing_zeros is fast, use it.
+ K7 at 7 cycles and P6 at 2 are good here. K6 at 12-27 and P5 at 18-42
+ are not. The default 15 in longlong.h is meant to mean not good here. */
+
+#define PROCESS_TWOS_ANY \
+ { \
+ mp_limb_t twos; \
+ count_trailing_zeros (twos, a); \
+ result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b); \
+ a >>= twos; \
+ }
+
+#define PROCESS_TWOS_EVEN PROCESS_TWOS_ANY
+
+#else
+/* Use a loop instead. With "a" uniformly distributed there will usually be
+ only a few trailing zeros.
+
+ Unfortunately the branch for the while loop here will be on a 50/50
+ chance of a 1 or 0, which is bad for branch prediction. */
+
+#define PROCESS_TWOS_EVEN \
+ { \
+ int two; \
+ two = JACOBI_TWO_U_BIT1 (b); \
+ do \
+ { \
+ a >>= 1; \
+ result_bit1 ^= two; \
+ ASSERT (a != 0); \
+ } \
+ while ((a & 1) == 0); \
+ }
+
+#define PROCESS_TWOS_ANY \
+ if ((a & 1) == 0) \
+ PROCESS_TWOS_EVEN;
+
+#endif
+
+
+/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but
+ with a restricted range of inputs accepted, namely b>1, b odd, and a<=b.
+
+ The initial result_bit1 is taken as a parameter for the convenience of
+ mpz_kronecker_zi_ui() et al. The sign changes both here and in those
+ routines accumulate nicely in bit 1, see the JACOBI macros.
+
+ The return value here is the normal +1, 0, or -1. Note that +1 and -1
+ have bit 1 in the "BIT1" sense, which could be useful if the caller is
+ accumulating it into some extended calculation.
+
+ Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be
+ possible, but a couple of tests suggest it's not a significant speedup,
+ and may even be a slowdown, so what's here is good enough for now.
+
+ Future: The code doesn't demand a<=b actually, so maybe this could be
+ relaxed. All the places this is used currently call with a<=b though. */
+
+int
+#if __STDC__
+mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1)
+#else
+mpn_jacobi_base (a, b, result_bit1)
+     mp_limb_t a;
+     mp_limb_t b;
+     int result_bit1;
+#endif
+{
+  ASSERT (b & 1);  /* b odd */
+  ASSERT (b != 1);
+  ASSERT (a <= b);
+
+  /* (0/b) = 0 for b > 1.  */
+  if (a == 0)
+    return 0;
+
+  /* Strip factors of 2 from "a", accumulating the (2/b) sign in bit 1.  */
+  PROCESS_TWOS_ANY;
+  if (a == 1)
+    goto done;
+
+  /* Binary-GCD style reduction: apply reciprocity and swap, then
+     subtract b and strip twos until a drops below b again.  */
+  for (;;)
+    {
+      result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b);
+      MP_LIMB_T_SWAP (a, b);
+
+      do
+        {
+          /* working on (a/b), a,b odd, a>=b */
+          ASSERT (a & 1);
+          ASSERT (b & 1);
+          ASSERT (a >= b);
+
+          /* Reaching zero means gcd(a,b) = b > 1, hence (a/b) = 0.  */
+          if ((a -= b) == 0)
+            return 0;
+
+          PROCESS_TWOS_EVEN;
+          if (a == 1)
+            goto done;
+        }
+      while (a >= b);
+    }
+
+ done:
+  return JACOBI_BIT1_TO_PN (result_bit1);
+}
diff --git a/rts/gmp/mpn/generic/lshift.c b/rts/gmp/mpn/generic/lshift.c
new file mode 100644
index 0000000000..0b58389658
--- /dev/null
+++ b/rts/gmp/mpn/generic/lshift.c
@@ -0,0 +1,87 @@
+/* mpn_lshift -- Shift left low level.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left
+ and store the USIZE least significant digits of the result at WP.
+ Return the bits shifted out from the most significant digit.
+
+ Argument constraints:
+ 1. 0 < CNT < BITS_PER_MP_LIMB
+ 2. If the result is to be written over the input, WP must be >= UP.
+*/
+
+mp_limb_t
+#if __STDC__
+mpn_lshift (register mp_ptr wp,
+            register mp_srcptr up, mp_size_t usize,
+            register unsigned int cnt)
+#else
+mpn_lshift (wp, up, usize, cnt)
+     register mp_ptr wp;
+     register mp_srcptr up;
+     mp_size_t usize;
+     register unsigned int cnt;
+#endif
+{
+  register mp_limb_t high_limb, low_limb;
+  register unsigned sh_1, sh_2;
+  register mp_size_t i;
+  mp_limb_t retval;
+
+#ifdef DEBUG
+  if (usize == 0 || cnt == 0)
+    abort ();
+#endif
+
+  sh_1 = cnt;
+#if 0
+  /* Disabled: cnt == 0 is outside this function's contract (see argument
+     constraint 1 above), so this copy-only path is dead code.  */
+  if (sh_1 == 0)
+    {
+      if (wp != up)
+        {
+          /* Copy from high end to low end, to allow specified input/output
+             overlapping. */
+          for (i = usize - 1; i >= 0; i--)
+            wp[i] = up[i];
+        }
+      return 0;
+    }
+#endif
+
+  /* Work downward from the most significant limb so that wp >= up overlap
+     is safe.  wp is offset by one because result limb i is built from the
+     high bits of up[i-1] and the low bits of up[i] (via the wp[i-1] slot).  */
+  wp += 1;
+  sh_2 = BITS_PER_MP_LIMB - sh_1;
+  i = usize - 1;
+  low_limb = up[i];
+  retval = low_limb >> sh_2;   /* bits shifted out of the top limb */
+  high_limb = low_limb;
+  while (--i >= 0)
+    {
+      low_limb = up[i];
+      wp[i] = (high_limb << sh_1) | (low_limb >> sh_2);
+      high_limb = low_limb;
+    }
+  /* Here i == -1, so wp[i] is the original wp[0]: the least significant
+     result limb, with zeros shifted in from below.  */
+  wp[i] = high_limb << sh_1;
+
+  return retval;
+}
diff --git a/rts/gmp/mpn/generic/mod_1.c b/rts/gmp/mpn/generic/mod_1.c
new file mode 100644
index 0000000000..168ec9df49
--- /dev/null
+++ b/rts/gmp/mpn/generic/mod_1.c
@@ -0,0 +1,175 @@
+/* mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) --
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+Copyright (C) 1991, 1993, 1994, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+mp_limb_t
+#if __STDC__
+mpn_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size,
+           mp_limb_t divisor_limb)
+#else
+mpn_mod_1 (dividend_ptr, dividend_size, divisor_limb)
+     mp_srcptr dividend_ptr;
+     mp_size_t dividend_size;
+     mp_limb_t divisor_limb;
+#endif
+{
+  mp_size_t i;
+  mp_limb_t n1, n0, r;
+  int dummy;   /* receives the quotient limb from udiv_qrnnd; unused */
+
+  /* Botch: Should this be handled at all?  Rely on callers? */
+  if (dividend_size == 0)
+    return 0;
+
+  /* If multiplication is much faster than division, and the
+     dividend is large, pre-invert the divisor, and use
+     only multiplications in the inner loop. */
+
+  /* This test should be read:
+       Does it ever help to use udiv_qrnnd_preinv?
+         && Does what we save compensate for the inversion overhead? */
+  if (UDIV_TIME > (2 * UMUL_TIME + 6)
+      && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME)
+    {
+      int normalization_steps;
+
+      count_leading_zeros (normalization_steps, divisor_limb);
+      if (normalization_steps != 0)
+        {
+          mp_limb_t divisor_limb_inverted;
+
+          /* Normalize the divisor (set its top bit) and compute the
+             fixed-point inverse used by udiv_qrnnd_preinv.  */
+          divisor_limb <<= normalization_steps;
+          invert_limb (divisor_limb_inverted, divisor_limb);
+
+          /* The bits shifted off the top limb seed the partial remainder. */
+          n1 = dividend_ptr[dividend_size - 1];
+          r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+          /* Possible optimization:
+             if (r == 0
+                 && divisor_limb > ((n1 << normalization_steps)
+                                    | (dividend_ptr[dividend_size - 2] >> ...)))
+               ...one division less... */
+
+          /* Reduce limb by limb: each step divides the two-limb value
+             (r, next shifted-in limb) by the normalized divisor.  */
+          for (i = dividend_size - 2; i >= 0; i--)
+            {
+              n0 = dividend_ptr[i];
+              udiv_qrnnd_preinv (dummy, r, r,
+                                 ((n1 << normalization_steps)
+                                  | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+                                 divisor_limb, divisor_limb_inverted);
+              n1 = n0;
+            }
+          udiv_qrnnd_preinv (dummy, r, r,
+                             n1 << normalization_steps,
+                             divisor_limb, divisor_limb_inverted);
+          /* Undo the normalization shift of the remainder.  */
+          return r >> normalization_steps;
+        }
+      else
+        {
+          mp_limb_t divisor_limb_inverted;
+
+          invert_limb (divisor_limb_inverted, divisor_limb);
+
+          i = dividend_size - 1;
+          r = dividend_ptr[i];
+
+          /* The top limb may seed the remainder only if it is already
+             smaller than the divisor; otherwise fold it into the loop.  */
+          if (r >= divisor_limb)
+            r = 0;
+          else
+            i--;
+
+          for (; i >= 0; i--)
+            {
+              n0 = dividend_ptr[i];
+              udiv_qrnnd_preinv (dummy, r, r,
+                                 n0, divisor_limb, divisor_limb_inverted);
+            }
+          return r;
+        }
+    }
+  else
+    {
+      if (UDIV_NEEDS_NORMALIZATION)
+        {
+          int normalization_steps;
+
+          count_leading_zeros (normalization_steps, divisor_limb);
+          if (normalization_steps != 0)
+            {
+              /* Same normalized reduction as above, but with plain
+                 udiv_qrnnd instead of the pre-inverted variant.  */
+              divisor_limb <<= normalization_steps;
+
+              n1 = dividend_ptr[dividend_size - 1];
+              r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+              /* Possible optimization:
+                 if (r == 0
+                     && divisor_limb > ((n1 << normalization_steps)
+                                        | (dividend_ptr[dividend_size - 2] >> ...)))
+                   ...one division less... */
+
+              for (i = dividend_size - 2; i >= 0; i--)
+                {
+                  n0 = dividend_ptr[i];
+                  udiv_qrnnd (dummy, r, r,
+                              ((n1 << normalization_steps)
+                               | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+                              divisor_limb);
+                  n1 = n0;
+                }
+              udiv_qrnnd (dummy, r, r,
+                          n1 << normalization_steps,
+                          divisor_limb);
+              return r >> normalization_steps;
+            }
+        }
+      /* No normalization needed, either because udiv_qrnnd doesn't require
+         it, or because DIVISOR_LIMB is already normalized. */
+
+      i = dividend_size - 1;
+      r = dividend_ptr[i];
+
+      if (r >= divisor_limb)
+        r = 0;
+      else
+        i--;
+
+      for (; i >= 0; i--)
+        {
+          n0 = dividend_ptr[i];
+          udiv_qrnnd (dummy, r, r, n0, divisor_limb);
+        }
+      return r;
+    }
+}
diff --git a/rts/gmp/mpn/generic/mod_1_rs.c b/rts/gmp/mpn/generic/mod_1_rs.c
new file mode 100644
index 0000000000..62aaa94b92
--- /dev/null
+++ b/rts/gmp/mpn/generic/mod_1_rs.c
@@ -0,0 +1,111 @@
+/* mpn_mod_1_rshift -- mpn remainder under hypothetical right shift.
+
+ THE FUNCTION IN THIS FILE IS FOR INTERNAL USE AND HAS A MUTABLE
+ INTERFACE. IT IS ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.
+ IT'S ALMOST GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP
+ RELEASE. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* When testing on a CPU with UDIV_NEEDS_NORMALIZATION equal to 0, it can be
+ changed to 1 temporarily to test the code under that case too. */
+#if 0
+#undef UDIV_NEEDS_NORMALIZATION
+#define UDIV_NEEDS_NORMALIZATION 1
+#endif
+
+
+/* Calculate the remainder "(ptr,size >> shift) % divisor". Note ptr,size
+ is unchanged, the shift is only for its effect on the remainder.
+ The shift doesn't even need to be considered until the last limb.
+
+ This function has the normal size!=0 restriction, unlike the basic
+ mpn_mod_1. */
+
+mp_limb_t
+#if __STDC__
+mpn_mod_1_rshift (mp_srcptr ptr, mp_size_t size, unsigned shift,
+                  mp_limb_t divisor)
+#else
+mpn_mod_1_rshift (ptr, size, shift, divisor)
+     mp_srcptr ptr;
+     mp_size_t size;
+     unsigned shift;
+     mp_limb_t divisor;
+#endif
+{
+  mp_limb_t quot, rem;
+
+  ASSERT (shift >= 1);
+  ASSERT (shift < BITS_PER_MP_LIMB);
+  ASSERT (size >= 1);
+
+  /* Single limb: perform the hypothetical shift directly.  */
+  if (size == 1)
+    return (ptr[0] >> shift) % divisor;
+
+#if UDIV_NEEDS_NORMALIZATION
+  {
+    int norm;
+    int delta;
+
+    /* Normalize the divisor, then express the wanted remainder via
+       mpn_mod_1 on unshifted limbs plus one fix-up udiv_qrnnd.  */
+    count_leading_zeros (norm, divisor);
+    divisor <<= norm;
+
+    /* delta is the net shift still to apply after normalization.  */
+    delta = shift - norm;
+    if (delta == 0)
+      return mpn_mod_1 (ptr, size, divisor) >> norm;
+
+    if (delta > 0)
+      {
+        /* Reduce the high limbs first, then fold in the surviving bits
+           of ptr[0] with a final two-limb division.  */
+        rem = mpn_mod_1 (ptr+1, size-1, divisor);
+        udiv_qrnnd (quot, rem,
+                    rem >> delta,
+                    (rem << (BITS_PER_MP_LIMB-delta)) | (ptr[0] >> delta),
+                    divisor);
+        return rem >> norm;
+      }
+    else
+      {
+        /* delta < 0: the remainder must be shifted up before the final
+           reduction.  */
+        rem = mpn_mod_1 (ptr, size, divisor);
+        udiv_qrnnd (quot, rem,
+                    rem >> (BITS_PER_MP_LIMB+delta),
+                    rem << -delta,
+                    divisor);
+        return rem >> norm;
+      }
+  }
+
+#else /* !UDIV_NEEDS_NORMALIZATION */
+
+  /* Reduce the high limbs, then fold in ptr[0] >> shift.  */
+  rem = mpn_mod_1 (ptr+1, size-1, divisor);
+  udiv_qrnnd (quot, rem,
+              rem >> shift,
+              (rem << (BITS_PER_MP_LIMB-shift)) | (ptr[0] >> shift),
+              divisor);
+  return rem;
+
+#endif
+}
diff --git a/rts/gmp/mpn/generic/mul.c b/rts/gmp/mpn/generic/mul.c
new file mode 100644
index 0000000000..cecfa19ca1
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul.c
@@ -0,0 +1,190 @@
+/* mpn_mul -- Multiply two natural numbers.
+
+ THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul)
+ ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
+ THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
+ THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
+ (pointed to by VP, with VN limbs), and store the result at PRODP. The
+ result is UN + VN limbs. Return the most significant limb of the result.
+
+ NOTE: The space pointed to by PRODP is overwritten before finished with U
+ and V, so overlap is an error.
+
+ Argument constraints:
+ 1. UN >= VN.
+ 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
+ the multiplier and the multiplicand. */
+
+/* Square {up,un}, storing the 2*un limb result at PRODP.  Dispatches to
+   basecase, Karatsuba, Toom-3 or (if enabled) FFT squaring by size.  */
+void
+#if __STDC__
+mpn_sqr_n (mp_ptr prodp,
+           mp_srcptr up, mp_size_t un)
+#else
+mpn_sqr_n (prodp, up, un)
+     mp_ptr prodp;
+     mp_srcptr up;
+     mp_size_t un;
+#endif
+{
+  if (un < KARATSUBA_SQR_THRESHOLD)
+    { /* plain schoolbook squaring */
+      if (un == 0)
+        return;
+      mpn_sqr_basecase (prodp, up, un);
+    }
+  else if (un < TOOM3_SQR_THRESHOLD)
+    { /* karatsuba squaring */
+      mp_ptr tspace;
+      TMP_DECL (marker);
+      TMP_MARK (marker);
+      tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB);
+      mpn_kara_sqr_n (prodp, up, un, tspace);
+      TMP_FREE (marker);
+    }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+  else if (un < FFT_SQR_THRESHOLD)
+#else
+  else
+#endif
+    { /* toom3 squaring */
+      mp_ptr tspace;
+      TMP_DECL (marker);
+      TMP_MARK (marker);
+      tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB);
+      mpn_toom3_sqr_n (prodp, up, un, tspace);
+      TMP_FREE (marker);
+    }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+  else
+    {
+      /* schoenhage (FFT) squaring */
+      mpn_mul_fft_full (prodp, up, un, up, un);
+    }
+#endif
+}
+
+mp_limb_t
+#if __STDC__
+mpn_mul (mp_ptr prodp,
+         mp_srcptr up, mp_size_t un,
+         mp_srcptr vp, mp_size_t vn)
+#else
+mpn_mul (prodp, up, un, vp, vn)
+     mp_ptr prodp;
+     mp_srcptr up;
+     mp_size_t un;
+     mp_srcptr vp;
+     mp_size_t vn;
+#endif
+{
+  mp_size_t l;
+  mp_limb_t c;
+
+  /* Identical operands: squaring has its own faster path.  */
+  if (up == vp && un == vn)
+    {
+      mpn_sqr_n (prodp, up, un);
+      return prodp[2 * un - 1];
+    }
+
+  if (vn < KARATSUBA_MUL_THRESHOLD)
+    { /* long multiplication */
+      mpn_mul_basecase (prodp, up, un, vp, vn);
+      return prodp[un + vn - 1];
+    }
+
+  /* Karatsuba range: multiply the low vn limbs of u by v, then walk up
+     through u in vn-limb slices, accumulating each partial product.  */
+  mpn_mul_n (prodp, up, vp, vn);
+  if (un != vn)
+    { mp_limb_t t;
+      mp_ptr ws;
+      TMP_DECL (marker);
+      TMP_MARK (marker);
+
+      prodp += vn;
+      l = vn;       /* limbs of the accumulated tail not yet finalized */
+      up += vn;
+      un -= vn;
+
+      if (un < vn)
+        {
+          /* Swap u's and v's. */
+          MPN_SRCPTR_SWAP (up,un, vp,vn);
+        }
+
+      ws = (mp_ptr) TMP_ALLOC (((vn >= KARATSUBA_MUL_THRESHOLD ? vn : un) + vn)
+                               * BYTES_PER_MP_LIMB);
+
+      t = 0;        /* running carry out of the accumulated tail */
+      while (vn >= KARATSUBA_MUL_THRESHOLD)
+        {
+          mpn_mul_n (ws, up, vp, vn);
+          /* Add the 2*vn limb partial product into the tail at prodp.  */
+          if (l <= 2*vn)
+            {
+              t += mpn_add_n (prodp, prodp, ws, l);
+              if (l != 2*vn)
+                {
+                  t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t);
+                  l = 2*vn;
+                }
+            }
+          else
+            {
+              c = mpn_add_n (prodp, prodp, ws, 2*vn);
+              t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c);
+            }
+          prodp += vn;
+          l -= vn;
+          up += vn;
+          un -= vn;
+          if (un < vn)
+            {
+              /* Swap u's and v's. */
+              MPN_SRCPTR_SWAP (up,un, vp,vn);
+            }
+        }
+
+      /* Final slice is below the Karatsuba threshold: finish with a
+         basecase multiply and fold it into the tail the same way.  */
+      if (vn)
+        {
+          mpn_mul_basecase (ws, up, un, vp, vn);
+          if (l <= un + vn)
+            {
+              t += mpn_add_n (prodp, prodp, ws, l);
+              if (l != un + vn)
+                t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t);
+            }
+          else
+            {
+              c = mpn_add_n (prodp, prodp, ws, un + vn);
+              t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c);
+            }
+        }
+
+      TMP_FREE (marker);
+    }
+  return prodp[un + vn - 1];
+}
diff --git a/rts/gmp/mpn/generic/mul_1.c b/rts/gmp/mpn/generic/mul_1.c
new file mode 100644
index 0000000000..1c36b5fb1f
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_1.c
@@ -0,0 +1,59 @@
+/* mpn_mul_1 -- Multiply a limb vector with a single limb and
+ store the product in a second limb vector.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Multiply {s1_ptr,s1_size} by the single limb s2_limb, store the low
+   s1_size limbs of the product at res_ptr, and return the high limb.  */
+mp_limb_t
+mpn_mul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+     register mp_ptr res_ptr;
+     register mp_srcptr s1_ptr;
+     mp_size_t s1_size;
+     register mp_limb_t s2_limb;
+{
+  register mp_limb_t cy_limb;
+  register mp_size_t j;
+  register mp_limb_t prod_high, prod_low;
+
+  /* The loop counter and index J goes from -S1_SIZE to -1.  This way
+     the loop becomes faster. */
+  j = -s1_size;
+
+  /* Offset the base pointers to compensate for the negative indices. */
+  s1_ptr -= j;
+  res_ptr -= j;
+
+  cy_limb = 0;
+  do
+    {
+      /* Full two-limb product of source limb and multiplier.  */
+      umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+      /* Add the previous carry; (prod_low < cy_limb) detects the carry
+         out of that addition.  */
+      prod_low += cy_limb;
+      cy_limb = (prod_low < cy_limb) + prod_high;
+
+      res_ptr[j] = prod_low;
+    }
+  while (++j != 0);
+
+  /* Most significant (carry-out) limb of the product.  */
+  return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/mul_basecase.c b/rts/gmp/mpn/generic/mul_basecase.c
new file mode 100644
index 0000000000..00c06aa5c4
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_basecase.c
@@ -0,0 +1,87 @@
+/* mpn_mul_basecase -- Internal routine to multiply two natural numbers
+ of length m and n.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Handle simple cases with traditional multiplication.
+
+ This is the most critical code of multiplication. All multiplies rely on
+ this, both small and huge. Small ones arrive here immediately, huge ones
+ arrive here as this is the base case for Karatsuba's recursive algorithm. */
+
+void
+#if __STDC__
+mpn_mul_basecase (mp_ptr prodp,
+ mp_srcptr up, mp_size_t usize,
+ mp_srcptr vp, mp_size_t vsize)
+#else
+mpn_mul_basecase (prodp, up, usize, vp, vsize)
+ mp_ptr prodp;
+ mp_srcptr up;
+ mp_size_t usize;
+ mp_srcptr vp;
+ mp_size_t vsize;
+#endif
+{
+ /* We first multiply by the low order one or two limbs, as the result can
+ be stored, not added, to PROD. We also avoid a loop for zeroing this
+ way. */
+#if HAVE_NATIVE_mpn_mul_2
+ if (vsize >= 2)
+ {
+ prodp[usize + 1] = mpn_mul_2 (prodp, up, usize, vp[0], vp[1]);
+ prodp += 2, vp += 2, vsize -= 2;
+ }
+ else
+ {
+ prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]);
+ return;
+ }
+#else
+ prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]);
+ prodp += 1, vp += 1, vsize -= 1;
+#endif
+
+#if HAVE_NATIVE_mpn_addmul_2
+ while (vsize >= 2)
+ {
+ prodp[usize + 1] = mpn_addmul_2 (prodp, up, usize, vp[0], vp[1]);
+ prodp += 2, vp += 2, vsize -= 2;
+ }
+ if (vsize != 0)
+ prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]);
+#else
+ /* For each iteration in the loop, multiply U with one limb from V, and
+ add the result to PROD. */
+ while (vsize != 0)
+ {
+ prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]);
+ prodp += 1, vp += 1, vsize -= 1;
+ }
+#endif
+}
diff --git a/rts/gmp/mpn/generic/mul_fft.c b/rts/gmp/mpn/generic/mul_fft.c
new file mode 100644
index 0000000000..00fd6d72de
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_fft.c
@@ -0,0 +1,772 @@
+/* An implementation in GMP of Scho"nhage's fast multiplication algorithm
+ modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998.
+
+ THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE
+ MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
+ INTERFACES. IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN
+ A FUTURE GNU MP RELEASE.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+/* References:
+
+ Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker
+ Strassen, Computing 7, p. 281-292, 1971.
+
+ Asymptotically fast algorithms for the numerical multiplication
+ and division of polynomials with complex coefficients, by Arnold Scho"nhage,
+ Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.
+
+ Tapes versus Pointers, a study in implementing fast algorithms,
+ by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986.
+
+ See also http://www.loria.fr/~zimmerma/bignum
+
+
+ Future:
+
+ K==2 isn't needed in the current uses of this code and the bits specific
+ for that could be dropped.
+
+ It might be possible to avoid a small number of MPN_COPYs by using a
+ rotating temporary or two.
+
+ Multiplications of unequal sized operands can be done with this code, but
+ it needs a tighter test for identifying squaring (same sizes as well as
+ same pointers). */
+
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+
+FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = {
+ FFT_MUL_TABLE,
+ FFT_SQR_TABLE
+};
+
+
+static void mpn_mul_fft_internal
+_PROTO ((mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
+ int k, int K,
+ mp_limb_t **Ap, mp_limb_t **Bp,
+ mp_limb_t *A, mp_limb_t *B,
+ mp_size_t nprime, mp_size_t l, mp_size_t Mp, int **_fft_l,
+ mp_limb_t *T, int rec));
+
+
+/* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT.
+ sqr==0 if for a multiply, sqr==1 for a square */
+int
+#if __STDC__
+mpn_fft_best_k (mp_size_t n, int sqr)
+#else
+mpn_fft_best_k (n, sqr)
+ mp_size_t n;
+ int sqr;
+#endif
+{
+ mp_size_t t;
+ int i;
+
+ for (i = 0; mpn_fft_table[sqr][i] != 0; i++)
+ if (n < mpn_fft_table[sqr][i])
+ return i + FFT_FIRST_K;
+
+ /* treat 4*last as one further entry */
+ if (i == 0 || n < 4*mpn_fft_table[sqr][i-1])
+ return i + FFT_FIRST_K;
+ else
+ return i + FFT_FIRST_K + 1;
+}
+
+
+/* Returns smallest possible number of limbs >= pl for a fft of size 2^k.
+ FIXME: Is this simply pl rounded up to the next multiple of 2^k ? */
+
+mp_size_t
+#if __STDC__
+mpn_fft_next_size (mp_size_t pl, int k)
+#else
+mpn_fft_next_size (pl, k)
+ mp_size_t pl;
+ int k;
+#endif
+{
+ mp_size_t N, M;
+ int K;
+
+ /* if (k==0) k = mpn_fft_best_k (pl, sqr); */
+ N = pl*BITS_PER_MP_LIMB;
+ K = 1<<k;
+ if (N%K) N=(N/K+1)*K;
+ M = N/K;
+ if (M%BITS_PER_MP_LIMB) N=((M/BITS_PER_MP_LIMB)+1)*BITS_PER_MP_LIMB*K;
+ return (N/BITS_PER_MP_LIMB);
+}
+
+
+static void
+#if __STDC__
+mpn_fft_initl(int **l, int k)
+#else
+mpn_fft_initl(l, k)
+ int **l;
+ int k;
+#endif
+{
+ int i,j,K;
+
+ l[0][0] = 0;
+ for (i=1,K=2;i<=k;i++,K*=2) {
+ for (j=0;j<K/2;j++) {
+ l[i][j] = 2*l[i-1][j];
+ l[i][K/2+j] = 1+l[i][j];
+ }
+ }
+}
+
+
+/* a <- -a mod 2^(n*BITS_PER_MP_LIMB)+1 */
+static void
+#if __STDC__
+mpn_fft_neg_modF(mp_limb_t *ap, mp_size_t n)
+#else
+mpn_fft_neg_modF(ap, n)
+ mp_limb_t *ap;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t c;
+
+ c = ap[n]+2;
+ mpn_com_n (ap, ap, n);
+ ap[n]=0; mpn_incr_u(ap, c);
+}
+
+
+/* a <- a*2^e mod 2^(n*BITS_PER_MP_LIMB)+1 */
+static void
+#if __STDC__
+mpn_fft_mul_2exp_modF(mp_limb_t *ap, int e, mp_size_t n, mp_limb_t *tp)
+#else
+mpn_fft_mul_2exp_modF(ap, e, n, tp)
+ mp_limb_t *ap;
+ int e;
+ mp_size_t n;
+ mp_limb_t *tp;
+#endif
+{
+ int d, sh, i; mp_limb_t cc;
+
+ d = e%(n*BITS_PER_MP_LIMB); /* 2^e = (+/-) 2^d */
+ sh = d % BITS_PER_MP_LIMB;
+ if (sh) mpn_lshift(tp, ap, n+1, sh); /* no carry here */
+ else MPN_COPY(tp, ap, n+1);
+ d /= BITS_PER_MP_LIMB; /* now shift of d limbs to the left */
+ if (d) {
+ /* ap[d..n-1] = tp[0..n-d-1], ap[0..d-1] = -tp[n-d..n-1] */
+ /* mpn_xor would be more efficient here */
+ for (i=d-1;i>=0;i--) ap[i] = ~tp[n-d+i];
+ cc = 1-mpn_add_1(ap, ap, d, 1);
+ if (cc) cc=mpn_sub_1(ap+d, tp, n-d, 1);
+ else MPN_COPY(ap+d, tp, n-d);
+ if (cc+=mpn_sub_1(ap+d, ap+d, n-d, tp[n]))
+ ap[n]=mpn_add_1(ap, ap, n, cc);
+ else ap[n]=0;
+ }
+ else if ((ap[n]=mpn_sub_1(ap, tp, n, tp[n]))) {
+ ap[n]=mpn_add_1(ap, ap, n, 1);
+ }
+ if ((e/(n*BITS_PER_MP_LIMB))%2) mpn_fft_neg_modF(ap, n);
+}
+
+
+/* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */
+static void
+#if __STDC__
+mpn_fft_add_modF (mp_limb_t *ap, mp_limb_t *bp, int n)
+#else
+mpn_fft_add_modF (ap, bp, n)
+ mp_limb_t *ap,*bp;
+ int n;
+#endif
+{
+ mp_limb_t c;
+
+ c = ap[n] + bp[n] + mpn_add_n(ap, ap, bp, n);
+ if (c>1) c -= 1+mpn_sub_1(ap,ap,n,1);
+ ap[n]=c;
+}
+
+
+/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
+ N=n*BITS_PER_MP_LIMB
+ 2^omega is a primitive root mod 2^N+1
+ output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
+
+static void
+#if __STDC__
+mpn_fft_fft_sqr (mp_limb_t **Ap, mp_size_t K, int **ll,
+ mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
+#else
+mpn_fft_fft_sqr(Ap,K,ll,omega,n,inc,tp)
+mp_limb_t **Ap,*tp;
+mp_size_t K,omega,n,inc;
+int **ll;
+#endif
+{
+ if (K==2) {
+#ifdef ADDSUB
+ if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
+#else
+ MPN_COPY(tp, Ap[0], n+1);
+ mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
+ if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
+#endif
+ Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
+ }
+ else {
+ int j, inc2=2*inc;
+ int *lk = *ll;
+ mp_limb_t *tmp;
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+ tmp = TMP_ALLOC_LIMBS (n+1);
+ mpn_fft_fft_sqr(Ap, K/2,ll-1,2*omega,n,inc2, tp);
+ mpn_fft_fft_sqr(Ap+inc, K/2,ll-1,2*omega,n,inc2, tp);
+ /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
+ A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
+ for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc) {
+ MPN_COPY(tp, Ap[inc], n+1);
+ mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[inc], Ap[0], n);
+ mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[0], tp, n);
+ }
+ TMP_FREE(marker);
+ }
+}
+
+
+/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
+ N=n*BITS_PER_MP_LIMB
+ 2^omega is a primitive root mod 2^N+1
+ output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */
+
+static void
+#if __STDC__
+mpn_fft_fft (mp_limb_t **Ap, mp_limb_t **Bp, mp_size_t K, int **ll,
+ mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
+#else
+mpn_fft_fft(Ap,Bp,K,ll,omega,n,inc,tp)
+ mp_limb_t **Ap,**Bp,*tp;
+ mp_size_t K,omega,n,inc;
+ int **ll;
+#endif
+{
+ if (K==2) {
+#ifdef ADDSUB
+ if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
+#else
+ MPN_COPY(tp, Ap[0], n+1);
+ mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
+ if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
+#endif
+ Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
+#ifdef ADDSUB
+ if (mpn_addsub_n(Bp[0], Bp[inc], Bp[0], Bp[inc], n+1) & 1)
+#else
+ MPN_COPY(tp, Bp[0], n+1);
+ mpn_add_n(Bp[0], Bp[0], Bp[inc],n+1);
+ if (mpn_sub_n(Bp[inc], tp, Bp[inc],n+1))
+#endif
+ Bp[inc][n] = mpn_add_1(Bp[inc], Bp[inc], n, 1);
+ }
+ else {
+ int j, inc2=2*inc;
+ int *lk=*ll;
+ mp_limb_t *tmp;
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+ tmp = TMP_ALLOC_LIMBS (n+1);
+ mpn_fft_fft(Ap, Bp, K/2,ll-1,2*omega,n,inc2, tp);
+ mpn_fft_fft(Ap+inc, Bp+inc, K/2,ll-1,2*omega,n,inc2, tp);
+ /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
+ A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
+ for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc,Bp+=2*inc) {
+ MPN_COPY(tp, Ap[inc], n+1);
+ mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[inc], Ap[0], n);
+ mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+ mpn_fft_add_modF(Ap[0], tp, n);
+ MPN_COPY(tp, Bp[inc], n+1);
+ mpn_fft_mul_2exp_modF(Bp[inc], lk[1]*omega, n, tmp);
+ mpn_fft_add_modF(Bp[inc], Bp[0], n);
+ mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+ mpn_fft_add_modF(Bp[0], tp, n);
+ }
+ TMP_FREE(marker);
+ }
+}
+
+
+/* a[i] <- a[i]*b[i] mod 2^(n*BITS_PER_MP_LIMB)+1 for 0 <= i < K */
+static void
+#if __STDC__
+mpn_fft_mul_modF_K (mp_limb_t **ap, mp_limb_t **bp, mp_size_t n, int K)
+#else
+mpn_fft_mul_modF_K(ap, bp, n, K)
+ mp_limb_t **ap, **bp;
+ mp_size_t n;
+ int K;
+#endif
+{
+ int i;
+ int sqr = (ap == bp);
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+
+ if (n >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
+ int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2;
+ int **_fft_l;
+ mp_limb_t **Ap,**Bp,*A,*B,*T;
+
+ k = mpn_fft_best_k (n, sqr);
+ K2 = 1<<k;
+ maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB;
+ M2 = n*BITS_PER_MP_LIMB/K2;
+ l = n/K2;
+ Nprime2 = ((2*M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/
+ nprime2 = Nprime2/BITS_PER_MP_LIMB;
+ Mp2 = Nprime2/K2;
+
+ Ap = TMP_ALLOC_MP_PTRS (K2);
+ Bp = TMP_ALLOC_MP_PTRS (K2);
+ A = TMP_ALLOC_LIMBS (2*K2*(nprime2+1));
+ T = TMP_ALLOC_LIMBS (nprime2+1);
+ B = A + K2*(nprime2+1);
+ _fft_l = TMP_ALLOC_TYPE (k+1, int*);
+ for (i=0;i<=k;i++)
+ _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
+ mpn_fft_initl(_fft_l, k);
+
+ TRACE (printf("recurse: %dx%d limbs -> %d times %dx%d (%1.2f)\n", n,
+ n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
+
+ for (i=0;i<K;i++,ap++,bp++)
+ mpn_mul_fft_internal(*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2,
+ l, Mp2, _fft_l, T, 1);
+ }
+ else {
+ mp_limb_t *a, *b, cc, *tp, *tpn; int n2=2*n;
+ tp = TMP_ALLOC_LIMBS (n2);
+ tpn = tp+n;
+ TRACE (printf (" mpn_mul_n %d of %d limbs\n", K, n));
+ for (i=0;i<K;i++) {
+ a = *ap++; b=*bp++;
+ if (sqr)
+ mpn_sqr_n(tp, a, n);
+ else
+ mpn_mul_n(tp, b, a, n);
+ if (a[n]) cc=mpn_add_n(tpn, tpn, b, n); else cc=0;
+ if (b[n]) cc += mpn_add_n(tpn, tpn, a, n) + a[n];
+ if (cc) {
+ cc = mpn_add_1(tp, tp, n2, cc);
+ ASSERT_NOCARRY (mpn_add_1(tp, tp, n2, cc));
+ }
+ a[n] = mpn_sub_n(a, tp, tpn, n) && mpn_add_1(a, a, n, 1);
+ }
+ }
+ TMP_FREE(marker);
+}
+
+
+/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
+ output: K*A[0] K*A[K-1] ... K*A[1] */
+
+static void
+#if __STDC__
+mpn_fft_fftinv (mp_limb_t **Ap, int K, mp_size_t omega, mp_size_t n,
+ mp_limb_t *tp)
+#else
+mpn_fft_fftinv(Ap,K,omega,n,tp)
+ mp_limb_t **Ap, *tp;
+ int K;
+ mp_size_t omega, n;
+#endif
+{
+ if (K==2) {
+#ifdef ADDSUB
+ if (mpn_addsub_n(Ap[0], Ap[1], Ap[0], Ap[1], n+1) & 1)
+#else
+ MPN_COPY(tp, Ap[0], n+1);
+ mpn_add_n(Ap[0], Ap[0], Ap[1], n+1);
+ if (mpn_sub_n(Ap[1], tp, Ap[1], n+1))
+#endif
+ Ap[1][n] = mpn_add_1(Ap[1], Ap[1], n, 1);
+ }
+ else {
+ int j, K2=K/2; mp_limb_t **Bp=Ap+K2, *tmp;
+ TMP_DECL(marker);
+
+ TMP_MARK(marker);
+ tmp = TMP_ALLOC_LIMBS (n+1);
+ mpn_fft_fftinv(Ap, K2, 2*omega, n, tp);
+ mpn_fft_fftinv(Bp, K2, 2*omega, n, tp);
+ /* A[j] <- A[j] + omega^j A[j+K/2]
+ A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */
+ for (j=0;j<K2;j++,Ap++,Bp++) {
+ MPN_COPY(tp, Bp[0], n+1);
+ mpn_fft_mul_2exp_modF(Bp[0], (j+K2)*omega, n, tmp);
+ mpn_fft_add_modF(Bp[0], Ap[0], n);
+ mpn_fft_mul_2exp_modF(tp, j*omega, n, tmp);
+ mpn_fft_add_modF(Ap[0], tp, n);
+ }
+ TMP_FREE(marker);
+ }
+}
+
+
+/* A <- A/2^k mod 2^(n*BITS_PER_MP_LIMB)+1 */
+static void
+#if __STDC__
+mpn_fft_div_2exp_modF (mp_limb_t *ap, int k, mp_size_t n, mp_limb_t *tp)
+#else
+mpn_fft_div_2exp_modF(ap,k,n,tp)
+ mp_limb_t *ap,*tp;
+ int k;
+ mp_size_t n;
+#endif
+{
+ int i;
+
+ i = 2*n*BITS_PER_MP_LIMB;
+ i = (i-k) % i;
+ mpn_fft_mul_2exp_modF(ap,i,n,tp);
+ /* 1/2^k = 2^(2nL-k) mod 2^(n*BITS_PER_MP_LIMB)+1 */
+ /* normalize so that A < 2^(n*BITS_PER_MP_LIMB)+1 */
+ if (ap[n]==1) {
+ for (i=0;i<n && ap[i]==0;i++);
+ if (i<n) {
+ ap[n]=0;
+ mpn_sub_1(ap, ap, n, 1);
+ }
+ }
+}
+
+
+/* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n<=an<=3*n */
+static void
+#if __STDC__
+mpn_fft_norm_modF(mp_limb_t *rp, mp_limb_t *ap, mp_size_t n, mp_size_t an)
+#else
+mpn_fft_norm_modF(rp, ap, n, an)
+ mp_limb_t *rp;
+ mp_limb_t *ap;
+ mp_size_t n;
+ mp_size_t an;
+#endif
+{
+ mp_size_t l;
+
+ if (an>2*n) {
+ l = n;
+ rp[n] = mpn_add_1(rp+an-2*n, ap+an-2*n, 3*n-an,
+ mpn_add_n(rp,ap,ap+2*n,an-2*n));
+ }
+ else {
+ l = an-n;
+ MPN_COPY(rp, ap, n);
+ rp[n]=0;
+ }
+ if (mpn_sub_n(rp,rp,ap+n,l)) {
+ if (mpn_sub_1(rp+l,rp+l,n+1-l,1))
+ rp[n]=mpn_add_1(rp,rp,n,1);
+ }
+}
+
+
+static void
+#if __STDC__
+mpn_mul_fft_internal(mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
+ int k, int K,
+ mp_limb_t **Ap, mp_limb_t **Bp,
+ mp_limb_t *A, mp_limb_t *B,
+ mp_size_t nprime, mp_size_t l, mp_size_t Mp,
+ int **_fft_l,
+ mp_limb_t *T, int rec)
+#else
+mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,rec)
+ mp_limb_t *op;
+ mp_srcptr n, m;
+ mp_limb_t **Ap,**Bp,*A,*B,*T;
+ mp_size_t pl,nprime;
+ int **_fft_l;
+ int k,K,l,Mp,rec;
+#endif
+{
+ int i, sqr, pla, lo, sh, j;
+ mp_limb_t *p;
+
+ sqr = (n==m);
+
+ TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n",
+ pl,k,K,nprime,l,Mp,rec,sqr));
+
+ /* decomposition of inputs into arrays Ap[i] and Bp[i] */
+ if (rec) for (i=0;i<K;i++) {
+ Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
+ /* store the next M bits of n into A[i] */
+ /* supposes that M is a multiple of BITS_PER_MP_LIMB */
+ MPN_COPY(Ap[i], n, l); n+=l; MPN_ZERO(Ap[i]+l, nprime+1-l);
+ /* set most significant bits of n and m (important in recursive calls) */
+ if (i==K-1) Ap[i][l]=n[0];
+ mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
+ if (!sqr) {
+ MPN_COPY(Bp[i], m, l); m+=l; MPN_ZERO(Bp[i]+l, nprime+1-l);
+ if (i==K-1) Bp[i][l]=m[0];
+ mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
+ }
+ }
+
+ /* direct fft's */
+ if (sqr) mpn_fft_fft_sqr(Ap,K,_fft_l+k,2*Mp,nprime,1, T);
+ else mpn_fft_fft(Ap,Bp,K,_fft_l+k,2*Mp,nprime,1, T);
+
+ /* term to term multiplications */
+ mpn_fft_mul_modF_K(Ap, (sqr) ? Ap : Bp, nprime, K);
+
+ /* inverse fft's */
+ mpn_fft_fftinv(Ap, K, 2*Mp, nprime, T);
+
+ /* division of terms after inverse fft */
+ for (i=0;i<K;i++) mpn_fft_div_2exp_modF(Ap[i],k+((K-i)%K)*Mp,nprime, T);
+
+ /* addition of terms in result p */
+ MPN_ZERO(T,nprime+1);
+ pla = l*(K-1)+nprime+1; /* number of required limbs for p */
+ p = B; /* B has K*(n'+1) limbs, which is >= pla, i.e. enough */
+ MPN_ZERO(p, pla);
+ sqr=0; /* will accumulate the (signed) carry at p[pla] */
+ for (i=K-1,lo=l*i+nprime,sh=l*i;i>=0;i--,lo-=l,sh-=l) {
+ mp_ptr n = p+sh;
+ j = (K-i)%K;
+ if (mpn_add_n(n,n,Ap[j],nprime+1))
+ sqr += mpn_add_1(n+nprime+1,n+nprime+1,pla-sh-nprime-1,1);
+ T[2*l]=i+1; /* T = (i+1)*2^(2*M) */
+ if (mpn_cmp(Ap[j],T,nprime+1)>0) { /* subtract 2^N'+1 */
+ sqr -= mpn_sub_1(n,n,pla-sh,1);
+ sqr -= mpn_sub_1(p+lo,p+lo,pla-lo,1);
+ }
+ }
+ if (sqr==-1) {
+ if ((sqr=mpn_add_1(p+pla-pl,p+pla-pl,pl,1))) {
+ /* p[pla-pl]...p[pla-1] are all zero */
+ mpn_sub_1(p+pla-pl-1,p+pla-pl-1,pl+1,1);
+ mpn_sub_1(p+pla-1,p+pla-1,1,1);
+ }
+ }
+ else if (sqr==1) {
+ if (pla>=2*pl)
+ while ((sqr=mpn_add_1(p+pla-2*pl,p+pla-2*pl,2*pl,sqr)));
+ else {
+ sqr = mpn_sub_1(p+pla-pl,p+pla-pl,pl,sqr);
+ ASSERT (sqr == 0);
+ }
+ }
+ else
+ ASSERT (sqr == 0);
+
+ /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
+ < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
+ < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
+ mpn_fft_norm_modF(op,p,pl,pla);
+}
+
+
+/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB
+ n and m have respectively nl and ml limbs
+ op must have space for pl+1 limbs
+ One must have pl = mpn_fft_next_size(pl, k).
+*/
+
+void
+#if __STDC__
+mpn_mul_fft (mp_ptr op, mp_size_t pl,
+ mp_srcptr n, mp_size_t nl,
+ mp_srcptr m, mp_size_t ml,
+ int k)
+#else
+mpn_mul_fft (op, pl, n, nl, m, ml, k)
+ mp_ptr op;
+ mp_size_t pl;
+ mp_srcptr n;
+ mp_size_t nl;
+ mp_srcptr m;
+ mp_size_t ml;
+ int k;
+#endif
+{
+ int K,maxLK,i,j;
+ mp_size_t N,Nprime,nprime,M,Mp,l;
+ mp_limb_t **Ap,**Bp,*A,*T,*B;
+ int **_fft_l;
+ int sqr = (n==m && nl==ml);
+ TMP_DECL(marker);
+
+ TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n",
+ pl, nl, ml, k));
+ ASSERT_ALWAYS (mpn_fft_next_size(pl, k) == pl);
+
+ TMP_MARK(marker);
+ N = pl*BITS_PER_MP_LIMB;
+ _fft_l = TMP_ALLOC_TYPE (k+1, int*);
+ for (i=0;i<=k;i++)
+ _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
+ mpn_fft_initl(_fft_l, k);
+ K = 1<<k;
+ M = N/K; /* N = 2^k M */
+ l = M/BITS_PER_MP_LIMB;
+ maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB;
+
+ Nprime = ((2*M+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */
+ nprime = Nprime/BITS_PER_MP_LIMB;
+ TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n",
+ N, K, M, l, maxLK, Nprime, nprime));
+ if (nprime >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
+ maxLK = (1<<mpn_fft_best_k(nprime,n==m))*BITS_PER_MP_LIMB;
+ if (Nprime % maxLK) {
+ Nprime=((Nprime/maxLK)+1)*maxLK;
+ nprime = Nprime/BITS_PER_MP_LIMB;
+ }
+ TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime));
+ }
+
+ T = TMP_ALLOC_LIMBS (nprime+1);
+ Mp = Nprime/K;
+
+ TRACE (printf("%dx%d limbs -> %d times %dx%d limbs (%1.2f)\n",
+ pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K);
+ printf(" temp space %ld\n", 2*K*(nprime+1)));
+
+ A = _MP_ALLOCATE_FUNC_LIMBS (2*K*(nprime+1));
+ B = A+K*(nprime+1);
+ Ap = TMP_ALLOC_MP_PTRS (K);
+ Bp = TMP_ALLOC_MP_PTRS (K);
+ /* special decomposition for main call */
+ for (i=0;i<K;i++) {
+ Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
+ /* store the next M bits of n into A[i] */
+ /* supposes that M is a multiple of BITS_PER_MP_LIMB */
+ if (nl>0) {
+ j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */
+ MPN_COPY(Ap[i], n, j); n+=l; MPN_ZERO(Ap[i]+j, nprime+1-j);
+ mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
+ }
+ else MPN_ZERO(Ap[i], nprime+1);
+ nl -= l;
+ if (n!=m) {
+ if (ml>0) {
+ j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */
+ MPN_COPY(Bp[i], m, j); m+=l; MPN_ZERO(Bp[i]+j, nprime+1-j);
+ mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
+ }
+ else MPN_ZERO(Bp[i], nprime+1);
+ }
+ ml -= l;
+ }
+ mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,0);
+ TMP_FREE(marker);
+ _MP_FREE_FUNC_LIMBS (A, 2*K*(nprime+1));
+}
+
+
+#if WANT_ASSERT
+static int
+#if __STDC__
+mpn_zero_p (mp_ptr p, mp_size_t n)
+#else
+ mpn_zero_p (p, n)
+ mp_ptr p;
+ mp_size_t n;
+#endif
+{
+ mp_size_t i;
+
+ for (i = 0; i < n; i++)
+ {
+ if (p[i] != 0)
+ return 0;
+ }
+
+ return 1;
+}
+#endif
+
+
+/* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}.
+
+ FIXME: Duplicating the result like this is wasteful, do something better
+ perhaps at the norm_modF stage above. */
+
+void
+#if __STDC__
+mpn_mul_fft_full (mp_ptr op,
+ mp_srcptr n, mp_size_t nl,
+ mp_srcptr m, mp_size_t ml)
+#else
+mpn_mul_fft_full (op, n, nl, m, ml)
+ mp_ptr op;
+ mp_srcptr n;
+ mp_size_t nl;
+ mp_srcptr m;
+ mp_size_t ml;
+#endif
+{
+ mp_ptr pad_op;
+ mp_size_t pl;
+ int k;
+ int sqr = (n==m && nl==ml);
+
+ k = mpn_fft_best_k (nl+ml, sqr);
+ pl = mpn_fft_next_size (nl+ml, k);
+
+ TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n",
+ nl, ml, pl, k));
+
+ pad_op = _MP_ALLOCATE_FUNC_LIMBS (pl+1);
+ mpn_mul_fft (pad_op, pl, n, nl, m, ml, k);
+
+ ASSERT (mpn_zero_p (pad_op+nl+ml, pl+1-(nl+ml)));
+ MPN_COPY (op, pad_op, nl+ml);
+
+ _MP_FREE_FUNC_LIMBS (pad_op, pl+1);
+}
diff --git a/rts/gmp/mpn/generic/mul_n.c b/rts/gmp/mpn/generic/mul_n.c
new file mode 100644
index 0000000000..b7563be2d3
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_n.c
@@ -0,0 +1,1343 @@
+/* mpn_mul_n and helper function -- Multiply/square natural numbers.
+
+ THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
+ ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
+ THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
+ THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
+ 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
+#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1)
+
+#if !defined (__alpha) && !defined (__mips)
+/* For all other machines, we want to call mpn functions for the compund
+ operations instead of open-coding them. */
+#define USE_MORE_MPN
+#endif
+
+/*== Function declarations =================================================*/
+
+static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_srcptr, mp_srcptr, mp_srcptr,
+ mp_size_t, mp_size_t));
+static void interpolate3 _PROTO ((mp_srcptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_srcptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_size_t, mp_size_t));
+static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+
+
+/*-- mpn_kara_mul_n ---------------------------------------------------------------*/
+
+/* Multiplies using 3 half-sized mults and so on recursively.
+ * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
+ * No overlap of p[...] with a[...] or b[...].
+ * ws is workspace.
+ */
+
+void
+#if __STDC__
+mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
+#else
+mpn_kara_mul_n(p, a, b, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_srcptr b;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t i, sign, w, w0, w1;
+ mp_size_t n2;
+ mp_srcptr x, y;
+
+ n2 = n >> 1;
+ ASSERT (n2 > 0);
+
+ if (n & 1)
+ {
+ /* Odd length. */
+ mp_size_t n1, n3, nm1;
+
+ n3 = n - n2;
+
+ sign = 0;
+ w = a[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p, a, a + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n3;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n3;
+ }
+ mpn_sub_n (p, x, y, n2);
+ }
+ p[n2] = w;
+
+ w = b[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p + n3, b, b + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = b[i];
+ w1 = b[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = b + n3;
+ y = b;
+ sign ^= 1;
+ }
+ else
+ {
+ x = b;
+ y = b + n3;
+ }
+ mpn_sub_n (p + n3, x, y, n2);
+ }
+ p[n] = w;
+
+ n1 = n + 1;
+ if (n2 < KARATSUBA_MUL_THRESHOLD)
+ {
+ if (n3 < KARATSUBA_MUL_THRESHOLD)
+ {
+ mpn_mul_basecase (ws, p, n3, p + n3, n3);
+ mpn_mul_basecase (p, a, n3, b, n3);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
+ mpn_kara_mul_n (p, a, b, n3, ws + n1);
+ }
+ mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
+ mpn_kara_mul_n (p, a, b, n3, ws + n1);
+ mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
+ }
+
+ if (sign)
+ mpn_add_n (ws, p, ws, n1);
+ else
+ mpn_sub_n (ws, p, ws, n1);
+
+ nm1 = n - 1;
+ if (mpn_add_n (ws, p + n1, ws, nm1))
+ {
+ mp_limb_t x = ws[nm1] + 1;
+ ws[nm1] = x;
+ if (x == 0)
+ ++ws[n];
+ }
+ if (mpn_add_n (p + n3, p + n3, ws, n1))
+ {
+ mp_limb_t x;
+ i = n1 + n3;
+ do
+ {
+ x = p[i] + 1;
+ p[i] = x;
+ ++i;
+ } while (x == 0);
+ }
+ }
+ else
+ {
+ /* Even length. */
+ mp_limb_t t;
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ sign = 0;
+ if (w0 < w1)
+ {
+ x = a + n2;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n2;
+ }
+ mpn_sub_n (p, x, y, n2);
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = b[i];
+ w1 = b[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = b + n2;
+ y = b;
+ sign ^= 1;
+ }
+ else
+ {
+ x = b;
+ y = b + n2;
+ }
+ mpn_sub_n (p + n2, x, y, n2);
+
+ /* Pointwise products. */
+ if (n2 < KARATSUBA_MUL_THRESHOLD)
+ {
+ mpn_mul_basecase (ws, p, n2, p + n2, n2);
+ mpn_mul_basecase (p, a, n2, b, n2);
+ mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
+ }
+ else
+ {
+ mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
+ mpn_kara_mul_n (p, a, b, n2, ws + n);
+ mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
+ }
+
+ /* Interpolate. */
+ if (sign)
+ w = mpn_add_n (ws, p, ws, n);
+ else
+ w = -mpn_sub_n (ws, p, ws, n);
+ w += mpn_add_n (ws, p + n, ws, n);
+ w += mpn_add_n (p + n2, p + n2, ws, n);
+ /* TO DO: could put "if (w) { ... }" here.
+ * Less work but badly predicted branch.
+ * No measurable difference in speed on Alpha.
+ */
+ i = n + n2;
+ t = p[i] + w;
+ p[i] = t;
+ if (t < w)
+ {
+ do
+ {
+ ++i;
+ w = p[i] + 1;
+ p[i] = w;
+ }
+ while (w == 0);
+ }
+ }
+}
+
+void
+#if __STDC__
+mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
+#else
+mpn_kara_sqr_n (p, a, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t i, sign, w, w0, w1;
+ mp_size_t n2;
+ mp_srcptr x, y;
+
+ n2 = n >> 1;
+ ASSERT (n2 > 0);
+
+ if (n & 1)
+ {
+ /* Odd length. */
+ mp_size_t n1, n3, nm1;
+
+ n3 = n - n2;
+
+ sign = 0;
+ w = a[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p, a, a + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n3;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n3;
+ }
+ mpn_sub_n (p, x, y, n2);
+ }
+ p[n2] = w;
+
+ w = a[n2];
+ if (w != 0)
+ w -= mpn_sub_n (p + n3, a, a + n3, n2);
+ else
+ {
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n3+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n3;
+ y = a;
+ sign ^= 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n3;
+ }
+ mpn_sub_n (p + n3, x, y, n2);
+ }
+ p[n] = w;
+
+ n1 = n + 1;
+ if (n2 < KARATSUBA_SQR_THRESHOLD)
+ {
+ if (n3 < KARATSUBA_SQR_THRESHOLD)
+ {
+ mpn_sqr_basecase (ws, p, n3);
+ mpn_sqr_basecase (p, a, n3);
+ }
+ else
+ {
+ mpn_kara_sqr_n (ws, p, n3, ws + n1);
+ mpn_kara_sqr_n (p, a, n3, ws + n1);
+ }
+ mpn_sqr_basecase (p + n1, a + n3, n2);
+ }
+ else
+ {
+ mpn_kara_sqr_n (ws, p, n3, ws + n1);
+ mpn_kara_sqr_n (p, a, n3, ws + n1);
+ mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);
+ }
+
+ if (sign)
+ mpn_add_n (ws, p, ws, n1);
+ else
+ mpn_sub_n (ws, p, ws, n1);
+
+ nm1 = n - 1;
+ if (mpn_add_n (ws, p + n1, ws, nm1))
+ {
+ mp_limb_t x = ws[nm1] + 1;
+ ws[nm1] = x;
+ if (x == 0)
+ ++ws[n];
+ }
+ if (mpn_add_n (p + n3, p + n3, ws, n1))
+ {
+ mp_limb_t x;
+ i = n1 + n3;
+ do
+ {
+ x = p[i] + 1;
+ p[i] = x;
+ ++i;
+ } while (x == 0);
+ }
+ }
+ else
+ {
+ /* Even length. */
+ mp_limb_t t;
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ sign = 0;
+ if (w0 < w1)
+ {
+ x = a + n2;
+ y = a;
+ sign = 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n2;
+ }
+ mpn_sub_n (p, x, y, n2);
+
+ i = n2;
+ do
+ {
+ --i;
+ w0 = a[i];
+ w1 = a[n2+i];
+ }
+ while (w0 == w1 && i != 0);
+ if (w0 < w1)
+ {
+ x = a + n2;
+ y = a;
+ sign ^= 1;
+ }
+ else
+ {
+ x = a;
+ y = a + n2;
+ }
+ mpn_sub_n (p + n2, x, y, n2);
+
+ /* Pointwise products. */
+ if (n2 < KARATSUBA_SQR_THRESHOLD)
+ {
+ mpn_sqr_basecase (ws, p, n2);
+ mpn_sqr_basecase (p, a, n2);
+ mpn_sqr_basecase (p + n, a + n2, n2);
+ }
+ else
+ {
+ mpn_kara_sqr_n (ws, p, n2, ws + n);
+ mpn_kara_sqr_n (p, a, n2, ws + n);
+ mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
+ }
+
+ /* Interpolate. */
+ if (sign)
+ w = mpn_add_n (ws, p, ws, n);
+ else
+ w = -mpn_sub_n (ws, p, ws, n);
+ w += mpn_add_n (ws, p + n, ws, n);
+ w += mpn_add_n (p + n2, p + n2, ws, n);
+ /* TO DO: could put "if (w) { ... }" here.
+ * Less work but badly predicted branch.
+ * No measurable difference in speed on Alpha.
+ */
+ i = n + n2;
+ t = p[i] + w;
+ p[i] = t;
+ if (t < w)
+ {
+ do
+ {
+ ++i;
+ w = p[i] + 1;
+ p[i] = w;
+ }
+ while (w == 0);
+ }
+ }
+}
+
+/*-- add2Times -------------------------------------------------------------*/
+
+/* z[] = x[] + 2 * y[]
+ Note that z and x might point to the same vectors. */
+/* Returns the carry out of the top limb (in the range 0..2). */
+#ifdef USE_MORE_MPN
+static inline mp_limb_t
+#if __STDC__
+add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
+#else
+add2Times (z, x, y, n)
+ mp_ptr z;
+ mp_srcptr x;
+ mp_srcptr y;
+ mp_size_t n;
+#endif
+{
+ mp_ptr t;
+ mp_limb_t c;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ /* mpn-based variant: form 2*y in a temporary, then add.  c accumulates
+    both the shift-out bit and the addition carry. */
+ t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ c = mpn_lshift (t, y, n, 1);
+ c += mpn_add_n (z, x, t, n);
+ TMP_FREE (marker);
+ return c;
+}
+#else
+
+/* Limb-at-a-time variant: same contract as above, but with explicit
+   carry propagation and no temporary allocation. */
+static mp_limb_t
+#if __STDC__
+add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
+#else
+add2Times (z, x, y, n)
+ mp_ptr z;
+ mp_srcptr x;
+ mp_srcptr y;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t c, v, w;
+
+ ASSERT (n > 0);
+ /* First limb: carry starts as the bit shifted out of y[0]. */
+ v = *x; w = *y;
+ c = w >> (BITS_PER_MP_LIMB - 1);
+ w <<= 1;
+ v += w;
+ c += v < w;
+ *z = v;
+ ++x; ++y; ++z;
+ while (--n)
+ {
+ /* Remaining limbs: fold in the incoming carry, the bit shifted out
+    of *y, and the overflow from each addition (c stays <= 2). */
+ v = *x;
+ w = *y;
+ v += c;
+ c = v < c;
+ c += w >> (BITS_PER_MP_LIMB - 1);
+ w <<= 1;
+ v += w;
+ c += v < w;
+ *z = v;
+ ++x; ++y; ++z;
+ }
+
+ return c;
+}
+#endif
+
+/*-- evaluate3 -------------------------------------------------------------*/
+
+/* Evaluates:
+ * ph := 4*A+2*B+C
+ * p1 := A+B+C
+ * p2 := A+2*B+4*C
+ * where:
+ * ph[], p1[], p2[], A[] and B[] all have length len,
+ * C[] has length len2 with len-len2 = 0, 1 or 2.
+ * Returns top words (overflow) at pth, pt1 and pt2 respectively.
+ */
+/* These are the Toom-3 evaluations of the polynomial A*x^2 + B*x + C at
+   the points x = 2, x = 1 and x = 1/2 (scaled by 4). */
+#ifdef USE_MORE_MPN
+static void
+#if __STDC__
+evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
+ mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2)
+#else
+evaluate3 (ph, p1, p2, pth, pt1, pt2,
+ A, B, C, len, len2)
+ mp_ptr ph;
+ mp_ptr p1;
+ mp_ptr p2;
+ mp_ptr pth;
+ mp_ptr pt1;
+ mp_ptr pt2;
+ mp_srcptr A;
+ mp_srcptr B;
+ mp_srcptr C;
+ mp_size_t len;
+ mp_size_t len2;
+#endif
+{
+ mp_limb_t c, d, e;
+
+ ASSERT (len - len2 <= 2);
+
+ /* p1 temporarily holds 2*B; e is its shift-out, reused for ph and p2. */
+ e = mpn_lshift (p1, B, len, 1);
+
+ /* ph := 4*A + 2*B + C, top word in c. */
+ c = mpn_lshift (ph, A, len, 2);
+ c += e + mpn_add_n (ph, ph, p1, len);
+ d = mpn_add_n (ph, ph, C, len2);
+ if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
+ ASSERT (c < 7);
+ *pth = c;
+
+ /* p2 := 4*C + 2*B + A, top word in c. */
+ c = mpn_lshift (p2, C, len2, 2);
+#if 1
+ if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; }
+ c += e + mpn_add_n (p2, p2, p1, len);
+#else
+ d = mpn_add_n (p2, p2, p1, len2);
+ c += d;
+ if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
+ c += e;
+#endif
+ c += mpn_add_n (p2, p2, A, len);
+ ASSERT (c < 7);
+ *pt2 = c;
+
+ /* p1 := A + B + C (overwrites the 2*B scratch), top word in c. */
+ c = mpn_add_n (p1, A, B, len);
+ d = mpn_add_n (p1, p1, C, len2);
+ if (len2 == len) c += d;
+ else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
+ ASSERT (c < 3);
+ *pt1 = c;
+
+}
+
+#else
+
+/* Limb-at-a-time variant: computes all three evaluations in a single
+   pass, tracking each top word (th, t1, t2) by hand. */
+static void
+#if __STDC__
+evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
+ mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
+#else
+evaluate3 (ph, p1, p2, pth, pt1, pt2,
+ A, B, C, l, ls)
+ mp_ptr ph;
+ mp_ptr p1;
+ mp_ptr p2;
+ mp_ptr pth;
+ mp_ptr pt1;
+ mp_ptr pt2;
+ mp_srcptr A;
+ mp_srcptr B;
+ mp_srcptr C;
+ mp_size_t l;
+ mp_size_t ls;
+#endif
+{
+ mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
+
+ ASSERT (l - ls <= 2);
+
+ th = t1 = t2 = 0;
+ for (i = 0; i < l; ++i)
+ {
+ a = *A;
+ b = *B;
+ c = i < ls ? *C : 0;
+
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ t = a << 2;
+ vh = th + t;
+ th = vh < t;
+ th += a >> (BITS_PER_MP_LIMB - 2);
+ t = b << 1;
+ vh += t;
+ th += vh < t;
+ th += b >> (BITS_PER_MP_LIMB - 1);
+ vh += c;
+ th += vh < c;
+#else
+ /* vh = 4*a + 2*b + c for this limb; th carries limb overflow plus
+    the bits shifted out of a and b. */
+ vh = th + c;
+ th = vh < c;
+ t = b << 1;
+ vh += t;
+ th += vh < t;
+ th += b >> (BITS_PER_MP_LIMB - 1);
+ t = a << 2;
+ vh += t;
+ th += vh < t;
+ th += a >> (BITS_PER_MP_LIMB - 2);
+#endif
+
+ /* v1 = a + b + c. */
+ v1 = t1 + a;
+ t1 = v1 < a;
+ v1 += b;
+ t1 += v1 < b;
+ v1 += c;
+ t1 += v1 < c;
+
+ /* v2 = a + 2*b + 4*c. */
+ v2 = t2 + a;
+ t2 = v2 < a;
+ t = b << 1;
+ v2 += t;
+ t2 += v2 < t;
+ t2 += b >> (BITS_PER_MP_LIMB - 1);
+ t = c << 2;
+ v2 += t;
+ t2 += v2 < t;
+ t2 += c >> (BITS_PER_MP_LIMB - 2);
+
+ *ph = vh;
+ *p1 = v1;
+ *p2 = v2;
+
+ ++A; ++B; ++C;
+ ++ph; ++p1; ++p2;
+ }
+
+ ASSERT (th < 7);
+ ASSERT (t1 < 3);
+ ASSERT (t2 < 7);
+
+ *pth = th;
+ *pt1 = t1;
+ *pt2 = t2;
+}
+#endif
+
+
+/*-- interpolate3 ----------------------------------------------------------*/
+
+/* Interpolates B, C, D (in-place) from:
+ * 16*A+8*B+4*C+2*D+E
+ * A+B+C+D+E
+ * A+2*B+4*C+8*D+16*E
+ * where:
+ * A[], B[], C[] and D[] all have length l,
+ * E[] has length ls with l-ls = 0, 2 or 4.
+ *
+ * Reads top words (from earlier overflow) from ptb, ptc and ptd,
+ * and returns new top words there.
+ */
+
+#ifdef USE_MORE_MPN
+static void
+#if __STDC__
+interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
+ mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2)
+#else
+interpolate3 (A, B, C, D, E,
+ ptb, ptc, ptd, len, len2)
+ mp_srcptr A;
+ mp_ptr B;
+ mp_ptr C;
+ mp_ptr D;
+ mp_srcptr E;
+ mp_ptr ptb;
+ mp_ptr ptc;
+ mp_ptr ptd;
+ mp_size_t len;
+ mp_size_t len2;
+#endif
+{
+ mp_ptr ws;
+ mp_limb_t t, tb,tc,td;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
+
+ /* Let x1, x2, x3 be the values to interpolate. We have:
+ * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
+ * c = a + x1 + x2 + x3 + e
+ * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
+ */
+
+ ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
+
+ /* tb/tc/td are the (signed, two's-complement) top words of b, c, d. */
+ tb = *ptb; tc = *ptc; td = *ptd;
+
+
+ /* b := b - 16*a - e
+ * c := c - a - e
+ * d := d - a - 16*e
+ */
+
+ t = mpn_lshift (ws, A, len, 4);
+ tb -= t + mpn_sub_n (B, B, ws, len);
+ t = mpn_sub_n (B, B, E, len2);
+ if (len2 == len) tb -= t;
+ else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
+
+ tc -= mpn_sub_n (C, C, A, len);
+ t = mpn_sub_n (C, C, E, len2);
+ if (len2 == len) tc -= t;
+ else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
+
+ t = mpn_lshift (ws, E, len2, 4);
+ t += mpn_add_n (ws, ws, A, len2);
+#if 1
+ if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
+ td -= t + mpn_sub_n (D, D, ws, len);
+#else
+ t += mpn_sub_n (D, D, ws, len2);
+ if (len2 != len) {
+ t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
+ t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
+ } /* end if/else */
+ td -= t;
+#endif
+
+
+ /* b, d := b + d, b - d */
+
+#ifdef HAVE_MPN_ADD_SUB_N
+ /* #error TO DO ... */
+#else
+ t = tb + td + mpn_add_n (ws, B, D, len);
+ td = tb - td - mpn_sub_n (D, B, D, len);
+ tb = t;
+ MPN_COPY (B, ws, len);
+#endif
+
+ /* b := b-8*c */
+ t = 8 * tc + mpn_lshift (ws, C, len, 3);
+ tb -= t + mpn_sub_n (B, B, ws, len);
+
+ /* c := 2*c - b */
+ tc = 2 * tc + mpn_lshift (C, C, len, 1);
+ tc -= tb + mpn_sub_n (C, C, B, len);
+
+ /* d := d/3 */
+ /* mpn_divexact_by3 leaves the top-word correction, fixed up via the
+    modular inverse of 3. */
+ td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3;
+
+ /* b, d := b + d, b - d */
+#ifdef HAVE_MPN_ADD_SUB_N
+ /* #error TO DO ... */
+#else
+ t = tb + td + mpn_add_n (ws, B, D, len);
+ td = tb - td - mpn_sub_n (D, B, D, len);
+ tb = t;
+ MPN_COPY (B, ws, len);
+#endif
+
+ /* Now:
+ * b = 4*x1
+ * c = 2*x2
+ * d = 4*x3
+ */
+
+ /* Divide out the scale factors; the shifted-out low bits must be zero. */
+ ASSERT(!(*B & 3));
+ mpn_rshift (B, B, len, 2);
+ B[len-1] |= tb<<(BITS_PER_MP_LIMB-2);
+ ASSERT((long)tb >= 0);
+ tb >>= 2;
+
+ ASSERT(!(*C & 1));
+ mpn_rshift (C, C, len, 1);
+ C[len-1] |= tc<<(BITS_PER_MP_LIMB-1);
+ ASSERT((long)tc >= 0);
+ tc >>= 1;
+
+ ASSERT(!(*D & 3));
+ mpn_rshift (D, D, len, 2);
+ D[len-1] |= td<<(BITS_PER_MP_LIMB-2);
+ ASSERT((long)td >= 0);
+ td >>= 2;
+
+#if WANT_ASSERT
+ ASSERT (tb < 2);
+ if (len == len2)
+ {
+ ASSERT (tc < 3);
+ ASSERT (td < 2);
+ }
+ else
+ {
+ ASSERT (tc < 2);
+ ASSERT (!td);
+ }
+#endif
+
+ *ptb = tb;
+ *ptc = tc;
+ *ptd = td;
+
+ TMP_FREE (marker);
+}
+
+#else
+
+/* Limb-at-a-time variant of the same interpolation.  sb/sc/sd hold the
+   running borrows; ob/oc/od buffer the previous limb so the final
+   right-shifts can be folded into the single pass. */
+static void
+#if __STDC__
+interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
+ mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
+#else
+interpolate3 (A, B, C, D, E,
+ ptb, ptc, ptd, l, ls)
+ mp_srcptr A;
+ mp_ptr B;
+ mp_ptr C;
+ mp_ptr D;
+ mp_srcptr E;
+ mp_ptr ptb;
+ mp_ptr ptc;
+ mp_ptr ptd;
+ mp_size_t l;
+ mp_size_t ls;
+#endif
+{
+ mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
+ const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
+
+#if WANT_ASSERT
+ t = l - ls;
+ ASSERT (t == 0 || t == 2 || t == 4);
+#endif
+
+ sb = sc = sd = 0;
+ for (i = 0; i < l; ++i)
+ {
+ mp_limb_t tb, tc, td, tt;
+
+ a = *A;
+ b = *B;
+ c = *C;
+ d = *D;
+ e = i < ls ? *E : 0;
+
+ /* Let x1, x2, x3 be the values to interpolate. We have:
+ * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
+ * c = a + x1 + x2 + x3 + e
+ * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
+ */
+
+ /* b := b - 16*a - e
+ * c := c - a - e
+ * d := d - a - 16*e
+ */
+ t = a << 4;
+ tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
+ b -= t;
+ tb -= b < e;
+ b -= e;
+ tc = -(c < a);
+ c -= a;
+ tc -= c < e;
+ c -= e;
+ td = -(d < a);
+ d -= a;
+ t = e << 4;
+ td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
+ d -= t;
+
+ /* b, d := b + d, b - d */
+ t = b + d;
+ tt = tb + td + (t < b);
+ td = tb - td - (b < d);
+ d = b - d;
+ b = t;
+ tb = tt;
+
+ /* b := b-8*c */
+ t = c << 3;
+ tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
+ b -= t;
+
+ /* c := 2*c - b */
+ t = c << 1;
+ tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
+ c = t - b;
+
+ /* d := d/3 */
+ d *= INVERSE_3;
+ td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
+ td *= INVERSE_3;
+
+ /* b, d := b + d, b - d */
+ t = b + d;
+ tt = tb + td + (t < b);
+ td = tb - td - (b < d);
+ d = b - d;
+ b = t;
+ tb = tt;
+
+ /* Now:
+ * b = 4*x1
+ * c = 2*x2
+ * d = 4*x3
+ */
+
+ /* sb has period 2. */
+ b += sb;
+ tb += b < sb;
+ sb &= maskOffHalf;
+ sb |= sb >> (BITS_PER_MP_LIMB >> 1);
+ sb += tb;
+
+ /* sc has period 1. */
+ c += sc;
+ tc += c < sc;
+ /* TO DO: choose one of the following alternatives. */
+#if 1
+ sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1));
+ sc += tc;
+#else
+ sc = tc - ((long)sc < 0L);
+#endif
+
+ /* sd has period 2. */
+ d += sd;
+ td += d < sd;
+ sd &= maskOffHalf;
+ sd |= sd >> (BITS_PER_MP_LIMB >> 1);
+ sd += td;
+
+ /* Emit the previous limb, combined with the bits shifted down from
+    this one (dividing b, d by 4 and c by 2 on the fly). */
+ if (i != 0)
+ {
+ B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
+ C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
+ D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
+ }
+ ob = b >> 2;
+ oc = c >> 1;
+ od = d >> 2;
+
+ ++A; ++B; ++C; ++D; ++E;
+ }
+
+ /* Handle top words. */
+ b = *ptb;
+ c = *ptc;
+ d = *ptd;
+
+ t = b + d;
+ d = b - d;
+ b = t;
+ b -= c << 3;
+ c = (c << 1) - b;
+ d *= INVERSE_3;
+ t = b + d;
+ d = b - d;
+ b = t;
+
+ b += sb;
+ c += sc;
+ d += sd;
+
+ B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
+ C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
+ D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
+
+ b >>= 2;
+ c >>= 1;
+ d >>= 2;
+
+#if WANT_ASSERT
+ ASSERT (b < 2);
+ if (l == ls)
+ {
+ ASSERT (c < 3);
+ ASSERT (d < 2);
+ }
+ else
+ {
+ ASSERT (c < 2);
+ ASSERT (!d);
+ }
+#endif
+
+ *ptb = b;
+ *ptc = c;
+ *ptd = d;
+}
+#endif
+
+
+/*-- mpn_toom3_mul_n --------------------------------------------------------------*/
+
+/* Multiplies using 5 mults of one third size and so on recursively.
+ * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
+ * No overlap of p[...] with a[...] or b[...].
+ * ws is workspace.
+ */
+
+/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the
+ * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
+ * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false.
+ */
+
+/* Recursion helper: pick basecase, Karatsuba or Toom-3 by size. */
+#define TOOM3_MUL_REC(p, a, b, n, ws) \
+ do { \
+ if (n < KARATSUBA_MUL_THRESHOLD) \
+ mpn_mul_basecase (p, a, n, b, n); \
+ else if (n < TOOM3_MUL_THRESHOLD) \
+ mpn_kara_mul_n (p, a, b, n, ws); \
+ else \
+ mpn_toom3_mul_n (p, a, b, n, ws); \
+ } while (0)
+
+void
+#if __STDC__
+mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
+#else
+mpn_toom3_mul_n (p, a, b, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_srcptr b;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ /* cX/dX: evaluation top words for a and b; tX: product top words. */
+ mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
+ mp_limb_t *A,*B,*C,*D,*E, *W;
+ mp_size_t l,l2,l3,l4,l5,ls;
+
+ /* Break n words into chunks of size l, l and ls.
+ * n = 3*k => l = k, ls = k
+ * n = 3*k+1 => l = k+1, ls = k-1
+ * n = 3*k+2 => l = k+1, ls = k
+ */
+ {
+ mp_limb_t m;
+
+ ASSERT (n >= TOOM3_MUL_THRESHOLD);
+ l = ls = n / 3;
+ m = n - l * 3;
+ if (m != 0)
+ ++l;
+ if (m == 1)
+ --ls;
+
+ /* A, C, E live in the product area p; B, D in the workspace ws. */
+ l2 = l * 2;
+ l3 = l * 3;
+ l4 = l * 4;
+ l5 = l * 5;
+ A = p;
+ B = ws;
+ C = p + l2;
+ D = ws + l2;
+ E = p + l4;
+ W = ws + l4;
+ }
+
+ /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
+ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
+ evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
+
+ /** Second stage: pointwise multiplies. **/
+ TOOM3_MUL_REC(D, C, C + l, l, W);
+ tD = cD*dD;
+ if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
+ if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
+ ASSERT (tD < 49);
+ TOOM3_MUL_REC(C, B, B + l, l, W);
+ tC = cC*dC;
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
+ if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
+#else
+ /* cC and dC can only be 0, 1 or 2 here (evaluation at x = 1). */
+ if (cC)
+ {
+ if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
+ else tC += add2Times (C + l, C + l, B + l, l);
+ }
+ if (dC)
+ {
+ if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
+ else tC += add2Times (C + l, C + l, B, l);
+ }
+#endif
+ ASSERT (tC < 9);
+ TOOM3_MUL_REC(B, A, A + l, l, W);
+ tB = cB*dB;
+ if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
+ if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
+ ASSERT (tB < 49);
+ TOOM3_MUL_REC(A, a, b, l, W);
+ TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
+
+ /** Third stage: interpolation. **/
+ interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
+
+ /** Final stage: add up the coefficients. **/
+ {
+ /* NOTE(review): i, x and y appear unused in this scope. */
+ mp_limb_t i, x, y;
+ tB += mpn_add_n (p + l, p + l, B, l2);
+ tD += mpn_add_n (p + l3, p + l3, D, l2);
+ mpn_incr_u (p + l3, tB);
+ mpn_incr_u (p + l4, tC);
+ mpn_incr_u (p + l5, tD);
+ }
+}
+
+/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
+
+/* Like previous function but for squaring */
+
+/* Recursion helper: pick basecase, Karatsuba or Toom-3 squaring by size. */
+#define TOOM3_SQR_REC(p, a, n, ws) \
+ do { \
+ if (n < KARATSUBA_SQR_THRESHOLD) \
+ mpn_sqr_basecase (p, a, n); \
+ else if (n < TOOM3_SQR_THRESHOLD) \
+ mpn_kara_sqr_n (p, a, n, ws); \
+ else \
+ mpn_toom3_sqr_n (p, a, n, ws); \
+ } while (0)
+
+void
+#if __STDC__
+mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
+#else
+mpn_toom3_sqr_n (p, a, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t cB,cC,cD, tB,tC,tD;
+ mp_limb_t *A,*B,*C,*D,*E, *W;
+ mp_size_t l,l2,l3,l4,l5,ls;
+
+ /* Break n words into chunks of size l, l and ls.
+ * n = 3*k => l = k, ls = k
+ * n = 3*k+1 => l = k+1, ls = k-1
+ * n = 3*k+2 => l = k+1, ls = k
+ */
+ {
+ mp_limb_t m;
+
+ /* NOTE(review): asserts against TOOM3_MUL_THRESHOLD rather than
+    TOOM3_SQR_THRESHOLD — presumably intentional since the two are
+    close, but worth confirming. */
+ ASSERT (n >= TOOM3_MUL_THRESHOLD);
+ l = ls = n / 3;
+ m = n - l * 3;
+ if (m != 0)
+ ++l;
+ if (m == 1)
+ --ls;
+
+ l2 = l * 2;
+ l3 = l * 3;
+ l4 = l * 4;
+ l5 = l * 5;
+ A = p;
+ B = ws;
+ C = p + l2;
+ D = ws + l2;
+ E = p + l4;
+ W = ws + l4;
+ }
+
+ /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
+ /* Only one evaluate3 needed: both operands are a. */
+ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
+
+ /** Second stage: pointwise multiplies. **/
+ TOOM3_SQR_REC(D, C, l, W);
+ tD = cD*cD;
+ if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
+ ASSERT (tD < 49);
+ TOOM3_SQR_REC(C, B, l, W);
+ tC = cC*cC;
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
+#else
+ /* cC is 0, 1 or 2; add 2*B once or twice accordingly. */
+ if (cC >= 1)
+ {
+ tC += add2Times (C + l, C + l, B, l);
+ if (cC == 2)
+ tC += add2Times (C + l, C + l, B, l);
+ }
+#endif
+ ASSERT (tC < 9);
+ TOOM3_SQR_REC(B, A, l, W);
+ tB = cB*cB;
+ if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
+ ASSERT (tB < 49);
+ TOOM3_SQR_REC(A, a, l, W);
+ TOOM3_SQR_REC(E, a + l2, ls, W);
+
+ /** Third stage: interpolation. **/
+ interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
+
+ /** Final stage: add up the coefficients. **/
+ {
+ /* NOTE(review): i, x and y appear unused in this scope. */
+ mp_limb_t i, x, y;
+ tB += mpn_add_n (p + l, p + l, B, l2);
+ tD += mpn_add_n (p + l3, p + l3, D, l2);
+ mpn_incr_u (p + l3, tB);
+ mpn_incr_u (p + l4, tC);
+ mpn_incr_u (p + l5, tD);
+ }
+}
+
+/* Top-level n x n limb multiply: p[0..2n-1] := a[0..n-1] * b[0..n-1].
+   Dispatches to basecase, Karatsuba, Toom-3 or FFT by size. */
+void
+#if __STDC__
+mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
+#else
+mpn_mul_n (p, a, b, n)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_srcptr b;
+ mp_size_t n;
+#endif
+{
+ if (n < KARATSUBA_MUL_THRESHOLD)
+ mpn_mul_basecase (p, a, n, b, n);
+ else if (n < TOOM3_MUL_THRESHOLD)
+ {
+ /* Allocate workspace of fixed size on stack: fast! */
+#if TUNE_PROGRAM_BUILD
+ mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB];
+#else
+ mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB];
+#endif
+ mpn_kara_mul_n (p, a, b, n, ws);
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else if (n < FFT_MUL_THRESHOLD)
+#else
+ else
+#endif
+ {
+ /* Use workspace of unknown size in heap, as stack space may
+ * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the
+ * multiplication will take much longer than malloc()/free(). */
+ /* NOTE(review): wsLen is declared mp_limb_t rather than mp_size_t;
+    harmless since limbs are at least as wide, but a size type would
+    be more natural. */
+ mp_limb_t wsLen, *ws;
+ wsLen = 2 * n + 3 * BITS_PER_MP_LIMB;
+ ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t));
+ mpn_toom3_mul_n (p, a, b, n, ws);
+ (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t));
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else
+ {
+ mpn_mul_fft_full (p, a, n, b, n);
+ }
+#endif
+}
diff --git a/rts/gmp/mpn/generic/perfsqr.c b/rts/gmp/mpn/generic/perfsqr.c
new file mode 100644
index 0000000000..42ee3405d7
--- /dev/null
+++ b/rts/gmp/mpn/generic/perfsqr.c
@@ -0,0 +1,123 @@
+/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
+ zero otherwise.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* sq_res_0x100[x mod 0x100] == 1 iff x mod 0x100 is a quadratic residue
+ modulo 0x100. */
+static unsigned char const sq_res_0x100[0x100] =
+{
+ 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+};
+
+/* Returns non-zero iff {up, usize} is a perfect square.  Three filters:
+   an O(1) residue test mod 0x100, an optional O(n) residue test against
+   the small-prime product PP, and finally an exact square root. */
+int
+#if __STDC__
+mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
+#else
+mpn_perfect_square_p (up, usize)
+ mp_srcptr up;
+ mp_size_t usize;
+#endif
+{
+ mp_limb_t rem;
+ mp_ptr root_ptr;
+ int res;
+ TMP_DECL (marker);
+
+ /* The first test excludes 55/64 (85.9%) of the perfect square candidates
+ in O(1) time. */
+ if ((sq_res_0x100[(unsigned int) up[0] % 0x100] & 1) == 0)
+ return 0;
+
+#if defined (PP)
+ /* The second test excludes 30652543/30808063 (99.5%) of the remaining
+ perfect square candidates in O(n) time. */
+
+ /* Firstly, compute REM = A mod PP. */
+ if (UDIV_TIME > (2 * UMUL_TIME + 6))
+ rem = mpn_preinv_mod_1 (up, usize, (mp_limb_t) PP, (mp_limb_t) PP_INVERTED);
+ else
+ rem = mpn_mod_1 (up, usize, (mp_limb_t) PP);
+
+ /* Now decide if REM is a quadratic residue modulo the factors in PP. */
+
+ /* If A is just a few limbs, computing the square root does not take long
+ time, so things might run faster if we limit this loop according to the
+ size of A. */
+
+ /* Each constant below is a bitmask of the quadratic residues modulo
+    the given small prime. */
+#if BITS_PER_MP_LIMB == 64
+ if (((CNST_LIMB(0x12DD703303AED3) >> rem % 53) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x4351B2753DF) >> rem % 47) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x35883A3EE53) >> rem % 43) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x1B382B50737) >> rem % 41) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x165E211E9B) >> rem % 37) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x121D47B7) >> rem % 31) & 1) == 0)
+ return 0;
+#endif
+ if (((0x13D122F3L >> rem % 29) & 1) == 0)
+ return 0;
+ if (((0x5335FL >> rem % 23) & 1) == 0)
+ return 0;
+ if (((0x30AF3L >> rem % 19) & 1) == 0)
+ return 0;
+ if (((0x1A317L >> rem % 17) & 1) == 0)
+ return 0;
+ if (((0x161BL >> rem % 13) & 1) == 0)
+ return 0;
+ if (((0x23BL >> rem % 11) & 1) == 0)
+ return 0;
+ if (((0x017L >> rem % 7) & 1) == 0)
+ return 0;
+ if (((0x13L >> rem % 5) & 1) == 0)
+ return 0;
+ if (((0x3L >> rem % 3) & 1) == 0)
+ return 0;
+#endif
+
+ TMP_MARK (marker);
+
+ /* For the third and last test, we finally compute the square root,
+ to make sure we've really got a perfect square. */
+ root_ptr = (mp_ptr) TMP_ALLOC ((usize + 1) / 2 * BYTES_PER_MP_LIMB);
+
+ /* Iff mpn_sqrtrem returns zero, the square is perfect. */
+ res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
+ TMP_FREE (marker);
+ return res;
+}
diff --git a/rts/gmp/mpn/generic/popcount.c b/rts/gmp/mpn/generic/popcount.c
new file mode 100644
index 0000000000..387be9536d
--- /dev/null
+++ b/rts/gmp/mpn/generic/popcount.c
@@ -0,0 +1,93 @@
+/* popcount.c
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#if defined __GNUC__
+/* No processor claiming to be SPARC v9 compliant seem to
+ implement the POPC instruction. Disable pattern for now. */
+#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64
+#define popc_limb(a) \
+ ({ \
+ DItype __res; \
+ asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \
+ __res; \
+ })
+#endif
+#endif
+
+#ifndef popc_limb
+
+/* Cool population count of a mp_limb_t.
+ You have to figure out how this works, I won't tell you! */
+
+/* (It is the classic SWAR bit-counting scheme: pairwise sums of bits,
+   then nibble sums, then byte folding down to a single count.) */
+static inline unsigned int
+#if __STDC__
+popc_limb (mp_limb_t x)
+#else
+popc_limb (x)
+ mp_limb_t x;
+#endif
+{
+#if BITS_PER_MP_LIMB == 64
+ /* We have to go into some trouble to define these constants.
+ (For mp_limb_t being `long long'.) */
+ mp_limb_t cnst;
+ cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2);
+ x -= (x & cnst) >> 1;
+ cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2);
+ x = ((x & ~cnst) >> 2) + (x & cnst);
+ cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2);
+ x = ((x >> 4) + x) & cnst;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x);
+ x = ((x >> 32) + x) & 0xff;
+#endif
+#if BITS_PER_MP_LIMB == 32
+ x -= (x & 0xaaaaaaaa) >> 1;
+ x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L);
+ x = ((x >> 4) + x) & 0x0f0f0f0fL;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x) & 0xff;
+#endif
+ return x;
+}
+#endif
+
+/* Returns the total number of set bits in {p, size}. */
+unsigned long int
+#if __STDC__
+mpn_popcount (register mp_srcptr p, register mp_size_t size)
+#else
+mpn_popcount (p, size)
+ register mp_srcptr p;
+ register mp_size_t size;
+#endif
+{
+ unsigned long int popcnt;
+ mp_size_t i;
+
+ /* Simply sum per-limb counts; size == 0 yields 0. */
+ popcnt = 0;
+ for (i = 0; i < size; i++)
+ popcnt += popc_limb (p[i]);
+
+ return popcnt;
+}
diff --git a/rts/gmp/mpn/generic/pre_mod_1.c b/rts/gmp/mpn/generic/pre_mod_1.c
new file mode 100644
index 0000000000..27179683b3
--- /dev/null
+++ b/rts/gmp/mpn/generic/pre_mod_1.c
@@ -0,0 +1,69 @@
+/* mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb,
+ divisor_limb_inverted) --
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by the normalized DIVISOR_LIMB.
+ DIVISOR_LIMB_INVERTED should be 2^(2*BITS_PER_MP_LIMB) / DIVISOR_LIMB +
+ - 2^BITS_PER_MP_LIMB.
+ Return the single-limb remainder.
+
+Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Default relative instruction timings, used by callers to decide
+   whether the preinverted division pays off. */
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+/* Returns {dividend_ptr, dividend_size} mod divisor_limb, using the
+   precomputed inverse divisor_limb_inverted (see file header) so each
+   step is multiplications rather than a hardware divide.
+   The divisor must be normalized (high bit set). */
+mp_limb_t
+#if __STDC__
+mpn_preinv_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size,
+ mp_limb_t divisor_limb, mp_limb_t divisor_limb_inverted)
+#else
+mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb, divisor_limb_inverted)
+ mp_srcptr dividend_ptr;
+ mp_size_t dividend_size;
+ mp_limb_t divisor_limb;
+ mp_limb_t divisor_limb_inverted;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t n0, r;
+ int dummy;
+
+ /* Seed the remainder with the top limb; if it already exceeds the
+    divisor it must enter the division loop itself, so start r at 0
+    without consuming it. */
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ i--;
+
+ /* Fold in the remaining limbs, high to low; quotient is discarded. */
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (dummy, r, r, n0, divisor_limb, divisor_limb_inverted);
+ }
+ return r;
+}
diff --git a/rts/gmp/mpn/generic/random.c b/rts/gmp/mpn/generic/random.c
new file mode 100644
index 0000000000..dea4e20e56
--- /dev/null
+++ b/rts/gmp/mpn/generic/random.c
@@ -0,0 +1,43 @@
+/* mpn_random -- Generate random numbers.
+
+Copyright (C) 1996, 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "urandom.h"
+
+/* Fills {res_ptr, size} with random limbs, guaranteeing the most
+   significant limb is non-zero.  Randomness comes from urandom(). */
+void
+#if __STDC__
+mpn_random (mp_ptr res_ptr, mp_size_t size)
+#else
+mpn_random (res_ptr, size)
+ mp_ptr res_ptr;
+ mp_size_t size;
+#endif
+{
+ mp_size_t i;
+
+ for (i = 0; i < size; i++)
+ res_ptr[i] = urandom ();
+
+ /* Make sure the most significant limb is non-zero. */
+ while (res_ptr[size - 1] == 0)
+ res_ptr[size - 1] = urandom ();
+}
diff --git a/rts/gmp/mpn/generic/random2.c b/rts/gmp/mpn/generic/random2.c
new file mode 100644
index 0000000000..86682f81fa
--- /dev/null
+++ b/rts/gmp/mpn/generic/random2.c
@@ -0,0 +1,105 @@
+/* mpn_random2 -- Generate random numbers with relatively long strings
+ of ones and zeroes. Suitable for border testing.
+
+Copyright (C) 1992, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Platform shims so the code below can call random() everywhere. */
+#if defined (__hpux) || defined (__alpha) || defined (__svr4__) || defined (__SVR4)
+/* HPUX lacks random(). DEC OSF/1 1.2 random() returns a double. */
+long mrand48 ();
+static inline long
+random ()
+{
+ return mrand48 ();
+}
+#elif defined(_WIN32) && !(defined(__CYGWIN__) || defined(__CYGWIN32__))
+/* MS CRT supplies just the poxy rand(), with an upper bound of 0x7fff */
+/* NOTE(review): `rand () << 32` shifts by at least the width of int on
+   typical Windows targets, which is undefined behavior in C; the intent
+   is presumably to spread entropy across a wide limb — confirm and use a
+   wider intermediate type. */
+static inline unsigned long
+random ()
+{
+ return rand () ^ (rand () << 16) ^ (rand() << 32);
+}
+
+#else
+long random ();
+#endif
+
+/* It's a bit tricky to get this right, so please test the code well
+ if you hack with it. Some early versions of the function produced
+ random numbers with the leading limb == 0, and some versions never
+ made the most significant bit set. */
+
+/* Fills {res_ptr, size} with alternating random-length runs of ones and
+   zeroes (good for exercising carry/borrow boundary cases).  The most
+   significant limb is guaranteed non-zero. */
+void
+#if __STDC__
+mpn_random2 (mp_ptr res_ptr, mp_size_t size)
+#else
+mpn_random2 (res_ptr, size)
+ mp_ptr res_ptr;
+ mp_size_t size;
+#endif
+{
+ int n_bits;
+ int bit_pos;
+ mp_size_t limb_pos;
+ unsigned int ran;
+ mp_limb_t limb;
+
+ limb = 0;
+
+ /* Start off in a random bit position in the most significant limb. */
+ bit_pos = random () & (BITS_PER_MP_LIMB - 1);
+
+ /* Least significant bit of RAN chooses string of ones/string of zeroes.
+ Make most significant limb be non-zero by setting bit 0 of RAN. */
+ ran = random () | 1;
+
+ /* Work from the most significant limb downwards, emitting a completed
+    limb whenever a run crosses a limb boundary. */
+ for (limb_pos = size - 1; limb_pos >= 0; )
+ {
+ n_bits = (ran >> 1) % BITS_PER_MP_LIMB + 1;
+ if ((ran & 1) != 0)
+ {
+ /* Generate a string of ones. */
+ if (n_bits >= bit_pos)
+ {
+ res_ptr[limb_pos--] = limb | ((((mp_limb_t) 2) << bit_pos) - 1);
+ bit_pos += BITS_PER_MP_LIMB;
+ limb = (~(mp_limb_t) 0) << (bit_pos - n_bits);
+ }
+ else
+ {
+ limb |= ((((mp_limb_t) 1) << n_bits) - 1) << (bit_pos - n_bits + 1);
+ }
+ }
+ else
+ {
+ /* Generate a string of zeroes. */
+ if (n_bits >= bit_pos)
+ {
+ res_ptr[limb_pos--] = limb;
+ limb = 0;
+ bit_pos += BITS_PER_MP_LIMB;
+ }
+ }
+ bit_pos -= n_bits;
+ ran = random ();
+ }
+}
diff --git a/rts/gmp/mpn/generic/rshift.c b/rts/gmp/mpn/generic/rshift.c
new file mode 100644
index 0000000000..59caf73529
--- /dev/null
+++ b/rts/gmp/mpn/generic/rshift.c
@@ -0,0 +1,88 @@
+/* mpn_rshift -- Shift right a low-level natural-number integer.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right
+ and store the USIZE least significant limbs of the result at WP.
+ The bits shifted out to the right are returned.
+
+ Argument constraints:
+ 1. 0 < CNT < BITS_PER_MP_LIMB
+ 2. If the result is to be written over the input, WP must be <= UP.
+*/
+
+/* Shift {UP, USIZE} right by CNT bits into {WP, USIZE}; the bits shifted
+   out at the low end are returned left-justified in a limb.  The
+   contract (see comment above) requires 0 < CNT < BITS_PER_MP_LIMB. */
+mp_limb_t
+#if __STDC__
+mpn_rshift (register mp_ptr wp,
+ register mp_srcptr up, mp_size_t usize,
+ register unsigned int cnt)
+#else
+mpn_rshift (wp, up, usize, cnt)
+ register mp_ptr wp;
+ register mp_srcptr up;
+ mp_size_t usize;
+ register unsigned int cnt;
+#endif
+{
+ register mp_limb_t high_limb, low_limb;
+ register unsigned sh_1, sh_2;
+ register mp_size_t i;
+ mp_limb_t retval; /* shifted-out low bits, left-justified */
+
+#ifdef DEBUG
+ if (usize == 0 || cnt == 0)
+ abort ();
+#endif
+
+ sh_1 = cnt;
+
+ /* Disabled cnt == 0 fast path: the contract forbids cnt == 0, and the
+ shift by sh_2 == BITS_PER_MP_LIMB below would be undefined for it. */
+#if 0
+ if (sh_1 == 0)
+ {
+ if (wp != up)
+ {
+ /* Copy from low end to high end, to allow specified input/output
+ overlapping. */
+ for (i = 0; i < usize; i++)
+ wp[i] = up[i];
+ }
+ return usize;
+ }
+#endif
+
+ /* Offset wp so the loop below can store wp[i] with i starting at 1. */
+ wp -= 1;
+ sh_2 = BITS_PER_MP_LIMB - sh_1;
+ high_limb = up[0];
+ retval = high_limb << sh_2; /* low cnt bits of up[0], left-justified */
+ low_limb = high_limb;
+
+ for (i = 1; i < usize; i++)
+ {
+ high_limb = up[i];
+ /* Each result limb combines the high bits of one source limb with
+ the low bits of the next more significant one. */
+ wp[i] = (low_limb >> sh_1) | (high_limb << sh_2);
+ low_limb = high_limb;
+ }
+ /* Most significant result limb has zeroes shifted in from above. */
+ wp[i] = low_limb >> sh_1;
+
+ return retval;
+}
diff --git a/rts/gmp/mpn/generic/sb_divrem_mn.c b/rts/gmp/mpn/generic/sb_divrem_mn.c
new file mode 100644
index 0000000000..a269e34f5f
--- /dev/null
+++ b/rts/gmp/mpn/generic/sb_divrem_mn.c
@@ -0,0 +1,201 @@
+/* mpn_sb_divrem_mn -- Divide natural numbers, producing both remainder and
+ quotient.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE
+ INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+ IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A
+ FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1993, 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write
+ the NSIZE-DSIZE least significant quotient limbs at QP
+ and the DSIZE long remainder at NP. If QEXTRA_LIMBS is
+ non-zero, generate that many fraction bits and append them after the
+ other quotient limbs.
+ Return the most significant limb of the quotient, this is always 0 or 1.
+
+ Preconditions:
+ 0. NSIZE >= DSIZE.
+ 1. The most significant bit of the divisor must be set.
+ 2. QP must either not overlap with the input operands at all, or
+ QP + DSIZE >= NP must hold true. (This means that it's
+ possible to put the quotient in the high part of NUM, right after the
+ remainder in NUM.)
+ 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero.
+ 4. DSIZE >= 2. */
+
+
+#define PREINVERT_VIABLE \
+ (UDIV_TIME > 2 * UMUL_TIME + 6 /* && ! TARGET_REGISTER_STARVED */)
+
+/* Schoolbook division: see the contract comment above for operand
+   layout.  Quotient limbs go to QP (low index = least significant),
+   the remainder ends up in the low DSIZE limbs of NP, and the return
+   value is the most significant quotient limb (0 or 1). */
+mp_limb_t
+#if __STDC__
+mpn_sb_divrem_mn (mp_ptr qp,
+ mp_ptr np, mp_size_t nsize,
+ mp_srcptr dp, mp_size_t dsize)
+#else
+mpn_sb_divrem_mn (qp, np, nsize, dp, dsize)
+ mp_ptr qp;
+ mp_ptr np;
+ mp_size_t nsize;
+ mp_srcptr dp;
+ mp_size_t dsize;
+#endif
+{
+ mp_limb_t most_significant_q_limb = 0;
+ mp_size_t i;
+ mp_limb_t dx, d1, n0; /* two top divisor limbs, top numerator limb */
+ mp_limb_t dxinv; /* preinverted dx; only valid if have_preinv */
+ int have_preinv;
+
+ /* NOTE(review): the header comment allows DSIZE >= 2, but this asserts
+ dsize > 2 -- presumably the dsize == 2 case is served by a dedicated
+ routine elsewhere; confirm callers never reach here with dsize == 2. */
+ ASSERT_ALWAYS (dsize > 2);
+
+ /* Position np at the window holding the current partial remainder. */
+ np += nsize - dsize;
+ dx = dp[dsize - 1];
+ d1 = dp[dsize - 2];
+ n0 = np[dsize - 1];
+
+ /* If the top of the numerator window is >= the divisor, subtract once
+ and record a most significant quotient limb of 1. */
+ if (n0 >= dx)
+ {
+ if (n0 > dx || mpn_cmp (np, dp, dsize - 1) >= 0)
+ {
+ mpn_sub_n (np, np, dp, dsize);
+ most_significant_q_limb = 1;
+ }
+ }
+
+ /* If multiplication is much faster than division, preinvert the
+ most significant divisor limb before entering the loop. */
+ if (PREINVERT_VIABLE)
+ {
+ have_preinv = 0;
+ if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - dsize) > UDIV_TIME)
+ {
+ invert_limb (dxinv, dx);
+ have_preinv = 1;
+ }
+ }
+
+ /* Note: have_preinv and dxinv are read below only under
+ PREINVERT_VIABLE, so "may be used uninitialized" warnings about
+ them are false positives. */
+ for (i = nsize - dsize - 1; i >= 0; i--)
+ {
+ mp_limb_t q;
+ mp_limb_t nx;
+ mp_limb_t cy_limb;
+
+ nx = np[dsize - 1];
+ np--;
+
+ if (nx == dx)
+ {
+ /* This might over-estimate q, but it's probably not worth
+ the extra code here to find out. */
+ q = ~(mp_limb_t) 0;
+
+#if 1
+ cy_limb = mpn_submul_1 (np, dp, dsize, q);
+#else
+ /* This should be faster on many machines */
+ cy_limb = mpn_sub_n (np + 1, np + 1, dp, dsize);
+ cy = mpn_add_n (np, np, dp, dsize);
+ np[dsize] += cy;
+#endif
+
+ if (nx != cy_limb)
+ {
+ /* q was one too large; add the divisor back once. */
+ mpn_add_n (np, np, dp, dsize);
+ q--;
+ }
+
+ qp[i] = q;
+ }
+ else
+ {
+ mp_limb_t rx, r1, r0, p1, p0;
+
+ /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register
+ usage when np[dsize-1] is used in an asm statement like
+ umul_ppmm in udiv_qrnnd_preinv. The symptom is seg faults due
+ to registers being clobbered. gcc 2.95 i386 doesn't have the
+ problem. */
+ {
+ mp_limb_t workaround = np[dsize - 1];
+ if (PREINVERT_VIABLE && have_preinv)
+ udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
+ else
+ udiv_qrnnd (q, r1, nx, workaround, dx);
+ }
+ /* Refine the 2-limb quotient estimate using the second divisor
+ limb d1 (see the remainder diagram at the end of the function). */
+ umul_ppmm (p1, p0, d1, q);
+
+ r0 = np[dsize - 2];
+ rx = 0;
+ if (r1 < p1 || (r1 == p1 && r0 < p0))
+ {
+ p1 -= p0 < d1;
+ p0 -= d1;
+ q--;
+ r1 += dx;
+ rx = r1 < dx;
+ }
+
+ p1 += r0 < p0; /* cannot carry! */
+ rx -= r1 < p1; /* may become 11..1 if q is still too large */
+ r1 -= p1;
+ r0 -= p0;
+
+ cy_limb = mpn_submul_1 (np, dp, dsize - 2, q);
+
+ {
+ mp_limb_t cy1, cy2;
+ cy1 = r0 < cy_limb;
+ r0 -= cy_limb;
+ cy2 = r1 < cy1;
+ r1 -= cy1;
+ np[dsize - 1] = r1;
+ np[dsize - 2] = r0;
+ if (cy2 != rx)
+ {
+ /* q still one too large; final add-back correction. */
+ mpn_add_n (np, np, dp, dsize);
+ q--;
+ }
+ }
+ qp[i] = q;
+ }
+ }
+
+ /* ______ ______ ______
+ |__rx__|__r1__|__r0__| partial remainder
+ ______ ______
+ - |__p1__|__p0__| partial product to subtract
+ ______ ______
+ - |______|cylimb|
+
+ rx is -1, 0 or 1. If rx=1, then q is correct (it should match
+ carry out). If rx=-1 then q is too large. If rx=0, then q might
+ be too large, but it is most likely correct.
+ */
+
+ return most_significant_q_limb;
+}
diff --git a/rts/gmp/mpn/generic/scan0.c b/rts/gmp/mpn/generic/scan0.c
new file mode 100644
index 0000000000..96f05ce854
--- /dev/null
+++ b/rts/gmp/mpn/generic/scan0.c
@@ -0,0 +1,62 @@
+/* mpn_scan0 -- Scan from a given bit position for the next clear bit.
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Design issues:
+ 1. What if starting_bit is not within U? Caller's problem?
+ 2. Bit index should be 'unsigned'?
+
+ Argument constraints:
+ 1. U must sooner or later have a limb with a clear bit.
+ */
+
+/* Return the absolute index of the first clear bit in U at or above
+   STARTING_BIT.  The caller guarantees such a bit exists (see the
+   argument constraints above); there is deliberately no bound check. */
+unsigned long int
+#if __STDC__
+mpn_scan0 (register mp_srcptr up,
+ register unsigned long int starting_bit)
+#else
+mpn_scan0 (up, starting_bit)
+ register mp_srcptr up;
+ register unsigned long int starting_bit;
+#endif
+{
+ mp_size_t starting_word;
+ mp_limb_t alimb;
+ int cnt;
+ mp_srcptr p;
+
+ /* Start at the word implied by STARTING_BIT. */
+ starting_word = starting_bit / BITS_PER_MP_LIMB;
+ p = up + starting_word;
+ alimb = ~*p++; /* complement so a clear bit appears as a set bit */
+
+ /* Mask off any bits before STARTING_BIT in the first limb. */
+ alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB);
+
+ while (alimb == 0)
+ alimb = ~*p++;
+
+ /* alimb & -alimb isolates the least significant set bit; translate
+ its leading-zeros count into an absolute bit index (p has already
+ advanced one limb past the hit). */
+ count_leading_zeros (cnt, alimb & -alimb);
+ return (p - up) * BITS_PER_MP_LIMB - 1 - cnt;
+}
diff --git a/rts/gmp/mpn/generic/scan1.c b/rts/gmp/mpn/generic/scan1.c
new file mode 100644
index 0000000000..98e2e0dcc0
--- /dev/null
+++ b/rts/gmp/mpn/generic/scan1.c
@@ -0,0 +1,62 @@
+/* mpn_scan1 -- Scan from a given bit position for the next set bit.
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Design issues:
+ 1. What if starting_bit is not within U? Caller's problem?
+ 2. Bit index should be 'unsigned'?
+
+ Argument constraints:
+ 1. U must sooner or later have a limb != 0.
+ */
+
+/* Return the absolute index of the first set bit in U at or above
+   STARTING_BIT.  The caller guarantees a non-zero limb exists (see the
+   argument constraints above); there is deliberately no bound check. */
+unsigned long int
+#if __STDC__
+mpn_scan1 (register mp_srcptr up,
+ register unsigned long int starting_bit)
+#else
+mpn_scan1 (up, starting_bit)
+ register mp_srcptr up;
+ register unsigned long int starting_bit;
+#endif
+{
+ mp_size_t starting_word;
+ mp_limb_t alimb;
+ int cnt;
+ mp_srcptr p;
+
+ /* Start at the word implied by STARTING_BIT. */
+ starting_word = starting_bit / BITS_PER_MP_LIMB;
+ p = up + starting_word;
+ alimb = *p++;
+
+ /* Mask off any bits before STARTING_BIT in the first limb. */
+ alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB);
+
+ while (alimb == 0)
+ alimb = *p++;
+
+ /* alimb & -alimb isolates the least significant set bit; translate
+ its leading-zeros count into an absolute bit index (p has already
+ advanced one limb past the hit). */
+ count_leading_zeros (cnt, alimb & -alimb);
+ return (p - up) * BITS_PER_MP_LIMB - 1 - cnt;
+}
diff --git a/rts/gmp/mpn/generic/set_str.c b/rts/gmp/mpn/generic/set_str.c
new file mode 100644
index 0000000000..e6ccc92154
--- /dev/null
+++ b/rts/gmp/mpn/generic/set_str.c
@@ -0,0 +1,159 @@
+/* mpn_set_str (mp_ptr res_ptr, const char *str, size_t str_len, int base)
+ -- Convert a STR_LEN long base BASE byte string pointed to by STR to a
+ limb vector pointed to by RES_PTR. Return the number of limbs in
+ RES_PTR.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Convert the STR_LEN raw base-BASE digit values at STR (most significant
+   digit first, values 0..base-1, NOT ASCII) into limbs at XP; return the
+   limb count.  Power-of-2 bases are handled by pure bit packing; other
+   bases accumulate chars_per_limb digits at a time into a limb and fold
+   it in with one mul/add pass per group. */
+mp_size_t
+#if __STDC__
+mpn_set_str (mp_ptr xp, const unsigned char *str, size_t str_len, int base)
+#else
+mpn_set_str (xp, str, str_len, base)
+ mp_ptr xp;
+ const unsigned char *str;
+ size_t str_len;
+ int base;
+#endif
+{
+ mp_size_t size; /* number of limbs produced so far */
+ mp_limb_t big_base; /* base^chars_per_limb (or bits/digit for 2^k) */
+ int indigits_per_limb;
+ mp_limb_t res_digit; /* accumulator for the current digit group */
+
+ big_base = __mp_bases[base].big_base;
+ indigits_per_limb = __mp_bases[base].chars_per_limb;
+
+/* size = str_len / indigits_per_limb + 1; */
+
+ size = 0;
+
+ if ((base & (base - 1)) == 0)
+ {
+ /* The base is a power of 2. Read the input string from
+ least to most significant character/digit. */
+
+ const unsigned char *s;
+ int next_bitpos;
+ int bits_per_indigit = big_base; /* for 2^k bases big_base holds k */
+
+ res_digit = 0;
+ next_bitpos = 0;
+
+ for (s = str + str_len - 1; s >= str; s--)
+ {
+ int inp_digit = *s;
+
+ res_digit |= (mp_limb_t) inp_digit << next_bitpos;
+ next_bitpos += bits_per_indigit;
+ if (next_bitpos >= BITS_PER_MP_LIMB)
+ {
+ /* Limb full: store it and keep the digit bits that spilled
+ over the limb boundary. */
+ xp[size++] = res_digit;
+ next_bitpos -= BITS_PER_MP_LIMB;
+ res_digit = inp_digit >> (bits_per_indigit - next_bitpos);
+ }
+ }
+
+ if (res_digit != 0)
+ xp[size++] = res_digit;
+ }
+ else
+ {
+ /* General case. The base is not a power of 2. */
+
+ size_t i;
+ int j;
+ mp_limb_t cy_limb;
+
+ /* Process full groups of indigits_per_limb digits. */
+ for (i = indigits_per_limb; i < str_len; i += indigits_per_limb)
+ {
+ res_digit = *str++;
+ if (base == 10)
+ { /* This is a common case.
+ Help the compiler to avoid multiplication. */
+ for (j = 1; j < indigits_per_limb; j++)
+ res_digit = res_digit * 10 + *str++;
+ }
+ else
+ {
+ for (j = 1; j < indigits_per_limb; j++)
+ res_digit = res_digit * base + *str++;
+ }
+
+ if (size == 0)
+ {
+ if (res_digit != 0)
+ {
+ xp[0] = res_digit;
+ size = 1;
+ }
+ }
+ else
+ {
+ /* x = x * big_base + group value. */
+ cy_limb = mpn_mul_1 (xp, xp, size, big_base);
+ cy_limb += mpn_add_1 (xp, xp, size, res_digit);
+ if (cy_limb != 0)
+ xp[size++] = cy_limb;
+ }
+ }
+
+ /* Final partial group: str_len - (i - indigits_per_limb) digits
+ remain; accumulate them while building the matching power of the
+ base in big_base.  (j is promoted to size_t in the comparison
+ below; j >= 1 so the signed/unsigned mix is benign.) */
+ big_base = base;
+ res_digit = *str++;
+ if (base == 10)
+ { /* This is a common case.
+ Help the compiler to avoid multiplication. */
+ for (j = 1; j < str_len - (i - indigits_per_limb); j++)
+ {
+ res_digit = res_digit * 10 + *str++;
+ big_base *= 10;
+ }
+ }
+ else
+ {
+ for (j = 1; j < str_len - (i - indigits_per_limb); j++)
+ {
+ res_digit = res_digit * base + *str++;
+ big_base *= base;
+ }
+ }
+
+ if (size == 0)
+ {
+ if (res_digit != 0)
+ {
+ xp[0] = res_digit;
+ size = 1;
+ }
+ }
+ else
+ {
+ cy_limb = mpn_mul_1 (xp, xp, size, big_base);
+ cy_limb += mpn_add_1 (xp, xp, size, res_digit);
+ if (cy_limb != 0)
+ xp[size++] = cy_limb;
+ }
+ }
+
+ return size;
+}
diff --git a/rts/gmp/mpn/generic/sqr_basecase.c b/rts/gmp/mpn/generic/sqr_basecase.c
new file mode 100644
index 0000000000..760258a3e0
--- /dev/null
+++ b/rts/gmp/mpn/generic/sqr_basecase.c
@@ -0,0 +1,83 @@
+/* mpn_sqr_basecase -- Internal routine to square two natural numbers
+ of length m and n.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Square {UP, N} into {PRODP, 2N} by the schoolbook method: compute the
+   off-diagonal products u[i]*u[j] (i<j) once into a scratch area, the
+   diagonal squares u[i]^2 directly into PRODP, then double the
+   off-diagonal part with a 1-bit left shift and add it in. */
+void
+#if __STDC__
+mpn_sqr_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t n)
+#else
+mpn_sqr_basecase (prodp, up, n)
+ mp_ptr prodp;
+ mp_srcptr up;
+ mp_size_t n;
+#endif
+{
+ mp_size_t i;
+
+ {
+ /* N.B.! We need the superfluous indirection through argh to work around
+ a reloader bug in GCC 2.7.*. */
+ mp_limb_t x;
+ mp_limb_t argh;
+ x = up[0];
+ umul_ppmm (argh, prodp[0], x, x);
+ prodp[1] = argh;
+ }
+ if (n > 1)
+ {
+ mp_limb_t tarr[2 * KARATSUBA_SQR_THRESHOLD];
+ mp_ptr tp = tarr;
+ mp_limb_t cy;
+
+ /* must fit 2*n limbs in tarr */
+ ASSERT (n <= KARATSUBA_SQR_THRESHOLD);
+
+ /* Row 0 of the off-diagonal triangle: up[0] times up[1..n-1]. */
+ cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
+ tp[n - 1] = cy;
+ /* Remaining rows, each shifted two limbs further along tp. */
+ for (i = 2; i < n; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
+ tp[n + i - 2] = cy;
+ }
+ /* Diagonal squares go straight into the product area. */
+ for (i = 1; i < n; i++)
+ {
+ mp_limb_t x;
+ x = up[i];
+ umul_ppmm (prodp[2 * i + 1], prodp[2 * i], x, x);
+ }
+ {
+ /* Double the off-diagonal sum and add it in, offset one limb. */
+ mp_limb_t cy;
+ cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
+ cy += mpn_add_n (prodp + 1, prodp + 1, tp, 2 * n - 2);
+ prodp[2 * n - 1] += cy;
+ }
+ }
+}
diff --git a/rts/gmp/mpn/generic/sqrtrem.c b/rts/gmp/mpn/generic/sqrtrem.c
new file mode 100644
index 0000000000..ee3b5144dd
--- /dev/null
+++ b/rts/gmp/mpn/generic/sqrtrem.c
@@ -0,0 +1,509 @@
+/* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
+
+ Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
+ Write the remainder at REM_PTR, if REM_PTR != NULL.
+ Return the size of the remainder.
+ (The size of the root is always half of the size of the operand.)
+
+ OP_PTR and ROOT_PTR may not point to the same object.
+ OP_PTR and REM_PTR may point to the same object.
+
+ If REM_PTR is NULL, only the root is computed and the return value of
+ the function is 0 if OP is a perfect square, and *any* non-zero number
+ otherwise.
+
+Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* This code is just correct if "unsigned char" has at least 8 bits. It
+ doesn't help to use CHAR_BIT from limits.h, as the real problem is
+ the static arrays. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Square root algorithm:
+
+ 1. Shift OP (the input) to the left an even number of bits s.t. there
+ are an even number of words and either (or both) of the most
+ significant bits are set. This way, sqrt(OP) has exactly half as
+ many words as OP, and has its most significant bit set.
+
+ 2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
+ This approximation is used for the first single-precision
+ iterations of Newton's method, yielding a full-word approximation
+ to sqrt(OP).
+
+ 3. Perform multiple-precision Newton iteration until we have the
+ exact result. Only about half of the input operand is used in
+ this calculation, as the square root is perfectly determinable
+ from just the higher half of a number. */
+
+/* Define this macro for IEEE P854 machines with a fast sqrt instruction. */
+#if defined __GNUC__ && ! defined __SOFT_FLOAT
+
+#if defined (__sparc__) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (__HAVE_68881__)
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (__hppa) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if 0
+#if defined (__i386__) || defined (__i486__)
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a)); \
+ __sqrt_res; \
+ })
+#endif
+#endif
+
+#endif
+
+#ifndef SQRT
+
+/* Tables for initial approximation of the square root. These are
+ indexed with bits 1-8 of the operand for which the square root is
+ calculated, where bit 0 is the most significant non-zero bit. I.e.
+ the most significant one-bit is not used, since that per definition
+ is one. Likewise, the tables don't return the highest bit of the
+ result. That bit must be inserted by or:ing the returned value with
+ 0x100. This way, we get a 9-bit approximation from 8-bit tables! */
+
+/* Table to be used for operands with an even total number of bits.
+ (Exactly as in the decimal system there are similarities between the
+ square root of numbers with the same initial digits and an even
+ difference in the total number of digits. Consider the square root
+ of 1, 10, 100, 1000, ...) */
+/* Indexed by the 8 bits just below the (implied, always-one) leading bit;
+   the caller or:s 0x100 into the looked-up value to form the 9-bit start
+   approximation (see the table description above). */
+static const unsigned char even_approx_tab[256] =
+{
+ 0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
+ 0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
+ 0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
+ 0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
+ 0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
+ 0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
+ 0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
+ 0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
+ 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
+ 0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
+ 0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
+ 0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
+ 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
+ 0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
+ 0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
+ 0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
+ 0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
+ 0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
+ 0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
+ 0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
+ 0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
+ 0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
+ 0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
+ 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
+ 0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
+ 0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
+ 0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
+ 0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
+ 0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
+ 0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
+ 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
+};
+
+/* Table to be used for operands with an odd total number of bits.
+ (Further comments before previous table.) */
+/* Same indexing as even_approx_tab, for operands whose leading one-bit
+   is at an odd position; 0x100 is likewise or:ed in by the caller. */
+static const unsigned char odd_approx_tab[256] =
+{
+ 0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+ 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
+ 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
+ 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
+ 0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
+ 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
+ 0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
+ 0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
+ 0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
+ 0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
+ 0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
+ 0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
+ 0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
+ 0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
+ 0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
+ 0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
+ 0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
+ 0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
+ 0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
+ 0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
+ 0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
+ 0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
+ 0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
+ 0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
+ 0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
+ 0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
+ 0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
+ 0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
+ 0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
+ 0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
+ 0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
+ 0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
+};
+#endif
+
+
+/* Implements the three-phase algorithm documented above: normalize, get
+   a one-limb start approximation, then multiple-precision Newton
+   iteration.  The root is written to ROOT_PTR; the optional remainder
+   to REM_PTR; the return value is the remainder size (or, when REM_PTR
+   is NULL, zero iff OP is a perfect square). */
+mp_size_t
+#if __STDC__
+mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size)
+#else
+mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
+ mp_ptr root_ptr;
+ mp_ptr rem_ptr;
+ mp_srcptr op_ptr;
+ mp_size_t op_size;
+#endif
+{
+ /* R (root result) */
+ mp_ptr rp; /* Pointer to least significant word */
+ mp_size_t rsize; /* The size in words */
+
+ /* T (OP shifted to the left a.k.a. normalized) */
+ mp_ptr tp; /* Pointer to least significant word */
+ mp_size_t tsize; /* The size in words */
+ mp_ptr t_end_ptr; /* Pointer right beyond most sign. word */
+ mp_limb_t t_high0, t_high1; /* The two most significant words */
+
+ /* TT (temporary for numerator/remainder) */
+ mp_ptr ttp; /* Pointer to least significant word */
+
+ /* X (temporary for quotient in main loop) */
+ mp_ptr xp; /* Pointer to least significant word */
+ mp_size_t xsize; /* The size in words */
+
+ unsigned cnt;
+ mp_limb_t initial_approx; /* Initially made approximation */
+ mp_size_t tsizes[BITS_PER_MP_LIMB]; /* Successive calculation precisions */
+ mp_size_t tmp;
+ mp_size_t i;
+
+ mp_limb_t cy_limb;
+ TMP_DECL (marker);
+
+ /* If OP is zero, both results are zero. */
+ if (op_size == 0)
+ return 0;
+
+ count_leading_zeros (cnt, op_ptr[op_size - 1]);
+ tsize = op_size;
+ if ((tsize & 1) != 0)
+ {
+ /* Pad to an even limb count; the whole extra limb is folded into
+ the shift count. */
+ cnt += BITS_PER_MP_LIMB;
+ tsize++;
+ }
+
+ rsize = tsize / 2;
+ rp = root_ptr;
+
+ TMP_MARK (marker);
+
+ /* Shift OP an even number of bits into T, such that either the most or
+ the second most significant bit is set, and such that the number of
+ words in T becomes even. This way, the number of words in R=sqrt(OP)
+ is exactly half as many as in OP, and the most significant bit of R
+ is set.
+
+ Also, the initial approximation is simplified by this up-shifted OP.
+
+ Finally, the Newtonian iteration which is the main part of this
+ program performs division by R. The fast division routine expects
+ the divisor to be "normalized" in exactly the sense of having the
+ most significant bit set. */
+
+ tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+
+ if ((cnt & ~1) % BITS_PER_MP_LIMB != 0)
+ t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
+ (cnt & ~1) % BITS_PER_MP_LIMB);
+ else
+ MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size);
+
+ /* Clear the extra low limb introduced by the odd-size padding. */
+ if (cnt >= BITS_PER_MP_LIMB)
+ tp[0] = 0;
+
+ t_high0 = tp[tsize - 1];
+ t_high1 = tp[tsize - 2]; /* Never stray. TSIZE is >= 2. */
+
+/* Is there a fast sqrt instruction defined for this machine? */
+#ifdef SQRT
+ {
+ initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1);
+ /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
+ become incorrect due to overflow in the conversion from double to
+ mp_limb_t above. It will typically be zero in that case, but might be
+ a small number on some machines. The most significant bit of
+ INITIAL_APPROX should be set, so that bit is a good overflow
+ indication. */
+ if ((mp_limb_signed_t) initial_approx >= 0)
+ initial_approx = ~(mp_limb_t)0;
+ }
+#else
+ /* Get a 9 bit approximation from the tables. The tables expect to
+ be indexed with the 8 high bits right below the highest bit.
+ Also, the highest result bit is not returned by the tables, and
+ must be or:ed into the result. The scheme gives 9 bits of start
+ approximation with just 256-entry 8 bit tables. */
+
+ if ((cnt & 1) == 0)
+ {
+ /* The most significant bit of t_high0 is set. */
+ initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
+ initial_approx &= 0xff;
+ initial_approx = even_approx_tab[initial_approx];
+ }
+ else
+ {
+ /* The most significant bit of t_high0 is unset,
+ the second most significant is set. */
+ initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
+ initial_approx &= 0xff;
+ initial_approx = odd_approx_tab[initial_approx];
+ }
+ initial_approx |= 0x100;
+ initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
+
+ /* Perform small precision Newtonian iterations to get a full word
+ approximation. For small operands, these iterations will do the
+ entire job. */
+ if (t_high0 == ~(mp_limb_t)0)
+ initial_approx = t_high0;
+ else
+ {
+ mp_limb_t quot;
+
+ /* Keep the approximation strictly above the true root so the
+ averaging steps below converge from above. */
+ if (t_high0 >= initial_approx)
+ initial_approx = t_high0 + 1;
+
+ /* First get about 18 bits with pure C arithmetics. */
+ quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
+ initial_approx = (initial_approx + quot) / 2;
+ initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+
+ /* Now get a full word by one (or for > 36 bit machines) several
+ iterations. */
+ for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1)
+ {
+ mp_limb_t ignored_remainder;
+
+ udiv_qrnnd (quot, ignored_remainder,
+ t_high0, t_high1, initial_approx);
+ initial_approx = (initial_approx + quot) / 2;
+ initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+ }
+ }
+#endif
+
+ rp[0] = initial_approx;
+ rsize = 1;
+
+#ifdef SQRT_DEBUG
+ printf ("\n\nT = ");
+ mpn_dump (tp, tsize);
+#endif
+
+ if (tsize > 2)
+ {
+ /* Determine the successive precisions to use in the iteration. We
+ minimize the precisions, beginning with the highest (i.e. last
+ iteration) to the lowest (i.e. first iteration). */
+
+ xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+ ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+
+ t_end_ptr = tp + tsize;
+
+ /* Build the precision schedule in tsizes[]; after this loop i is
+ the number of multi-precision iterations to run. */
+ tmp = tsize / 2;
+ for (i = 0;; i++)
+ {
+ tsize = (tmp + 1) / 2;
+ if (tmp == tsize)
+ break;
+ tsizes[i] = tsize + tmp;
+ tmp = tsize;
+ }
+
+ /* Main Newton iteration loop. For big arguments, most of the
+ time is spent here. */
+
+ /* It is possible to do a great optimization here. The successive
+ divisors in the mpn_divmod call below have more and more leading
+ words equal to its predecessor. Therefore the beginning of
+ each division will repeat the same work as did the last
+ division. If we could guarantee that the leading words of two
+ consecutive divisors are the same (i.e. in this case, a later
+ divisor has just more digits at the end) it would be a simple
+ matter of just using the old remainder of the last division in
+ a subsequent division, to take care of this optimization. This
+ idea would surely make a difference even for small arguments. */
+
+ /* Loop invariants:
+
+ R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
+ X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
+ R <= shiftdown_to_same_size(X). */
+
+ while (--i >= 0)
+ {
+ mp_limb_t cy;
+#ifdef SQRT_DEBUG
+ mp_limb_t old_least_sign_r = rp[0];
+ mp_size_t old_rsize = rsize;
+
+ printf ("R = ");
+ mpn_dump (rp, rsize);
+#endif
+ tsize = tsizes[i];
+
+ /* Need to copy the numerator into temporary space, as
+ mpn_divmod overwrites its numerator argument with the
+ remainder (which we currently ignore). */
+ MPN_COPY (ttp, t_end_ptr - tsize, tsize);
+ cy = mpn_divmod (xp, ttp, tsize, rp, rsize);
+ xsize = tsize - rsize;
+
+#ifdef SQRT_DEBUG
+ printf ("X =%d ", cy);
+ mpn_dump (xp, xsize);
+#endif
+
+ /* Add X and R with the most significant limbs aligned,
+ temporarily ignoring at least one limb at the low end of X. */
+ tmp = xsize - rsize;
+ cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);
+
+ /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
+ intermediate roots that'd need an extra bit. We don't want to
+ handle that since it would make the subsequent divisor
+ non-normalized, so round such roots down to be only ones in the
+ current precision. */
+ if (cy == 2)
+ {
+ mp_size_t j;
+ /* NOTE(review): this writes xp[xsize] too (xsize+1 limbs);
+ xp was allocated with the full tsize > xsize limbs, so the
+ store stays within the allocation. */
+ for (j = xsize; j >= 0; j--)
+ xp[j] = ~(mp_limb_t)0;
+ }
+
+ /* Divide X by 2 and put the result in R. This is the new
+ approximation. Shift in the carry from the addition. */
+ mpn_rshift (rp, xp, xsize, 1);
+ rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
+ rsize = xsize;
+#ifdef SQRT_DEBUG
+ if (old_least_sign_r != rp[rsize - old_rsize])
+ printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",
+ i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,
+ 2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);
+#endif
+ }
+ }
+
+#ifdef SQRT_DEBUG
+ printf ("(final) R = ");
+ mpn_dump (rp, rsize);
+#endif
+
+ /* We computed the square root of OP * 2**(2*floor(cnt/2)).
+ This has resulted in R being 2**floor(cnt/2) to large.
+ Shift it down here to fix that. */
+ if (cnt / 2 != 0)
+ {
+ mpn_rshift (rp, rp, rsize, cnt/2);
+ rsize -= rp[rsize - 1] == 0;
+ }
+
+ /* Calculate the remainder. */
+ mpn_mul_n (tp, rp, rp, rsize);
+ tsize = rsize + rsize;
+ tsize -= tp[tsize - 1] == 0;
+ if (op_size < tsize
+ || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
+ {
+ /* R is too large. Decrement it. */
+
+ /* These operations can't overflow. */
+ cy_limb = mpn_sub_n (tp, tp, rp, rsize);
+ cy_limb += mpn_sub_n (tp, tp, rp, rsize);
+ mpn_decr_u (tp + rsize, cy_limb);
+ mpn_incr_u (tp, (mp_limb_t) 1);
+
+ mpn_decr_u (rp, (mp_limb_t) 1);
+
+#ifdef SQRT_DEBUG
+ printf ("(adjusted) R = ");
+ mpn_dump (rp, rsize);
+#endif
+ }
+
+ if (rem_ptr != NULL)
+ {
+ /* Remainder = OP - R^2; normalize away high zero limbs. */
+ cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
+ MPN_NORMALIZE (rem_ptr, op_size);
+ TMP_FREE (marker);
+ return op_size;
+ }
+ else
+ {
+ /* Perfect-square test only: non-zero iff OP != R^2. */
+ int res;
+ res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size);
+ TMP_FREE (marker);
+ return res;
+ }
+}
diff --git a/rts/gmp/mpn/generic/sub_n.c b/rts/gmp/mpn/generic/sub_n.c
new file mode 100644
index 0000000000..4f2f06099c
--- /dev/null
+++ b/rts/gmp/mpn/generic/sub_n.c
@@ -0,0 +1,62 @@
+/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+#if __STDC__
+mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
+#else
+mpn_sub_n (res_ptr, s1_ptr, s2_ptr, size)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_srcptr s2_ptr;
+ mp_size_t size;
+#endif
+{
+ register mp_limb_t x, y, cy;
+ register mp_size_t j;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ s2_ptr -= j;
+ res_ptr -= j;
+
+ cy = 0;
+ do
+ {
+ y = s2_ptr[j];
+ x = s1_ptr[j];
+ y += cy; /* add previous carry to subtrahend */
+ cy = (y < cy); /* get out carry from that addition */
+ y = x - y; /* main subtract */
+ cy = (y > x) + cy; /* get out carry from the subtract, combine */
+ res_ptr[j] = y;
+ }
+ while (++j != 0);
+
+ return cy;
+}
diff --git a/rts/gmp/mpn/generic/submul_1.c b/rts/gmp/mpn/generic/submul_1.c
new file mode 100644
index 0000000000..c7c08ee4af
--- /dev/null
+++ b/rts/gmp/mpn/generic/submul_1.c
@@ -0,0 +1,65 @@
+/* mpn_submul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
+ by S2_LIMB, subtract the S1_SIZE least significant limbs of the product
+ from the limb vector pointed to by RES_PTR. Return the most significant
+ limb of the product, adjusted for carry-out from the subtraction.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_submul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ mp_size_t s1_size;
+ register mp_limb_t s2_limb;
+{
+ register mp_limb_t cy_limb;
+ register mp_size_t j;
+ register mp_limb_t prod_high, prod_low;
+ register mp_limb_t x;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -s1_size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ res_ptr -= j;
+ s1_ptr -= j;
+
+ cy_limb = 0;
+ do
+ {
+ umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb) + prod_high;
+
+ x = res_ptr[j];
+ prod_low = x - prod_low;
+ cy_limb += (prod_low > x);
+ res_ptr[j] = prod_low;
+ }
+ while (++j != 0);
+
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/tdiv_qr.c b/rts/gmp/mpn/generic/tdiv_qr.c
new file mode 100644
index 0000000000..b748b5d810
--- /dev/null
+++ b/rts/gmp/mpn/generic/tdiv_qr.c
@@ -0,0 +1,401 @@
+/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and
+ write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp. If
+ qxn is non-zero, generate that many fraction limbs and append them after the
+ other quotient limbs, and update the remainder accordingly. The input
+ operands are unaffected.
+
+ Preconditions:
+ 1. The most significant limb of the divisor must be non-zero.
+ 2. No argument overlap is permitted. (??? relax this ???)
+ 3. nn >= dn, even if qxn is non-zero. (??? relax this ???)
+
+ The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time
+ complexity of multiplication.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD)
+#endif
+
+/* Extract the middle limb from ((h,,l) << cnt) */
+#define SHL(h,l,cnt) \
+ ((h << cnt) | ((l >> 1) >> ((~cnt) & (BITS_PER_MP_LIMB - 1))))
+
+void
+#if __STDC__
+mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
+ mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
+#else
+mpn_tdiv_qr (qp, rp, qxn, np, nn, dp, dn)
+ mp_ptr qp;
+ mp_ptr rp;
+ mp_size_t qxn;
+ mp_srcptr np;
+ mp_size_t nn;
+ mp_srcptr dp;
+ mp_size_t dn;
+#endif
+{
+ /* FIXME:
+ 1. qxn
+ 2. pass allocated storage in additional parameter?
+ */
+ if (qxn != 0)
+ abort ();
+
+ switch (dn)
+ {
+ case 0:
+ DIVIDE_BY_ZERO;
+
+ case 1:
+ {
+ rp[0] = mpn_divmod_1 (qp, np, nn, dp[0]);
+ return;
+ }
+
+ case 2:
+ {
+ int cnt;
+ mp_ptr n2p, d2p;
+ mp_limb_t qhl, cy;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ count_leading_zeros (cnt, dp[dn - 1]);
+ if (cnt != 0)
+ {
+ d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_lshift (d2p, dp, dn, cnt);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np, nn, cnt);
+ n2p[nn] = cy;
+ qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p);
+ if (cy == 0)
+ qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */
+ }
+ else
+ {
+ d2p = (mp_ptr) dp;
+ n2p = (mp_ptr) TMP_ALLOC (nn * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np, nn);
+ qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */
+ }
+
+ if (cnt != 0)
+ mpn_rshift (rp, n2p, dn, cnt);
+ else
+ MPN_COPY (rp, n2p, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ default:
+ {
+ int adjust;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */
+ if (nn + adjust >= 2 * dn)
+ {
+ mp_ptr n2p, d2p;
+ mp_limb_t cy;
+ int cnt;
+ count_leading_zeros (cnt, dp[dn - 1]);
+
+ qp[nn - dn] = 0; /* zero high quotient limb */
+ if (cnt != 0) /* normalize divisor if needed */
+ {
+ d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_lshift (d2p, dp, dn, cnt);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np, nn, cnt);
+ n2p[nn] = cy;
+ nn += adjust;
+ }
+ else
+ {
+ d2p = (mp_ptr) dp;
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np, nn);
+ n2p[nn] = 0;
+ nn += adjust;
+ }
+
+ if (dn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ else if (dn < BZ_THRESHOLD)
+ mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
+ else
+ {
+ /* Perform 2*dn / dn limb divisions as long as the limbs
+ in np last. */
+ mp_ptr q2p = qp + nn - 2 * dn;
+ n2p += nn - 2 * dn;
+ mpn_bz_divrem_n (q2p, n2p, d2p, dn);
+ nn -= dn;
+ while (nn >= 2 * dn)
+ {
+ mp_limb_t c;
+ q2p -= dn; n2p -= dn;
+ c = mpn_bz_divrem_n (q2p, n2p, d2p, dn);
+ ASSERT_ALWAYS (c == 0);
+ nn -= dn;
+ }
+
+ if (nn != dn)
+ {
+ n2p -= nn - dn;
+ /* In theory, we could fall out to the cute code below
+ since we now have exactly the situation that code
+ is designed to handle. We botch this badly and call
+ the basic mpn_sb_divrem_mn! */
+ if (dn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ else
+ mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
+ }
+ }
+
+
+ if (cnt != 0)
+ mpn_rshift (rp, n2p, dn, cnt);
+ else
+ MPN_COPY (rp, n2p, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ /* When we come here, the numerator/partial remainder is less
+ than twice the size of the denominator. */
+
+ {
+ /* Problem:
+
+ Divide a numerator N with nn limbs by a denominator D with dn
+ limbs forming a quotient of nn-dn+1 limbs. When qn is small
+ compared to dn, conventional division algorithms perform poorly.
+ We want an algorithm that has an expected running time that is
+ dependent only on qn. It is assumed that the most significant
+ limb of the numerator is smaller than the most significant limb
+ of the denominator.
+
+ Algorithm (very informally stated):
+
+ 1) Divide the 2 x qn most significant limbs from the numerator
+ by the qn most significant limbs from the denominator. Call
+ the result qest. This is either the correct quotient, but
+ might be 1 or 2 too large. Compute the remainder from the
+ division. (This step is implemented by a mpn_divrem call.)
+
+ 2) Is the most significant limb from the remainder < p, where p
+ is the product of the most significant limb from the quotient
+ and the next(d). (Next(d) denotes the next ignored limb from
+ the denominator.) If it is, decrement qest, and adjust the
+ remainder accordingly.
+
+ 3) Is the remainder >= qest? If it is, qest is the desired
+ quotient. The algorithm terminates.
+
+ 4) Subtract qest x next(d) from the remainder. If there is
+ borrow out, decrement qest, and adjust the remainder
+ accordingly.
+
+ 5) Skip one word from the denominator (i.e., let next(d) denote
+ the next less significant limb. */
+
+ mp_size_t qn;
+ mp_ptr n2p, d2p;
+ mp_ptr tp;
+ mp_limb_t cy;
+ mp_size_t in, rn;
+ mp_limb_t quotient_too_large;
+ int cnt;
+
+ qn = nn - dn;
+ qp[qn] = 0; /* zero high quotient limb */
+ qn += adjust; /* qn cannot become bigger */
+
+ if (qn == 0)
+ {
+ MPN_COPY (rp, np, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ in = dn - qn; /* (at least partially) ignored # of limbs in ops */
+ /* Normalize denominator by shifting it to the left such that its
+ most significant bit is set. Then shift the numerator the same
+ amount, to mathematically preserve quotient. */
+ count_leading_zeros (cnt, dp[dn - 1]);
+ if (cnt != 0)
+ {
+ d2p = (mp_ptr) TMP_ALLOC (qn * BYTES_PER_MP_LIMB);
+
+ mpn_lshift (d2p, dp + in, qn, cnt);
+ d2p[0] |= dp[in - 1] >> (BITS_PER_MP_LIMB - cnt);
+
+ n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt);
+ if (adjust)
+ {
+ n2p[2 * qn] = cy;
+ n2p++;
+ }
+ else
+ {
+ n2p[0] |= np[nn - 2 * qn - 1] >> (BITS_PER_MP_LIMB - cnt);
+ }
+ }
+ else
+ {
+ d2p = (mp_ptr) dp + in;
+
+ n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn);
+ if (adjust)
+ {
+ n2p[2 * qn] = 0;
+ n2p++;
+ }
+ }
+
+ /* Get an approximate quotient using the extracted operands. */
+ if (qn == 1)
+ {
+ mp_limb_t q0, r0;
+ mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0;
+ /* Due to a gcc 2.7.2.3 reload pass bug, we have to use some
+ temps here. This doesn't hurt code quality on any machines
+ so we do it unconditionally. */
+ gcc272bug_n1 = n2p[1];
+ gcc272bug_n0 = n2p[0];
+ gcc272bug_d0 = d2p[0];
+ udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0, gcc272bug_d0);
+ n2p[0] = r0;
+ qp[0] = q0;
+ }
+ else if (qn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, 4L, d2p);
+ else if (qn < BZ_THRESHOLD)
+ mpn_sb_divrem_mn (qp, n2p, qn * 2, d2p, qn);
+ else
+ mpn_bz_divrem_n (qp, n2p, d2p, qn);
+
+ rn = qn;
+ /* Multiply the first ignored divisor limb by the most significant
+ quotient limb. If that product is > the partial remainder's
+ most significant limb, we know the quotient is too large. This
+ test quickly catches most cases where the quotient is too large;
+ it catches all cases where the quotient is 2 too large. */
+ {
+ mp_limb_t dl, x;
+ mp_limb_t h, l;
+
+ if (in - 2 < 0)
+ dl = 0;
+ else
+ dl = dp[in - 2];
+
+ x = SHL (dp[in - 1], dl, cnt);
+ umul_ppmm (h, l, x, qp[qn - 1]);
+
+ if (n2p[qn - 1] < h)
+ {
+ mp_limb_t cy;
+
+ mpn_decr_u (qp, (mp_limb_t) 1);
+ cy = mpn_add_n (n2p, n2p, d2p, qn);
+ if (cy)
+ {
+ /* The partial remainder is safely large. */
+ n2p[qn] = cy;
+ ++rn;
+ }
+ }
+ }
+
+ quotient_too_large = 0;
+ if (cnt != 0)
+ {
+ mp_limb_t cy1, cy2;
+
+ /* Append partially used numerator limb to partial remainder. */
+ cy1 = mpn_lshift (n2p, n2p, rn, BITS_PER_MP_LIMB - cnt);
+ n2p[0] |= np[in - 1] & (~(mp_limb_t) 0 >> cnt);
+
+ /* Update partial remainder with partially used divisor limb. */
+ cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (~(mp_limb_t) 0 >> cnt));
+ if (qn != rn)
+ {
+ if (n2p[qn] < cy2)
+ abort ();
+ n2p[qn] -= cy2;
+ }
+ else
+ {
+ n2p[qn] = cy1 - cy2;
+
+ quotient_too_large = (cy1 < cy2);
+ ++rn;
+ }
+ --in;
+ }
+ /* True: partial remainder now is neutral, i.e., it is not shifted up. */
+
+ tp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+
+ if (in < qn)
+ {
+ if (in == 0)
+ {
+ MPN_COPY (rp, n2p, rn);
+ if (rn != dn)
+ abort ();
+ goto foo;
+ }
+ mpn_mul (tp, qp, qn, dp, in);
+ }
+ else
+ mpn_mul (tp, dp, in, qp, qn);
+
+ cy = mpn_sub (n2p, n2p, rn, tp + in, qn);
+ MPN_COPY (rp + in, n2p, dn - in);
+ quotient_too_large |= cy;
+ cy = mpn_sub_n (rp, np, tp, in);
+ cy = mpn_sub_1 (rp + in, rp + in, rn, cy);
+ quotient_too_large |= cy;
+ foo:
+ if (quotient_too_large)
+ {
+ mpn_decr_u (qp, (mp_limb_t) 1);
+ mpn_add_n (rp, rp, dp, dn);
+ }
+ }
+ TMP_FREE (marker);
+ return;
+ }
+ }
+}
diff --git a/rts/gmp/mpn/generic/udiv_w_sdiv.c b/rts/gmp/mpn/generic/udiv_w_sdiv.c
new file mode 100644
index 0000000000..061cce86e1
--- /dev/null
+++ b/rts/gmp/mpn/generic/udiv_w_sdiv.c
@@ -0,0 +1,131 @@
+/* mpn_udiv_w_sdiv -- implement udiv_qrnnd on machines with only signed
+ division.
+
+ Contributed by Peter L. Montgomery.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY SAFE
+ TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THIS FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE
+ GNU MP RELEASE.
+
+
+Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_udiv_w_sdiv (rp, a1, a0, d)
+ mp_limb_t *rp, a1, a0, d;
+{
+ mp_limb_t q, r;
+ mp_limb_t c0, c1, b1;
+
+ if ((mp_limb_signed_t) d >= 0)
+ {
+ if (a1 < d - a1 - (a0 >> (BITS_PER_MP_LIMB - 1)))
+ {
+ /* dividend, divisor, and quotient are nonnegative */
+ sdiv_qrnnd (q, r, a1, a0, d);
+ }
+ else
+ {
+ /* Compute c1*2^32 + c0 = a1*2^32 + a0 - 2^31*d */
+ sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (BITS_PER_MP_LIMB - 1));
+ /* Divide (c1*2^32 + c0) by d */
+ sdiv_qrnnd (q, r, c1, c0, d);
+ /* Add 2^31 to quotient */
+ q += (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+ }
+ }
+ else
+ {
+ b1 = d >> 1; /* d/2, between 2^30 and 2^31 - 1 */
+ c1 = a1 >> 1; /* A/2 */
+ c0 = (a1 << (BITS_PER_MP_LIMB - 1)) + (a0 >> 1);
+
+ if (a1 < b1) /* A < 2^32*b1, so A/2 < 2^31*b1 */
+ {
+ sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+ r = 2*r + (a0 & 1); /* Remainder from A/(2*b1) */
+ if ((d & 1) != 0)
+ {
+ if (r >= q)
+ r = r - q;
+ else if (q - r <= d)
+ {
+ r = r - q + d;
+ q--;
+ }
+ else
+ {
+ r = r - q + 2*d;
+ q -= 2;
+ }
+ }
+ }
+ else if (c1 < b1) /* So 2^31 <= (A/2)/b1 < 2^32 */
+ {
+ c1 = (b1 - 1) - c1;
+ c0 = ~c0; /* logical NOT */
+
+ sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+ q = ~q; /* (A/2)/b1 */
+ r = (b1 - 1) - r;
+
+ r = 2*r + (a0 & 1); /* A/(2*b1) */
+
+ if ((d & 1) != 0)
+ {
+ if (r >= q)
+ r = r - q;
+ else if (q - r <= d)
+ {
+ r = r - q + d;
+ q--;
+ }
+ else
+ {
+ r = r - q + 2*d;
+ q -= 2;
+ }
+ }
+ }
+ else /* Implies c1 = b1 */
+ { /* Hence a1 = d - 1 = 2*b1 - 1 */
+ if (a0 >= -d)
+ {
+ q = -1;
+ r = a0 + d;
+ }
+ else
+ {
+ q = -2;
+ r = a0 + 2*d;
+ }
+ }
+ }
+
+ *rp = r;
+ return q;
+}
diff --git a/rts/gmp/mpn/hppa/README b/rts/gmp/mpn/hppa/README
new file mode 100644
index 0000000000..97e7abe011
--- /dev/null
+++ b/rts/gmp/mpn/hppa/README
@@ -0,0 +1,91 @@
+This directory contains mpn functions for various HP PA-RISC chips. Code
+that runs faster on the PA7100 and later implementations, is in the pa7100
+directory.
+
+RELEVANT OPTIMIZATION ISSUES
+
+ Load and Store timing
+
+On the PA7000 no memory instructions can issue the two cycles after a store.
+For the PA7100, this is reduced to one cycle.
+
+The PA7100 has a lockup-free cache, so it helps to schedule loads and the
+dependent instruction really far from each other.
+
+STATUS
+
+1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the
+ instructions below (but some sw pipelining is needed to avoid the
+ xmpyu-fstds delay):
+
+ fldds s1_ptr
+
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+
+ addib Loop
+
+2. mpn_addmul_1 could be improved from the current 10 to 7.5 cycles/limb
+ (asymptotically) on the PA7100, using the instructions below. With proper
+ sw pipelining and the unrolling level below, the speed becomes 8
+ cycles/limb.
+
+ fldds s1_ptr
+ fldds s1_ptr
+
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ addc
+ addc
+ addc
+ addc
+ addc %r0,%r0,cy-limb
+
+ ldws res_ptr
+ ldws res_ptr
+ ldws res_ptr
+ ldws res_ptr
+ add
+ stws res_ptr
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+
+ addib
+
+3. For the PA8000 we have to stick to using 32-bit limbs before compiler
+ support emerges. But we want to use 64-bit operations whenever possible,
+ in particular for loads and stores. It is possible to handle mpn_add_n
+ efficiently by rotating (when s1/s2 are aligned), masking+bit field
+ inserting when (they are not). The speed should double compared to the
+ code used today.
diff --git a/rts/gmp/mpn/hppa/add_n.s b/rts/gmp/mpn/hppa/add_n.s
new file mode 100644
index 0000000000..c53b2f71b3
--- /dev/null
+++ b/rts/gmp/mpn/hppa/add_n.s
@@ -0,0 +1,58 @@
+; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless. We can't come under 5 cycles/limb anyway.
+
+ .code
+ .export __gmpn_add_n
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,= -1,%r23,L$end ; check for (SIZE == 1)
+ add %r20,%r19,%r28 ; add first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,<> -1,%r23,L$loop
+ addc %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r0,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/gmp-mparam.h b/rts/gmp/mpn/hppa/gmp-mparam.h
new file mode 100644
index 0000000000..98b6d9ce3c
--- /dev/null
+++ b/rts/gmp/mpn/hppa/gmp-mparam.h
@@ -0,0 +1,63 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the PA7100 using GCC. */
+/* Generated by tuneup.c, 2000-07-25. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 30
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 172
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 59
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 185
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 96
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 122
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 18
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 33
+#endif
diff --git a/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s
new file mode 100644
index 0000000000..c7d218f922
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s
@@ -0,0 +1,102 @@
+; HP-PA-1.1 __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 11 cycles/limb on a PA7000. With the used instructions, it
+; can not become faster due to data cache contention after a store. On the
+; PA7100 it runs at 10 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that applies to this code too.
+
+ .code
+ .export __gmpn_addmul_1
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop ldws 0(%r26),%r29
+ fldws,ma 4(%r25),%fr5
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addc %r0,%r28,%r28
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end ldw 0(%r26),%r29
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ ldws 0(%r26),%r29
+ addc %r0,%r28,%r28
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ ldw 0(%r26),%r29
+ fstds %fr6,-16(%r30)
+ ldw -12(%r30),%r1
+ ldw -16(%r30),%r28
+ add %r29,%r1,%r19
+ stw %r19,0(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/mul_1.s b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s
new file mode 100644
index 0000000000..4512fddec9
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s
@@ -0,0 +1,98 @@
+; HP-PA-1.1 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 9 cycles/limb on a PA7000. With the used instructions, it can
+; not become faster due to data cache contention after a store. On the
+; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since
+; only the xmpyu does not need the integer pipeline, so the only dual-issue
+; we will get are addc+xmpyu. Unrolling would not help either CPU.
+
+; We could use fldds to read two limbs at a time from the S1 array, and that
+; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
+; PA7100, respectively. We don't do that since it does not seem worth the
+; (alignment) troubles...
+
+; At least the PA7100 is rumored to be able to deal with cache-misses
+; without stalling instruction issue. If this is true, and the cache is
+; actually also lockup-free, we should use a deeper software pipeline, and
+; load from S1 very early! (The loads and stores to -12(sp) will surely be
+; in the cache.)
+
+ .code
+ .export __gmpn_mul_1
+__gmpn_mul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop fldws,ma 4(%r25),%fr5
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ stws,ma %r19,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ fstds %fr6,-16(%r30)
+ ldw -16(%r30),%r28
+ ldo -64(%r30),%r30
+ bv 0(%r2)
+ fstws %fr6R,0(%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s
new file mode 100644
index 0000000000..4f4be08b37
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s
@@ -0,0 +1,75 @@
+; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+; This is optimized for the PA7100, where it runs at 4.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+ .code
+ .export __gmpn_add_n
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L$rest
+ add %r20,%r19,%r28 ; add first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L$loop
+ addc %r20,%r19,%r28
+
+L$rest addib,= 4,%r23,L$end
+ nop
+L$eloop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L$eloop
+ addc %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r0,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S
new file mode 100644
index 0000000000..04db06822e
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S
@@ -0,0 +1,189 @@
+; HP-PA 7100/7200 __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define res_ptr %r26
+#define s1_ptr %r25
+#define size %r24
+#define s2_limb %r23
+
+#define cylimb %r28
+#define s0 %r19
+#define s1 %r20
+#define s2 %r3
+#define s3 %r4
+#define lo0 %r21
+#define lo1 %r5
+#define lo2 %r6
+#define lo3 %r7
+#define hi0 %r22
+#define hi1 %r23 /* safe to reuse */
+#define hi2 %r29
+#define hi3 %r1
+
+ .code
+ .export __gmpn_addmul_1
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb ; clear cy and cylimb
+ addib,< -4,size,L$few_limbs
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L$0
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ add s0,lo0,s0
+ addib,< -1,size,L$few_limbs
+ stws,ma s0,4(res_ptr)
+
+; start software pipeline ----------------------------------------------------
+L$0 fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size,L$end
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; main loop ------------------------------------------------------------------
+L$loop fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ add s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ addc s1,lo1,s1
+ fstds %fr9,0(%r31)
+ addc s2,lo2,s2
+ fstds %fr10,8(%r31)
+ addc s3,lo3,s3
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size,L$loop
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; finish software pipeline ---------------------------------------------------
+L$end ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addc s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ addc s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ addc s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+
+; restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+L$few_limbs
+ addib,=,n 4,size,L$ret
+L$loop2 fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size,L$loop2
+ nop
+
+L$ret addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s
new file mode 100644
index 0000000000..31669b1a55
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s
@@ -0,0 +1,83 @@
+; HP-PA __gmpn_lshift --
+; This is optimized for the PA7100, where is runs at 3.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_lshift
+__gmpn_lshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ sh2add %r24,%r25,%r25
+ sh2add %r24,%r26,%r26
+ ldws,mb -4(0,%r25),%r22
+ subi 32,%r23,%r1
+ mtsar %r1
+ addib,= -1,%r24,L$0004
+ vshd %r0,%r22,%r28 ; compute carry out limb
+ ldws,mb -4(0,%r25),%r29
+ addib,<= -5,%r24,L$rest
+ vshd %r22,%r29,%r20
+
+L$loop ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -4,%r24,L$loop
+ vshd %r22,%r29,%r20
+
+L$rest addib,= 4,%r24,L$end1
+ nop
+L$eloop ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ addib,<= -1,%r24,L$end2
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -1,%r24,L$eloop
+ vshd %r22,%r29,%r20
+
+L$end1 stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+L$end2 stws,mb %r20,-4(0,%r26)
+L$0004 vshd %r22,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s
new file mode 100644
index 0000000000..d32b10b4b1
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s
@@ -0,0 +1,80 @@
+; HP-PA __gmpn_rshift --
+; This is optimized for the PA7100, where is runs at 3.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_rshift
+__gmpn_rshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r22
+ mtsar %r23
+ addib,= -1,%r24,L$0004
+ vshd %r22,%r0,%r28 ; compute carry out limb
+ ldws,ma 4(0,%r25),%r29
+ addib,<= -5,%r24,L$rest
+ vshd %r29,%r22,%r20
+
+L$loop ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -4,%r24,L$loop
+ vshd %r29,%r22,%r20
+
+L$rest addib,= 4,%r24,L$end1
+ nop
+L$eloop ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ addib,<= -1,%r24,L$end2
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -1,%r24,L$eloop
+ vshd %r29,%r22,%r20
+
+L$end1 stws,ma %r20,4(0,%r26)
+ vshd %r0,%r29,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+L$end2 stws,ma %r20,4(0,%r26)
+L$0004 vshd %r0,%r22,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s
new file mode 100644
index 0000000000..0eec41c4b3
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s
@@ -0,0 +1,76 @@
+; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+; This is optimized for the PA7100, where is runs at 4.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+ .code
+ .export __gmpn_sub_n
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L$rest
+ sub %r20,%r19,%r28 ; subtract first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L$loop
+ subb %r20,%r19,%r28
+
+L$rest addib,= 4,%r23,L$end
+ nop
+L$eloop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L$eloop
+ subb %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ addc %r0,%r0,%r28
+ bv 0(%r2)
+ subi 1,%r28,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S
new file mode 100644
index 0000000000..0fba21dcef
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S
@@ -0,0 +1,195 @@
+; HP-PA 7100/7200 __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define res_ptr %r26
+#define s1_ptr %r25
+#define size %r24
+#define s2_limb %r23
+
+#define cylimb %r28
+#define s0 %r19
+#define s1 %r20
+#define s2 %r3
+#define s3 %r4
+#define lo0 %r21
+#define lo1 %r5
+#define lo2 %r6
+#define lo3 %r7
+#define hi0 %r22
+#define hi1 %r23 /* safe to reuse */
+#define hi2 %r29
+#define hi3 %r1
+
+ .code
+ .export __gmpn_submul_1
+__gmpn_submul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb ; clear cy and cylimb
+ addib,< -4,size,L$few_limbs
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L$0
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ sub s0,lo0,s0
+ add s0,lo0,%r0 ; invert cy
+ addib,< -1,size,L$few_limbs
+ stws,ma s0,4(res_ptr)
+
+; start software pipeline ----------------------------------------------------
+L$0 fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size,L$end
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; main loop ------------------------------------------------------------------
+L$loop fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ sub s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ subb s1,lo1,s1
+ fstds %fr9,0(%r31)
+ subb s2,lo2,s2
+ fstds %fr10,8(%r31)
+ subb s3,lo3,s3
+ subb %r0,%r0,lo0 ; these two insns ...
+ add lo0,lo0,%r0 ; ... just invert cy
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size,L$loop
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; finish software pipeline ---------------------------------------------------
+L$end ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ sub s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ subb s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ subb s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ subb s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+ subb %r0,%r0,lo0 ; these two insns ...
+ add lo0,lo0,%r0 ; ... invert cy
+
+; restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+L$few_limbs
+ addib,=,n 4,size,L$ret
+L$loop2 fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ sub s0,lo0,s0
+ add s0,lo0,%r0 ; invert cy
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size,L$loop2
+ nop
+
+L$ret addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/submul_1.s b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s
new file mode 100644
index 0000000000..20a5b5ce0a
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s
@@ -0,0 +1,111 @@
+; HP-PA-1.1 __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 12 cycles/limb on a PA7000. With the used instructions, it
+; can not become faster due to data cache contention after a store. On the
+; PA7100 it runs at 11 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that applies to this code too.
+
+; It seems possible to make this run as fast as __gmpn_addmul_1, if we use
+; sub,>>= %r29,%r19,%r22
+; addi 1,%r28,%r28
+; but that requires reworking the hairy software pipeline...
+
+ .code
+ .export __gmpn_submul_1
+__gmpn_submul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop ldws 0(%r26),%r29
+ fldws,ma 4(%r25),%fr5
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addc %r0,%r28,%r28
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end ldw 0(%r26),%r29
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ ldws 0(%r26),%r29
+ addc %r0,%r28,%r28
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ ldw 0(%r26),%r29
+ fstds %fr6,-16(%r30)
+ ldw -12(%r30),%r1
+ ldw -16(%r30),%r28
+ sub %r29,%r1,%r22
+ add %r22,%r1,%r0
+ stw %r22,0(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S
new file mode 100644
index 0000000000..b83d6f4dd2
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S
@@ -0,0 +1,80 @@
+; HP-PA __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on PA 7000 and later.
+
+; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr gr26
+; n1 gr25
+; n0 gr24
+; d gr23
+
+ .code
+L$0000 .word 0x43f00000 ; 2^64
+ .word 0x0
+ .export __gmpn_udiv_qrnnd
+__gmpn_udiv_qrnnd
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+ ldo 64(%r30),%r30
+
+ stws %r25,-16(0,%r30) ; n_hi
+ stws %r24,-12(0,%r30) ; n_lo
+#ifdef PIC
+ addil LT%L$0000,%r19
+ ldo RT%L$0000(%r1),%r19
+#else
+ ldil L%L$0000,%r19
+ ldo R%L$0000(%r19),%r19
+#endif
+ fldds -16(0,%r30),%fr5
+ stws %r23,-12(0,%r30)
+ comib,<= 0,%r25,L$1
+ fcnvxf,dbl,dbl %fr5,%fr5
+ fldds 0(0,%r19),%fr4
+ fadd,dbl %fr4,%fr5,%fr5
+L$1
+ fcpy,sgl %fr0,%fr6L
+ fldws -12(0,%r30),%fr6R
+ fcnvxf,dbl,dbl %fr6,%fr4
+
+ fdiv,dbl %fr5,%fr4,%fr5
+
+ fcnvfx,dbl,dbl %fr5,%fr4
+ fstws %fr4R,-16(%r30)
+ xmpyu %fr4R,%fr6R,%fr6
+ ldws -16(%r30),%r28
+ fstds %fr6,-16(0,%r30)
+ ldws -12(0,%r30),%r21
+ ldws -16(0,%r30),%r20
+ sub %r24,%r21,%r22
+ subb %r25,%r20,%r19
+ comib,= 0,%r19,L$2
+ ldo -64(%r30),%r30
+
+ add %r22,%r23,%r22
+ ldo -1(%r28),%r28
+L$2 bv 0(%r2)
+ stws %r22,0(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/umul.s b/rts/gmp/mpn/hppa/hppa1_1/umul.s
new file mode 100644
index 0000000000..1f1300ac9b
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/umul.s
@@ -0,0 +1,42 @@
+; Copyright (C) 1999 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+ .code
+ .export __umul_ppmm
+ .align 4
+__umul_ppmm
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ stw %r25,-16(0,%r30)
+ fldws -16(0,%r30),%fr22R
+ stw %r24,-16(0,%r30)
+ fldws -16(0,%r30),%fr22L
+ xmpyu %fr22R,%fr22L,%fr22
+ fstds %fr22,-16(0,%r30)
+ ldw -16(0,%r30),%r28
+ ldw -12(0,%r30),%r29
+ stw %r29,0(0,%r26)
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa2_0/add_n.s b/rts/gmp/mpn/hppa/hppa2_0/add_n.s
new file mode 100644
index 0000000000..6e97278a39
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa2_0/add_n.s
@@ -0,0 +1,88 @@
+; HP-PA 2.0 32-bit __gmpn_add_n -- Add two limb vectors of the same length > 0
+; and store sum in a third limb vector.
+
+; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .code
+ .export __gmpn_add_n
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ sub %r0,%r23,%r22
+ zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ sub %r26,%r22,%r26 ; offset res_ptr
+ blr %r28,%r0 ; branch into loop
+ add %r0,%r0,%r0 ; reset carry
+
+L$loop ldw 0(%r25),%r20
+ ldw 0(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,0(%r26)
+L$7 ldw 4(%r25),%r21
+ ldw 4(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,4(%r26)
+L$6 ldw 8(%r25),%r20
+ ldw 8(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,8(%r26)
+L$5 ldw 12(%r25),%r21
+ ldw 12(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,12(%r26)
+L$4 ldw 16(%r25),%r20
+ ldw 16(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,16(%r26)
+L$3 ldw 20(%r25),%r21
+ ldw 20(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,20(%r26)
+L$2 ldw 24(%r25),%r20
+ ldw 24(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,24(%r26)
+L$1 ldw 28(%r25),%r21
+ ldo 32(%r25),%r25
+ ldw 28(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,28(%r26)
+ ldo 32(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 32(%r26),%r26
+
+ bv (%r2)
+ .exit
+ addc %r0,%r0,%r28
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa2_0/sub_n.s b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s
new file mode 100644
index 0000000000..7d9b50fc27
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s
@@ -0,0 +1,88 @@
+; HP-PA 2.0 32-bit __gmpn_sub_n -- Subtract two limb vectors of the same
+; length > 0 and store difference in a third limb vector.
+
+; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .code
+ .export __gmpn_sub_n
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ sub %r0,%r23,%r22
+ zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ blr %r28,%r0 ; branch into loop
+ sub %r26,%r22,%r26 ; offset res_ptr and set carry
+
+L$loop ldw 0(%r25),%r20
+ ldw 0(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,0(%r26)
+L$7 ldw 4(%r25),%r21
+ ldw 4(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,4(%r26)
+L$6 ldw 8(%r25),%r20
+ ldw 8(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,8(%r26)
+L$5 ldw 12(%r25),%r21
+ ldw 12(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,12(%r26)
+L$4 ldw 16(%r25),%r20
+ ldw 16(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,16(%r26)
+L$3 ldw 20(%r25),%r21
+ ldw 20(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,20(%r26)
+L$2 ldw 24(%r25),%r20
+ ldw 24(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,24(%r26)
+L$1 ldw 28(%r25),%r21
+ ldo 32(%r25),%r25
+ ldw 28(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,28(%r26)
+ ldo 32(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 32(%r26),%r26
+
+ addc %r0,%r0,%r28
+ bv (%r2)
+ .exit
+ subi 1,%r28,%r28
+ .procend
diff --git a/rts/gmp/mpn/hppa/lshift.s b/rts/gmp/mpn/hppa/lshift.s
new file mode 100644
index 0000000000..f5a2daad60
--- /dev/null
+++ b/rts/gmp/mpn/hppa/lshift.s
@@ -0,0 +1,66 @@
+; HP-PA __gmpn_lshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_lshift
+__gmpn_lshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ sh2add %r24,%r25,%r25
+ sh2add %r24,%r26,%r26
+ ldws,mb -4(0,%r25),%r22
+ subi 32,%r23,%r1
+ mtsar %r1
+ addib,= -1,%r24,L$0004
+ vshd %r0,%r22,%r28 ; compute carry out limb
+ ldws,mb -4(0,%r25),%r29
+ addib,= -1,%r24,L$0002
+ vshd %r22,%r29,%r20
+
+L$loop ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ addib,= -1,%r24,L$0003
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,<> -1,%r24,L$loop
+ vshd %r22,%r29,%r20
+
+L$0002 stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+L$0003 stws,mb %r20,-4(0,%r26)
+L$0004 vshd %r22,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/rshift.s b/rts/gmp/mpn/hppa/rshift.s
new file mode 100644
index 0000000000..e05e2f10b5
--- /dev/null
+++ b/rts/gmp/mpn/hppa/rshift.s
@@ -0,0 +1,63 @@
+; HP-PA __gmpn_rshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_rshift
+__gmpn_rshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r22
+ mtsar %r23
+ addib,= -1,%r24,L$0004
+ vshd %r22,%r0,%r28 ; compute carry out limb
+ ldws,ma 4(0,%r25),%r29
+ addib,= -1,%r24,L$0002
+ vshd %r29,%r22,%r20
+
+L$loop ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ addib,= -1,%r24,L$0003
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,<> -1,%r24,L$loop
+ vshd %r29,%r22,%r20
+
+L$0002 stws,ma %r20,4(0,%r26)
+ vshd %r0,%r29,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+L$0003 stws,ma %r20,4(0,%r26)
+L$0004 vshd %r0,%r22,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/sub_n.s b/rts/gmp/mpn/hppa/sub_n.s
new file mode 100644
index 0000000000..8f770ad1ad
--- /dev/null
+++ b/rts/gmp/mpn/hppa/sub_n.s
@@ -0,0 +1,59 @@
+; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless. We can't come under 5 cycles/limb anyway.
+
+ .code
+ .export __gmpn_sub_n
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,= -1,%r23,L$end ; check for (SIZE == 1)
+ sub %r20,%r19,%r28 ; subtract first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,<> -1,%r23,L$loop
+ subb %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ addc %r0,%r0,%r28
+ bv 0(%r2)
+ subi 1,%r28,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/udiv_qrnnd.s b/rts/gmp/mpn/hppa/udiv_qrnnd.s
new file mode 100644
index 0000000000..9aa3b8a830
--- /dev/null
+++ b/rts/gmp/mpn/hppa/udiv_qrnnd.s
@@ -0,0 +1,286 @@
+; HP-PA __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on pre-PA7000 CPUs.
+
+; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr gr26
+; n1 gr25
+; n0 gr24
+; d gr23
+
+; The code size is a bit excessive. We could merge the last two ds;addc
+; sequences by simply moving the "bb,< Odd" instruction down. The only
+; trouble is the FFFFFFFF code that would need some hacking.
+
+ .code
+ .export __gmpn_udiv_qrnnd
+__gmpn_udiv_qrnnd
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ comb,< %r23,0,L$largedivisor
+ sub %r0,%r23,%r1 ; clear cy as side-effect
+ ds %r0,%r1,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r28
+ ds %r25,%r23,%r25
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r23,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r28,%r28,%r28
+
+L$largedivisor
+ extru %r24,31,1,%r19 ; r19 = n0 & 1
+ bb,< %r23,31,L$odd
+ extru %r23,30,31,%r22 ; r22 = d >> 1
+ shd %r25,%r24,1,%r24 ; r24 = new n0
+ extru %r25,30,31,%r25 ; r25 = new n1
+ sub %r0,%r22,%r21
+ ds %r0,%r21,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r22,%r25
+ sh1addl %r25,%r19,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r24,%r24,%r28
+
+L$odd addib,sv,n 1,%r22,L$FF.. ; r22 = (d / 2 + 1)
+ shd %r25,%r24,1,%r24 ; r24 = new n0
+ extru %r25,30,31,%r25 ; r25 = new n1
+ sub %r0,%r22,%r21
+ ds %r0,%r21,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r28
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r22,%r25
+ sh1addl %r25,%r19,%r25
+; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
+ add,nuv %r28,%r25,%r25
+ addl %r25,%r1,%r25
+ addc %r0,%r28,%r28
+ sub,<< %r25,%r23,%r0
+ addl %r25,%r1,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r28,%r28
+
+; This is just a special case of the code above.
+; We come here when d == 0xFFFFFFFF
+L$FF.. add,uv %r25,%r24,%r24
+ sub,<< %r24,%r23,%r0
+ ldo 1(%r24),%r24
+ stws %r24,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r25,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/i960/README b/rts/gmp/mpn/i960/README
new file mode 100644
index 0000000000..d68a0a83eb
--- /dev/null
+++ b/rts/gmp/mpn/i960/README
@@ -0,0 +1,9 @@
+This directory contains mpn functions for Intel i960 processors.
+
+RELEVANT OPTIMIZATION ISSUES
+
+The code in this directory is not well optimized.
+
+STATUS
+
+The code in this directory has not been tested.
diff --git a/rts/gmp/mpn/i960/add_n.s b/rts/gmp/mpn/i960/add_n.s
new file mode 100644
index 0000000000..387317a397
--- /dev/null
+++ b/rts/gmp/mpn/i960/add_n.s
@@ -0,0 +1,43 @@
+# I960 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+ .globl ___gmpn_add_n
+___gmpn_add_n:
+ mov 0,g6 # clear carry-save register
+ cmpo 1,0 # clear cy
+
+Loop: subo 1,g3,g3 # update loop counter
+ ld (g1),g5 # load from s1_ptr
+ addo 4,g1,g1 # s1_ptr++
+ ld (g2),g4 # load from s2_ptr
+ addo 4,g2,g2 # s2_ptr++
+ cmpo g6,1 # restore cy from g6, relies on cy being 0
+ addc g4,g5,g4 # main add
+ subc 0,0,g6 # save cy in g6
+ st g4,(g0) # store result to res_ptr
+ addo 4,g0,g0 # res_ptr++
+ cmpobne 0,g3,Loop # when branch is taken, clears C bit
+
+ mov g6,g0
+ ret
diff --git a/rts/gmp/mpn/i960/addmul_1.s b/rts/gmp/mpn/i960/addmul_1.s
new file mode 100644
index 0000000000..7df1418356
--- /dev/null
+++ b/rts/gmp/mpn/i960/addmul_1.s
@@ -0,0 +1,48 @@
+# I960 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+	.globl	___gmpn_addmul_1
+___gmpn_addmul_1:
+ subo g2,0,g2
+ shlo 2,g2,g4
+ subo g4,g1,g1
+ subo g4,g0,g13
+ mov 0,g0
+
+ cmpo 1,0 # clear C bit on AC.cc
+
+Loop: ld (g1)[g2*4],g5
+ emul g3,g5,g6
+ ld (g13)[g2*4],g5
+
+ addc g0,g6,g6 # relies on that C bit is clear
+ addc 0,g7,g7
+ addc g5,g6,g6 # relies on that C bit is clear
+ st g6,(g13)[g2*4]
+ addc 0,g7,g0
+
+ addo g2,1,g2
+ cmpobne 0,g2,Loop # when branch is taken, clears C bit
+
+ ret
diff --git a/rts/gmp/mpn/i960/mul_1.s b/rts/gmp/mpn/i960/mul_1.s
new file mode 100644
index 0000000000..5c0c985aa5
--- /dev/null
+++ b/rts/gmp/mpn/i960/mul_1.s
@@ -0,0 +1,45 @@
+# I960 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+ .globl ___gmpn_mul_1
+___gmpn_mul_1:
+ subo g2,0,g2
+ shlo 2,g2,g4
+ subo g4,g1,g1
+ subo g4,g0,g13
+ mov 0,g0
+
+ cmpo 1,0 # clear C bit on AC.cc
+
+Loop: ld (g1)[g2*4],g5
+ emul g3,g5,g6
+
+ addc g0,g6,g6 # relies on that C bit is clear
+ st g6,(g13)[g2*4]
+ addc 0,g7,g0
+
+ addo g2,1,g2
+ cmpobne 0,g2,Loop # when branch is taken, clears C bit
+
+ ret
diff --git a/rts/gmp/mpn/i960/sub_n.s b/rts/gmp/mpn/i960/sub_n.s
new file mode 100644
index 0000000000..2db2d46aad
--- /dev/null
+++ b/rts/gmp/mpn/i960/sub_n.s
@@ -0,0 +1,43 @@
+# I960 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+ .globl ___gmpn_sub_n
+___gmpn_sub_n:
+ mov 1,g6 # set carry-save register
+ cmpo 1,0 # clear cy
+
+Loop: subo 1,g3,g3 # update loop counter
+ ld (g1),g5 # load from s1_ptr
+ addo 4,g1,g1 # s1_ptr++
+ ld (g2),g4 # load from s2_ptr
+ addo 4,g2,g2 # s2_ptr++
+ cmpo g6,1 # restore cy from g6, relies on cy being 0
+ subc g4,g5,g4 # main subtract
+ subc 0,0,g6 # save cy in g6
+ st g4,(g0) # store result to res_ptr
+ addo 4,g0,g0 # res_ptr++
+ cmpobne 0,g3,Loop # when branch is taken, cy will be 0
+
+ mov g6,g0
+ ret
diff --git a/rts/gmp/mpn/lisp/gmpasm-mode.el b/rts/gmp/mpn/lisp/gmpasm-mode.el
new file mode 100644
index 0000000000..5d9da7fa1f
--- /dev/null
+++ b/rts/gmp/mpn/lisp/gmpasm-mode.el
@@ -0,0 +1,351 @@
+;;; gmpasm-mode.el -- GNU MP asm and m4 editing mode.
+
+
+;; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+;;
+;; This file is part of the GNU MP Library.
+;;
+;; The GNU MP Library is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU Lesser General Public License as published by
+;; the Free Software Foundation; either version 2.1 of the License, or (at your
+;; option) any later version.
+;;
+;; The GNU MP Library is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU Lesser General Public License
+;; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+;; MA 02111-1307, USA.
+
+
+;;; Commentary:
+;;
+;; gmpasm-mode is an editing mode for m4 processed assembler code and m4
+;; macro files in GMP. It's similar to m4-mode, but has a number of
+;; settings better suited to GMP.
+;;
+;;
+;; Install
+;; -------
+;;
+;; To make M-x gmpasm-mode available, put gmpasm-mode.el somewhere in the
+;; load-path and the following in .emacs
+;;
+;; (autoload 'gmpasm-mode "gmpasm-mode" nil t)
+;;
+;; To use gmpasm-mode automatically on all .asm and .m4 files, put the
+;; following in .emacs
+;;
+;; (add-to-list 'auto-mode-alist '("\\.asm\\'" . gmpasm-mode))
+;; (add-to-list 'auto-mode-alist '("\\.m4\\'" . gmpasm-mode))
+;;
+;; To have gmpasm-mode only on gmp files, try instead something like the
+;; following, which uses it only in a directory starting with "gmp", or a
+;; sub-directory of such.
+;;
+;; (add-to-list 'auto-mode-alist
+;; '("/gmp.*/.*\\.\\(asm\\|m4\\)\\'" . gmpasm-mode))
+;;
+;; Byte compiling will slightly speed up loading. If you want a docstring
+;; in the autoload you can use M-x update-file-autoloads if you set it up
+;; right.
+;;
+;;
+;; Emacsen
+;; -------
+;;
+;; FSF Emacs 20.x - gmpasm-mode is designed for this.
+;; XEmacs 20.x - seems to work.
+;;
+;; FSF Emacs 19.x - should work if replacements for some 20.x-isms are
+;; available. comment-region with "C" won't really do the right thing
+;; though.
+
+
+;;; Code:
+
+(defgroup gmpasm nil
+ "GNU MP m4 and asm editing."
+ :prefix "gmpasm-"
+ :group 'languages)
+
+(defcustom gmpasm-mode-hook nil
+ "*Hook called by `gmpasm-mode'."
+ :type 'hook
+ :group 'gmpasm)
+
+(defcustom gmpasm-comment-start-regexp "[#;!@C]"
+ "*Regexp matching possible comment styles.
+See `gmpasm-mode' docstring for how this is used."
+ :type 'regexp
+ :group 'gmpasm)
+
+
+(defun gmpasm-add-to-list-second (list-var element)
+ "(gmpasm-add-to-list-second LIST-VAR ELEMENT)
+
+Add ELEMENT to LIST-VAR as the second element in the list, if it isn't
+already in the list. If LIST-VAR is nil, then ELEMENT is just added as the
+sole element in the list.
+
+This is like `add-to-list', but it puts the new value second in the list.
+
+The first cons cell is copied rather than changed in-place, so references to
+the list elsewhere won't be affected."
+
+ (if (member element (symbol-value list-var))
+ (symbol-value list-var)
+ (set list-var
+ (if (symbol-value list-var)
+ (cons (car (symbol-value list-var))
+ (cons element
+ (cdr (symbol-value list-var))))
+ (list element)))))
+
+
+(defun gmpasm-delete-from-list (list-var element)
+ "(gmpasm-delete-from-list LIST-VAR ELEMENT)
+
+Delete ELEMENT from LIST-VAR, using `delete'.
+This is like `add-to-list', but the element is deleted from the list.
+The list is copied rather than changed in-place, so references to it elsewhere
+won't be affected."
+
+ (set list-var (delete element (copy-sequence (symbol-value list-var)))))
+
+
+(defvar gmpasm-mode-map
+ (let ((map (make-sparse-keymap)))
+
+ ;; assembler and dnl commenting
+ (define-key map "\C-c\C-c" 'comment-region)
+ (define-key map "\C-c\C-d" 'gmpasm-comment-region-dnl)
+
+ ;; kill an M-x compile, since it's not hard to put m4 into an infinite
+ ;; loop
+ (define-key map "\C-c\C-k" 'kill-compilation)
+
+ map)
+ "Keymap for `gmpasm-mode'.")
+
+
+(defvar gmpasm-mode-syntax-table
+ (let ((table (make-syntax-table)))
+ ;; underscore left as a symbol char, like C mode
+
+ ;; m4 quotes
+ (modify-syntax-entry ?` "('" table)
+ (modify-syntax-entry ?' ")`" table)
+
+ table)
+ "Syntax table used in `gmpasm-mode'.
+
+m4 ignores quote marks in # comments at the top level, but inside quotes #
+isn't special and all quotes are active. There seems no easy way to express
+this in the syntax table, so nothing is done for comments. Usually this is
+best, since it picks up invalid apostrophes in comments inside quotes.")
+
+
+(defvar gmpasm-font-lock-keywords
+ (eval-when-compile
+ (list
+ (cons
+ (concat
+ "\\b"
+ (regexp-opt
+ '("deflit" "defreg" "defframe" "defframe_pushl"
+ "define_not_for_expansion"
+ "ASM_START" "ASM_END" "PROLOGUE" "EPILOGUE"
+ "forloop"
+ "TEXT" "DATA" "ALIGN" "W32"
+ "builtin" "changecom" "changequote" "changeword" "debugfile"
+ "debugmode" "decr" "define" "defn" "divert" "divnum" "dumpdef"
+ "errprint" "esyscmd" "eval" "__file__" "format" "gnu" "ifdef"
+ "ifelse" "include" "incr" "index" "indir" "len" "__line__"
+ "m4exit" "m4wrap" "maketemp" "patsubst" "popdef" "pushdef"
+ "regexp" "shift" "sinclude" "substr" "syscmd" "sysval"
+ "traceoff" "traceon" "translit" "undefine" "undivert" "unix")
+ t)
+ "\\b") 'font-lock-keyword-face)))
+
+ "`font-lock-keywords' for `gmpasm-mode'.
+
+The keywords are m4 builtins and some of the GMP macros used in asm files.
+L and LF don't look good fontified, so they're omitted.
+
+The right assembler comment regexp is added dynamically buffer-local (with
+dnl too).")
+
+
+;; Initialized if gmpasm-mode finds filladapt loaded.
+(defvar gmpasm-filladapt-token-table nil
+ "Filladapt token table used in `gmpasm-mode'.")
+(defvar gmpasm-filladapt-token-match-table nil
+ "Filladapt token match table used in `gmpasm-mode'.")
+(defvar gmpasm-filladapt-token-conversion-table nil
+ "Filladapt token conversion table used in `gmpasm-mode'.")
+
+
+;;;###autoload
+(defun gmpasm-mode ()
+ "A major mode for editing GNU MP asm and m4 files.
+
+\\{gmpasm-mode-map}
+`comment-start' and `comment-end' are set buffer-local to assembler
+commenting appropriate for the CPU by looking for something matching
+`gmpasm-comment-start-regexp' at the start of a line, or \"#\" is used if
+there's no match (if \"#\" isn't what you want, type in a desired comment
+and do \\[gmpasm-mode] to reinitialize).
+
+`adaptive-fill-regexp' is set buffer-local to the standard regexp with
+`comment-start' and dnl added. If filladapt.el has been loaded it similarly
+gets `comment-start' and dnl added as buffer-local fill prefixes.
+
+Font locking has the m4 builtins, some of the GMP macros, m4 dnl commenting,
+and assembler commenting (based on the `comment-start' determined).
+
+Note that `gmpasm-comment-start-regexp' is only matched as a whole word, so
+the `C' in it is only matched as a whole word, not on something that happens
+to start with `C'. Also it's only the particular `comment-start' determined
+that's added for filling etc, not the whole `gmpasm-comment-start-regexp'.
+
+`gmpasm-mode-hook' is run after initializations are complete.
+"
+
+ (interactive)
+ (kill-all-local-variables)
+ (setq major-mode 'gmpasm-mode
+ mode-name "gmpasm")
+ (use-local-map gmpasm-mode-map)
+ (set-syntax-table gmpasm-mode-syntax-table)
+ (setq fill-column 76)
+
+ ;; Short instructions might fit with 32, but anything with labels or
+ ;; expressions soon needs the comments pushed out to column 40.
+ (setq comment-column 40)
+
+ ;; Don't want to find out the hard way which dumb assemblers don't like a
+ ;; missing final newline.
+ (set (make-local-variable 'require-final-newline) t)
+
+ ;; The first match of gmpasm-comment-start-regexp at the start of a line
+ ;; determines comment-start, or "#" if no match.
+ (set (make-local-variable 'comment-start)
+ (save-excursion
+ (goto-char (point-min))
+ (if (re-search-forward
+ (concat "^\\(" gmpasm-comment-start-regexp "\\)\\(\\s-\\|$\\)")
+ nil t)
+ (match-string 1)
+ "#")))
+ (set (make-local-variable 'comment-end) "")
+
+ ;; If comment-start ends in an alphanumeric then \b is used to match it
+ ;; only as a separate word. The test is for an alphanumeric rather than
+ ;; \w since we might try # or ! as \w characters but without wanting \b.
+ (let ((comment-regexp
+ (concat (regexp-quote comment-start)
+ (if (string-match "[a-zA-Z0-9]\\'" comment-start) "\\b"))))
+
+ ;; Whitespace is required before a comment-start so m4 $# doesn't match
+ ;; when comment-start is "#".
+ ;; Only spaces or tabs match after, so newline isn't included in the
+ ;; font lock below.
+ (set (make-local-variable 'comment-start-skip)
+ (concat "\\(^\\|\\s-\\)" comment-regexp "[ \t]*"))
+
+ ;; Comment fontification based on comment-start, matching through to the
+ ;; end of the line.
+ (add-to-list (make-local-variable 'gmpasm-font-lock-keywords)
+ (cons (concat
+ "\\(\\bdnl\\b\\|" comment-start-skip "\\).*$")
+ 'font-lock-comment-face))
+
+ (set (make-local-variable 'font-lock-defaults)
+ '(gmpasm-font-lock-keywords
+ t ; no syntactic fontification (of strings etc)
+ nil ; no case-fold
+ ((?_ . "w")) ; _ part of a word while fontifying
+ ))
+
+ ;; Paragraphs are separated by blank lines, or lines with only dnl or
+ ;; comment-start.
+ (set (make-local-variable 'paragraph-separate)
+ (concat "[ \t\f]*\\(\\(" comment-regexp "\\|dnl\\)[ \t]*\\)*$"))
+ (set (make-local-variable 'paragraph-start)
+ (concat "\f\\|" paragraph-separate))
+
+ ;; Adaptive fill gets dnl and comment-start as comment style prefixes on
+ ;; top of the standard regexp (which has # and ; already actually).
+ (set (make-local-variable 'adaptive-fill-regexp)
+ (concat "[ \t]*\\(\\("
+ comment-regexp
+ "\\|dnl\\|[-|#;>*]+\\|(?[0-9]+[.)]\\)[ \t]*\\)*"))
+ (set (make-local-variable 'adaptive-fill-first-line-regexp)
+ "\\`\\([ \t]*dnl\\)?[ \t]*\\'")
+
+ (when (fboundp 'filladapt-mode)
+ (when (not gmpasm-filladapt-token-table)
+ (setq gmpasm-filladapt-token-table
+ filladapt-token-table)
+ (setq gmpasm-filladapt-token-match-table
+ filladapt-token-match-table)
+ (setq gmpasm-filladapt-token-conversion-table
+ filladapt-token-conversion-table)
+
+ ;; Numbered bullet points like "2.1" get matched at the start of a
+ ;; line when it's really something like "2.1 cycles/limb", so delete
+ ;; this from the list. The regexp for "1.", "2." etc is left
+ ;; though.
+ (gmpasm-delete-from-list 'gmpasm-filladapt-token-table
+ '("[0-9]+\\(\\.[0-9]+\\)+[ \t]"
+ bullet))
+
+ ;; "%" as a comment prefix interferes with x86 register names
+ ;; like %eax, so delete this.
+ (gmpasm-delete-from-list 'gmpasm-filladapt-token-table
+ '("%+" postscript-comment))
+
+ (add-to-list 'gmpasm-filladapt-token-match-table
+ '(gmpasm-comment gmpasm-comment))
+ (add-to-list 'gmpasm-filladapt-token-conversion-table
+ '(gmpasm-comment . exact))
+ )
+
+ (set (make-local-variable 'filladapt-token-table)
+ gmpasm-filladapt-token-table)
+ (set (make-local-variable 'filladapt-token-match-table)
+ gmpasm-filladapt-token-match-table)
+ (set (make-local-variable 'filladapt-token-conversion-table)
+ gmpasm-filladapt-token-conversion-table)
+
+ ;; Add dnl and comment-start as fill prefixes.
+ ;; Comments in filladapt.el say filladapt-token-table must begin
+ ;; with ("^" beginning-of-line), so put our addition second.
+ (gmpasm-add-to-list-second 'filladapt-token-table
+ (list (concat "dnl[ \t]\\|" comment-regexp)
+ 'gmpasm-comment))
+ ))
+
+ (run-hooks 'gmpasm-mode-hook))
+
+
+(defun gmpasm-comment-region-dnl (beg end &optional arg)
+ "(gmpasm-comment-region-dnl BEG END &optional ARG)
+
+Comment or uncomment each line in the region using `dnl'.
+With \\[universal-argument] prefix arg, uncomment each line in region.
+This is `comment-region', but using \"dnl\"."
+
+ (interactive "r\nP")
+ (let ((comment-start "dnl")
+ (comment-end ""))
+ (comment-region beg end arg)))
+
+
+(provide 'gmpasm-mode)
+
+;;; gmpasm-mode.el ends here
diff --git a/rts/gmp/mpn/m68k/add_n.S b/rts/gmp/mpn/m68k/add_n.S
new file mode 100644
index 0000000000..9e1d89d64f
--- /dev/null
+++ b/rts/gmp/mpn/m68k/add_n.S
@@ -0,0 +1,79 @@
+/* mc68020 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+ sum in a third limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s2_ptr	(sp + 12)
+ size	(sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_add_n)
+
+C_SYMBOL_NAME(__gmpn_add_n:)
+PROLOG(__gmpn_add_n)
+/* Save used registers on the stack. */
+ movel R(d2),MEM_PREDEC(sp)
+ movel R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,12),R(a2)
+ movel MEM_DISP(sp,16),R(a0)
+ movel MEM_DISP(sp,20),R(a1)
+ movel MEM_DISP(sp,24),R(d2)
+
+ eorw #1,R(d2)
+ lsrl #1,R(d2)
+ bcc L(L1)
+ subql #1,R(d2) /* clears cy as side effect */
+
+L(Loop:)
+ movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ addxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+L(L1:) movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ addxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+
+	dbf R(d2),L(Loop)		/* loop until 16 lsb of d2 == -1 */
+ subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */
+ subl #0x10000,R(d2)
+ bcs L(L2)
+ addl R(d0),R(d0) /* restore cy */
+ bra L(Loop)
+
+L(L2:)
+ negl R(d0)
+
+/* Restore used registers from stack frame. */
+ movel MEM_POSTINC(sp),R(a2)
+ movel MEM_POSTINC(sp),R(d2)
+
+ rts
+EPILOG(__gmpn_add_n)
diff --git a/rts/gmp/mpn/m68k/lshift.S b/rts/gmp/mpn/m68k/lshift.S
new file mode 100644
index 0000000000..a539d5d42e
--- /dev/null
+++ b/rts/gmp/mpn/m68k/lshift.S
@@ -0,0 +1,150 @@
+/* mc68020 __gmpn_lshift -- Shift left a low-level natural-number integer.
+
+Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s_ptr (sp + 8)
+ s_size (sp + 16)
+ cnt (sp + 12)
+*/
+
+#include "asm-syntax.h"
+
+#define res_ptr a1
+#define s_ptr a0
+#define s_size d6
+#define cnt d4
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_lshift)
+
+C_SYMBOL_NAME(__gmpn_lshift:)
+PROLOG(__gmpn_lshift)
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. */
+ movel MEM_DISP(sp,28),R(res_ptr)
+ movel MEM_DISP(sp,32),R(s_ptr)
+ movel MEM_DISP(sp,36),R(s_size)
+ movel MEM_DISP(sp,40),R(cnt)
+
+ moveql #1,R(d5)
+ cmpl R(d5),R(cnt)
+ bne L(Lnormal)
+ cmpl R(s_ptr),R(res_ptr)
+ bls L(Lspecial) /* jump if s_ptr >= res_ptr */
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(s_ptr,s_size,l,4),R(a2)
+#else /* not mc68020 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ lea MEM_INDX(s_ptr,d0,l),R(a2)
+#endif
+ cmpl R(res_ptr),R(a2)
+ bls L(Lspecial) /* jump if res_ptr >= s_ptr + s_size */
+
+L(Lnormal:)
+ moveql #32,R(d5)
+ subl R(cnt),R(d5)
+
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr)
+ lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr)
+#else /* not mc68000 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ addl R(s_size),R(s_ptr)
+ addl R(s_size),R(res_ptr)
+#endif
+ movel MEM_PREDEC(s_ptr),R(d2)
+ movel R(d2),R(d0)
+ lsrl R(d5),R(d0) /* compute carry limb */
+
+ lsll R(cnt),R(d2)
+ movel R(d2),R(d1)
+ subql #1,R(s_size)
+ beq L(Lend)
+ lsrl #1,R(s_size)
+ bcs L(L1)
+ subql #1,R(s_size)
+
+L(Loop:)
+ movel MEM_PREDEC(s_ptr),R(d2)
+ movel R(d2),R(d3)
+ lsrl R(d5),R(d3)
+ orl R(d3),R(d1)
+ movel R(d1),MEM_PREDEC(res_ptr)
+ lsll R(cnt),R(d2)
+L(L1:)
+ movel MEM_PREDEC(s_ptr),R(d1)
+ movel R(d1),R(d3)
+ lsrl R(d5),R(d3)
+ orl R(d3),R(d2)
+ movel R(d2),MEM_PREDEC(res_ptr)
+ lsll R(cnt),R(d1)
+
+ dbf R(s_size),L(Loop)
+ subl #0x10000,R(s_size)
+ bcc L(Loop)
+
+L(Lend:)
+ movel R(d1),MEM_PREDEC(res_ptr) /* store least significant limb */
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+
+/* We loop from least significant end of the arrays, which is only
+ permissable if the source and destination don't overlap, since the
+ function is documented to work for overlapping source and destination. */
+
+L(Lspecial:)
+ clrl R(d0) /* initialize carry */
+ eorw #1,R(s_size)
+ lsrl #1,R(s_size)
+ bcc L(LL1)
+ subql #1,R(s_size)
+
+L(LLoop:)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ addxl R(d2),R(d2)
+ movel R(d2),MEM_POSTINC(res_ptr)
+L(LL1:)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ addxl R(d2),R(d2)
+ movel R(d2),MEM_POSTINC(res_ptr)
+
+ dbf R(s_size),L(LLoop)
+ addxl R(d0),R(d0) /* save cy in lsb */
+ subl #0x10000,R(s_size)
+ bcs L(LLend)
+ lsrl #1,R(d0) /* restore cy */
+ bra L(LLoop)
+
+L(LLend:)
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+EPILOG(__gmpn_lshift)
diff --git a/rts/gmp/mpn/m68k/mc68020/addmul_1.S b/rts/gmp/mpn/m68k/mc68020/addmul_1.S
new file mode 100644
index 0000000000..6638115d71
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/addmul_1.S
@@ -0,0 +1,83 @@
+/* mc68020 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+ the result to a second limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s1_size (sp + 12)
+ s2_limb (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_addmul_1)
+
+C_SYMBOL_NAME(__gmpn_addmul_1:)
+PROLOG(__gmpn_addmul_1)
+
+#define res_ptr a0
+#define s1_ptr a1
+#define s1_size d2
+#define s2_limb d4
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d5),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,20),R(res_ptr)
+ movel MEM_DISP(sp,24),R(s1_ptr)
+ movel MEM_DISP(sp,28),R(s1_size)
+ movel MEM_DISP(sp,32),R(s2_limb)
+
+ eorw #1,R(s1_size)
+ clrl R(d1)
+ clrl R(d5)
+ lsrl #1,R(s1_size)
+ bcc L(L1)
+ subql #1,R(s1_size)
+ subl R(d0),R(d0) /* (d0,cy) <= (0,0) */
+
+L(Loop:)
+ movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d1):R(d3)
+ addxl R(d0),R(d3)
+ addxl R(d5),R(d1)
+ addl R(d3),MEM_POSTINC(res_ptr)
+L(L1:) movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d0):R(d3)
+ addxl R(d1),R(d3)
+ addxl R(d5),R(d0)
+ addl R(d3),MEM_POSTINC(res_ptr)
+
+ dbf R(s1_size),L(Loop)
+ addxl R(d5),R(d0)
+ subl #0x10000,R(s1_size)
+ bcc L(Loop)
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d5)
+
+ rts
+EPILOG(__gmpn_addmul_1)
diff --git a/rts/gmp/mpn/m68k/mc68020/mul_1.S b/rts/gmp/mpn/m68k/mc68020/mul_1.S
new file mode 100644
index 0000000000..fdd4c39d70
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/mul_1.S
@@ -0,0 +1,90 @@
+/* mc68020 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+ the result in a second limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s1_size (sp + 12)
+ s2_limb (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_mul_1)
+
+C_SYMBOL_NAME(__gmpn_mul_1:)
+PROLOG(__gmpn_mul_1)
+
+#define res_ptr a0
+#define s1_ptr a1
+#define s1_size d2
+#define s2_limb d4
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d4),MEM_PREDEC(sp)
+#if 0
+ movel R(d2),MEM_PREDEC(sp)
+ movel R(d3),MEM_PREDEC(sp)
+ movel R(d4),MEM_PREDEC(sp)
+#endif
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,16),R(res_ptr)
+ movel MEM_DISP(sp,20),R(s1_ptr)
+ movel MEM_DISP(sp,24),R(s1_size)
+ movel MEM_DISP(sp,28),R(s2_limb)
+
+ eorw #1,R(s1_size)
+ clrl R(d1)
+ lsrl #1,R(s1_size)
+ bcc L(L1)
+ subql #1,R(s1_size)
+ subl R(d0),R(d0) /* (d0,cy) <= (0,0) */
+
+L(Loop:)
+ movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d1):R(d3)
+ addxl R(d0),R(d3)
+ movel R(d3),MEM_POSTINC(res_ptr)
+L(L1:) movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d0):R(d3)
+ addxl R(d1),R(d3)
+ movel R(d3),MEM_POSTINC(res_ptr)
+
+ dbf R(s1_size),L(Loop)
+ clrl R(d3)
+ addxl R(d3),R(d0)
+ subl #0x10000,R(s1_size)
+ bcc L(Loop)
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d4)
+#if 0
+ movel MEM_POSTINC(sp),R(d4)
+ movel MEM_POSTINC(sp),R(d3)
+ movel MEM_POSTINC(sp),R(d2)
+#endif
+ rts
+EPILOG(__gmpn_mul_1)
diff --git a/rts/gmp/mpn/m68k/mc68020/submul_1.S b/rts/gmp/mpn/m68k/mc68020/submul_1.S
new file mode 100644
index 0000000000..3c36b70166
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/submul_1.S
@@ -0,0 +1,83 @@
+/* mc68020 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+ the result from a second limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s1_size (sp + 12)
+ s2_limb (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_submul_1)
+
+C_SYMBOL_NAME(__gmpn_submul_1:)
+PROLOG(__gmpn_submul_1)
+
+#define res_ptr a0
+#define s1_ptr a1
+#define s1_size d2
+#define s2_limb d4
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d5),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,20),R(res_ptr)
+ movel MEM_DISP(sp,24),R(s1_ptr)
+ movel MEM_DISP(sp,28),R(s1_size)
+ movel MEM_DISP(sp,32),R(s2_limb)
+
+ eorw #1,R(s1_size)
+ clrl R(d1)
+ clrl R(d5)
+ lsrl #1,R(s1_size)
+ bcc L(L1)
+ subql #1,R(s1_size)
+ subl R(d0),R(d0) /* (d0,cy) <= (0,0) */
+
+L(Loop:)
+ movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d1):R(d3)
+ addxl R(d0),R(d3)
+ addxl R(d5),R(d1)
+ subl R(d3),MEM_POSTINC(res_ptr)
+L(L1:) movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d0):R(d3)
+ addxl R(d1),R(d3)
+ addxl R(d5),R(d0)
+ subl R(d3),MEM_POSTINC(res_ptr)
+
+ dbf R(s1_size),L(Loop)
+ addxl R(d5),R(d0)
+ subl #0x10000,R(s1_size)
+ bcc L(Loop)
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d5)
+
+ rts
+EPILOG(__gmpn_submul_1)
diff --git a/rts/gmp/mpn/m68k/mc68020/udiv.S b/rts/gmp/mpn/m68k/mc68020/udiv.S
new file mode 100644
index 0000000000..d00cf13558
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/udiv.S
@@ -0,0 +1,31 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+.text
+ .even
+.globl ___udiv_qrnnd
+___udiv_qrnnd:
+ movel sp@(4),a0
+ movel sp@(8),d1
+ movel sp@(12),d0
+ divul sp@(16),d1:d0
+ movel d1,a0@
+ rts
diff --git a/rts/gmp/mpn/m68k/mc68020/umul.S b/rts/gmp/mpn/m68k/mc68020/umul.S
new file mode 100644
index 0000000000..a34ae6c543
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/umul.S
@@ -0,0 +1,31 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+.text
+ .even
+.globl ___umul_ppmm
+___umul_ppmm:
+ movel sp@(4),a0
+ movel sp@(8),d1
+ movel sp@(12),d0
+ mulul d0,d0:d1
+ movel d1,a0@
+ rts
diff --git a/rts/gmp/mpn/m68k/rshift.S b/rts/gmp/mpn/m68k/rshift.S
new file mode 100644
index 0000000000..b47a48e52a
--- /dev/null
+++ b/rts/gmp/mpn/m68k/rshift.S
@@ -0,0 +1,149 @@
+/* mc68020 __gmpn_rshift -- Shift right a low-level natural-number integer.
+
+Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s_ptr (sp + 8)
+ s_size (sp + 16)
+ cnt (sp + 12)
+*/
+
+#include "asm-syntax.h"
+
+#define res_ptr a1
+#define s_ptr a0
+#define s_size d6
+#define cnt d4
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_rshift)
+
+C_SYMBOL_NAME(__gmpn_rshift:)
+PROLOG(__gmpn_rshift)
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. */
+ movel MEM_DISP(sp,28),R(res_ptr)
+ movel MEM_DISP(sp,32),R(s_ptr)
+ movel MEM_DISP(sp,36),R(s_size)
+ movel MEM_DISP(sp,40),R(cnt)
+
+ moveql #1,R(d5)
+ cmpl R(d5),R(cnt)
+ bne L(Lnormal)
+ cmpl R(res_ptr),R(s_ptr)
+ bls L(Lspecial) /* jump if res_ptr >= s_ptr */
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(res_ptr,s_size,l,4),R(a2)
+#else /* not mc68020 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ lea MEM_INDX(res_ptr,d0,l),R(a2)
+#endif
+ cmpl R(s_ptr),R(a2)
+ bls L(Lspecial) /* jump if s_ptr >= res_ptr + s_size */
+
+L(Lnormal:)
+ moveql #32,R(d5)
+ subl R(cnt),R(d5)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ movel R(d2),R(d0)
+ lsll R(d5),R(d0) /* compute carry limb */
+
+ lsrl R(cnt),R(d2)
+ movel R(d2),R(d1)
+ subql #1,R(s_size)
+ beq L(Lend)
+ lsrl #1,R(s_size)
+ bcs L(L1)
+ subql #1,R(s_size)
+
+L(Loop:)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ movel R(d2),R(d3)
+ lsll R(d5),R(d3)
+ orl R(d3),R(d1)
+ movel R(d1),MEM_POSTINC(res_ptr)
+ lsrl R(cnt),R(d2)
+L(L1:)
+ movel MEM_POSTINC(s_ptr),R(d1)
+ movel R(d1),R(d3)
+ lsll R(d5),R(d3)
+ orl R(d3),R(d2)
+ movel R(d2),MEM_POSTINC(res_ptr)
+ lsrl R(cnt),R(d1)
+
+ dbf R(s_size),L(Loop)
+ subl #0x10000,R(s_size)
+ bcc L(Loop)
+
+L(Lend:)
+ movel R(d1),MEM(res_ptr) /* store most significant limb */
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+
+/* We loop from most significant end of the arrays, which is only
+ permissable if the source and destination don't overlap, since the
+ function is documented to work for overlapping source and destination. */
+
+L(Lspecial:)
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr)
+ lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr)
+#else /* not mc68000 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ addl R(s_size),R(s_ptr)
+ addl R(s_size),R(res_ptr)
+#endif
+
+ clrl R(d0) /* initialize carry */
+ eorw #1,R(s_size)
+ lsrl #1,R(s_size)
+ bcc L(LL1)
+ subql #1,R(s_size)
+
+L(LLoop:)
+ movel MEM_PREDEC(s_ptr),R(d2)
+ roxrl #1,R(d2)
+ movel R(d2),MEM_PREDEC(res_ptr)
+L(LL1:)
+ movel MEM_PREDEC(s_ptr),R(d2)
+ roxrl #1,R(d2)
+ movel R(d2),MEM_PREDEC(res_ptr)
+
+ dbf R(s_size),L(LLoop)
+ roxrl #1,R(d0) /* save cy in msb */
+ subl #0x10000,R(s_size)
+ bcs L(LLend)
+ addl R(d0),R(d0) /* restore cy */
+ bra L(LLoop)
+
+L(LLend:)
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+EPILOG(__gmpn_rshift)
diff --git a/rts/gmp/mpn/m68k/sub_n.S b/rts/gmp/mpn/m68k/sub_n.S
new file mode 100644
index 0000000000..ce45b24db5
--- /dev/null
+++ b/rts/gmp/mpn/m68k/sub_n.S
@@ -0,0 +1,79 @@
+/* mc68020 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ store difference in a third limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s2_ptr (sp + 16)
+ size (sp + 12)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_sub_n)
+
+C_SYMBOL_NAME(__gmpn_sub_n:)
+PROLOG(__gmpn_sub_n)
+/* Save used registers on the stack. */
+ movel R(d2),MEM_PREDEC(sp)
+ movel R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,12),R(a2)
+ movel MEM_DISP(sp,16),R(a0)
+ movel MEM_DISP(sp,20),R(a1)
+ movel MEM_DISP(sp,24),R(d2)
+
+ eorw #1,R(d2)
+ lsrl #1,R(d2)
+ bcc L(L1)
+ subql #1,R(d2) /* clears cy as side effect */
+
+L(Loop:)
+ movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ subxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+L(L1:) movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ subxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+
+ dbf R(d2),L(Loop) /* loop until 16 lsb of %4 == -1 */
+ subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */
+ subl #0x10000,R(d2)
+ bcs L(L2)
+ addl R(d0),R(d0) /* restore cy */
+ bra L(Loop)
+
+L(L2:)
+ negl R(d0)
+
+/* Restore used registers from stack frame. */
+ movel MEM_POSTINC(sp),R(a2)
+ movel MEM_POSTINC(sp),R(d2)
+
+ rts
+EPILOG(__gmpn_sub_n)
diff --git a/rts/gmp/mpn/m68k/syntax.h b/rts/gmp/mpn/m68k/syntax.h
new file mode 100644
index 0000000000..9eec279c06
--- /dev/null
+++ b/rts/gmp/mpn/m68k/syntax.h
@@ -0,0 +1,177 @@
+/* asm.h -- Definitions for 68k syntax variations.
+
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#undef ALIGN
+
+#ifdef MIT_SYNTAX
+#define PROLOG(name)
+#define EPILOG(name)
+#define R(r)r
+#define MEM(base)base@
+#define MEM_DISP(base,displacement)base@(displacement)
+#define MEM_INDX(base,idx,size_suffix)base@(idx:size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)base@(idx:size_suffix:scale)
+#define MEM_PREDEC(memory_base)memory_base@-
+#define MEM_POSTINC(memory_base)memory_base@+
+#define L(label) label
+#define TEXT .text
+#define ALIGN .even
+#define GLOBL .globl
+#define moveql moveq
+/* Use variable sized opcodes. */
+#define bcc jcc
+#define bcs jcs
+#define bls jls
+#define beq jeq
+#define bne jne
+#define bra jra
+#endif
+
+#ifdef SONY_SYNTAX
+#define PROLOG(name)
+#define EPILOG(name)
+#define R(r)r
+#define MEM(base)(base)
+#define MEM_DISP(base,displacement)(displacement,base)
+#define MEM_INDX(base,idx,size_suffix)(base,idx.size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)(base,idx.size_suffix*scale)
+#define MEM_PREDEC(memory_base)-(memory_base)
+#define MEM_POSTINC(memory_base)(memory_base)+
+#define L(label) label
+#define TEXT .text
+#define ALIGN .even
+#define GLOBL .globl
+#endif
+
+#ifdef MOTOROLA_SYNTAX
+#define PROLOG(name)
+#define EPILOG(name)
+#define R(r)r
+#define MEM(base)(base)
+#define MEM_DISP(base,displacement)(displacement,base)
+#define MEM_INDX(base,idx,size_suffix)(base,idx.size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)(base,idx.size_suffix*scale)
+#define MEM_PREDEC(memory_base)-(memory_base)
+#define MEM_POSTINC(memory_base)(memory_base)+
+#define L(label) label
+#define TEXT
+#define ALIGN
+#define GLOBL XDEF
+#define lea LEA
+#define movel MOVE.L
+#define moveml MOVEM.L
+#define moveql MOVEQ.L
+#define cmpl CMP.L
+#define orl OR.L
+#define clrl CLR.L
+#define eorw EOR.W
+#define lsrl LSR.L
+#define lsll LSL.L
+#define roxrl ROXR.L
+#define roxll ROXL.L
+#define addl ADD.L
+#define addxl ADDX.L
+#define addql ADDQ.L
+#define subl SUB.L
+#define subxl SUBX.L
+#define subql SUBQ.L
+#define negl NEG.L
+#define mulul MULU.L
+#define bcc BCC
+#define bcs BCS
+#define bls BLS
+#define beq BEQ
+#define bne BNE
+#define bra BRA
+#define dbf DBF
+#define rts RTS
+#define d0 D0
+#define d1 D1
+#define d2 D2
+#define d3 D3
+#define d4 D4
+#define d5 D5
+#define d6 D6
+#define d7 D7
+#define a0 A0
+#define a1 A1
+#define a2 A2
+#define a3 A3
+#define a4 A4
+#define a5 A5
+#define a6 A6
+#define a7 A7
+#define sp SP
+#endif
+
+#ifdef ELF_SYNTAX
+#define PROLOG(name) .type name,@function
+#define EPILOG(name) .size name,.-name
+#define MEM(base)(R(base))
+#define MEM_DISP(base,displacement)(displacement,R(base))
+#define MEM_PREDEC(memory_base)-(R(memory_base))
+#define MEM_POSTINC(memory_base)(R(memory_base))+
+#ifdef __STDC__
+#define R_(r)%##r
+#define R(r)R_(r)
+#define MEM_INDX_(base,idx,size_suffix)(R(base),R(idx##.##size_suffix))
+#define MEM_INDX(base,idx,size_suffix)MEM_INDX_(base,idx,size_suffix)
+#define MEM_INDX1_(base,idx,size_suffix,scale)(R(base),R(idx##.##size_suffix*scale))
+#define MEM_INDX1(base,idx,size_suffix,scale)MEM_INDX1_(base,idx,size_suffix,scale)
+#define L(label) .##label
+#else
+#define R(r)%/**/r
+#define MEM_INDX(base,idx,size_suffix)(R(base),R(idx).size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)(R(base),R(idx).size_suffix*scale)
+#define L(label) ./**/label
+#endif
+#define TEXT .text
+#define ALIGN .align 2
+#define GLOBL .globl
+#define bcc jbcc
+#define bcs jbcs
+#define bls jbls
+#define beq jbeq
+#define bne jbne
+#define bra jbra
+#endif
+
+#if defined (SONY_SYNTAX) || defined (ELF_SYNTAX)
+#define movel move.l
+#define moveml movem.l
+#define moveql moveq.l
+#define cmpl cmp.l
+#define orl or.l
+#define clrl clr.l
+#define eorw eor.w
+#define lsrl lsr.l
+#define lsll lsl.l
+#define roxrl roxr.l
+#define roxll roxl.l
+#define addl add.l
+#define addxl addx.l
+#define addql addq.l
+#define subl sub.l
+#define subxl subx.l
+#define subql subq.l
+#define negl neg.l
+#define mulul mulu.l
+#endif
diff --git a/rts/gmp/mpn/m88k/add_n.s b/rts/gmp/mpn/m88k/add_n.s
new file mode 100644
index 0000000000..0b776c618a
--- /dev/null
+++ b/rts/gmp/mpn/m88k/add_n.s
@@ -0,0 +1,104 @@
+; mc88100 __gmpn_add -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; s2_ptr r4
+; size r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention. As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+ text
+ align 16
+ global ___gmpn_add_n
+___gmpn_add_n:
+ ld r6,r3,0 ; read first limb from s1_ptr
+ extu r10,r5,3
+ ld r7,r4,0 ; read first limb from s2_ptr
+
+ subu.co r5,r0,r5 ; (clear carry as side effect)
+ mak r5,r5,3<4>
+ bcnd eq0,r5,Lzero
+
+ or r12,r0,lo16(Lbase)
+ or.u r12,r12,hi16(Lbase)
+ addu r12,r12,r5 ; r12 is address for entering in loop
+
+ extu r5,r5,2 ; divide by 4
+ subu r2,r2,r5 ; adjust res_ptr
+ subu r3,r3,r5 ; adjust s1_ptr
+ subu r4,r4,r5 ; adjust s2_ptr
+
+ or r8,r6,r0
+
+ jmp.n r12
+ or r9,r7,r0
+
+Loop: addu r3,r3,32
+ st r8,r2,28
+ addu r4,r4,32
+ ld r6,r3,0
+ addu r2,r2,32
+ ld r7,r4,0
+Lzero: subu r10,r10,1 ; add 0 + 8r limbs (adj loop cnt)
+Lbase: ld r8,r3,4
+ addu.cio r6,r6,r7
+ ld r9,r4,4
+ st r6,r2,0
+ ld r6,r3,8 ; add 7 + 8r limbs
+ addu.cio r8,r8,r9
+ ld r7,r4,8
+ st r8,r2,4
+ ld r8,r3,12 ; add 6 + 8r limbs
+ addu.cio r6,r6,r7
+ ld r9,r4,12
+ st r6,r2,8
+ ld r6,r3,16 ; add 5 + 8r limbs
+ addu.cio r8,r8,r9
+ ld r7,r4,16
+ st r8,r2,12
+ ld r8,r3,20 ; add 4 + 8r limbs
+ addu.cio r6,r6,r7
+ ld r9,r4,20
+ st r6,r2,16
+ ld r6,r3,24 ; add 3 + 8r limbs
+ addu.cio r8,r8,r9
+ ld r7,r4,24
+ st r8,r2,20
+ ld r8,r3,28 ; add 2 + 8r limbs
+ addu.cio r6,r6,r7
+ ld r9,r4,28
+ st r6,r2,24
+ bcnd.n ne0,r10,Loop ; add 1 + 8r limbs
+ addu.cio r8,r8,r9
+
+ st r8,r2,28 ; store most significant limb
+
+ jmp.n r1
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
diff --git a/rts/gmp/mpn/m88k/mc88110/add_n.S b/rts/gmp/mpn/m88k/mc88110/add_n.S
new file mode 100644
index 0000000000..843a50dded
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/add_n.S
@@ -0,0 +1,200 @@
+; mc88110 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+#define res_ptr r2
+#define s1_ptr r3
+#define s2_ptr r4
+#define size r5
+
+#include "sysdep.h"
+
+ text
+ align 16
+ global C_SYMBOL_NAME(__gmpn_add_n)
+C_SYMBOL_NAME(__gmpn_add_n):
+ addu.co r0,r0,r0 ; clear cy flag
+ xor r12,s2_ptr,res_ptr
+ bb1 2,r12,L1
+; ** V1a **
+L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned?
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ addu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+L_v1: cmp r12,size,2
+ bb1 lt,r12,Lend2
+
+ ld r10,s1_ptr,0
+ ld r12,s1_ptr,4
+ ld.d r8,s2_ptr,0
+ subu size,size,10
+ bcnd lt0,size,Lfin1
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop1: subu size,size,8
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,16
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,20
+ ld.d r8,s2_ptr,16
+ st.d r6,res_ptr,8
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,24
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,28
+ ld.d r8,s2_ptr,24
+ st.d r6,res_ptr,16
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,32
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,36
+ addu s1_ptr,s1_ptr,32
+ ld.d r8,s2_ptr,32
+ addu s2_ptr,s2_ptr,32
+ st.d r6,res_ptr,24
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop1
+
+Lfin1: addu size,size,8-2
+ bcnd lt0,size,Lend1
+/* Add blocks of 2 limbs until less than 2 limbs remain */
+Loope1: addu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope1
+Lend1: addu.cio r6,r10,r8
+ addu.cio r7,r12,r9
+ st.d r6,res_ptr,0
+
+ bb0 0,size,Lret1
+/* Add last limb */
+ ld r10,s1_ptr,8
+ ld r8,s2_ptr,8
+ addu.cio r6,r10,r8
+ st r6,res_ptr,8
+
+Lret1: jmp.n r1
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+
+L1: xor r12,s1_ptr,res_ptr
+ bb1 2,r12,L2
+; ** V1b **
+ or r12,r0,s2_ptr
+ or s2_ptr,r0,s1_ptr
+ or s1_ptr,r0,r12
+ br L0
+
+; ** V2 **
+/* If we come here, the alignment of s1_ptr and res_ptr as well as the
+ alignment of s2_ptr and res_ptr differ. Since there are only two ways
+ things can be aligned (that we care about) we now know that the alignment
+ of s1_ptr and s2_ptr are the same. */
+
+L2: cmp r12,size,1
+ bb1 eq,r12,Ljone
+ bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ addu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+
+L_v2: subu size,size,8
+ bcnd lt0,size,Lfin2
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop2: subu size,size,8
+ ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ addu.cio r8,r8,r6
+ st r8,res_ptr,0
+ addu.cio r9,r9,r7
+ st r9,res_ptr,4
+ ld.d r8,s1_ptr,8
+ ld.d r6,s2_ptr,8
+ addu.cio r8,r8,r6
+ st r8,res_ptr,8
+ addu.cio r9,r9,r7
+ st r9,res_ptr,12
+ ld.d r8,s1_ptr,16
+ ld.d r6,s2_ptr,16
+ addu.cio r8,r8,r6
+ st r8,res_ptr,16
+ addu.cio r9,r9,r7
+ st r9,res_ptr,20
+ ld.d r8,s1_ptr,24
+ ld.d r6,s2_ptr,24
+ addu.cio r8,r8,r6
+ st r8,res_ptr,24
+ addu.cio r9,r9,r7
+ st r9,res_ptr,28
+ addu s1_ptr,s1_ptr,32
+ addu s2_ptr,s2_ptr,32
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop2
+
+Lfin2: addu size,size,8-2
+ bcnd lt0,size,Lend2
+Loope2: ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ addu.cio r8,r8,r6
+ st r8,res_ptr,0
+ addu.cio r9,r9,r7
+ st r9,res_ptr,4
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope2
+Lend2: bb0 0,size,Lret2
+/* Add last limb */
+Ljone: ld r10,s1_ptr,0
+ ld r8,s2_ptr,0
+ addu.cio r6,r10,r8
+ st r6,res_ptr,0
+
+Lret2: jmp.n r1
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
diff --git a/rts/gmp/mpn/m88k/mc88110/addmul_1.s b/rts/gmp/mpn/m88k/mc88110/addmul_1.s
new file mode 100644
index 0000000000..7d97c87c79
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/addmul_1.s
@@ -0,0 +1,61 @@
+; mc88110 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; size r4
+; s2_limb r5
+
+ text
+ align 16
+ global ___gmpn_addmul_1
+___gmpn_addmul_1:
+ lda r3,r3[r4]
+ lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval
+ subu r4,r0,r4
+ addu.co r2,r0,r0 ; r2 = cy = 0
+
+ ld r6,r3[r4]
+ addu r4,r4,1
+ subu r8,r8,4
+ bcnd.n eq0,r4,Lend
+ mulu.d r10,r6,r5
+
+Loop: ld r7,r8[r4]
+ ld r6,r3[r4]
+ addu.cio r9,r11,r2
+ addu.ci r2,r10,r0
+ addu.co r9,r9,r7
+ st r9,r8[r4]
+ addu r4,r4,1
+ mulu.d r10,r6,r5
+ bcnd ne0,r4,Loop
+
+Lend: ld r7,r8,0
+ addu.cio r9,r11,r2
+ addu.ci r2,r10,r0
+ addu.co r9,r9,r7
+ st r9,r8,0
+ jmp.n r1
+ addu.ci r2,r2,r0
diff --git a/rts/gmp/mpn/m88k/mc88110/mul_1.s b/rts/gmp/mpn/m88k/mc88110/mul_1.s
new file mode 100644
index 0000000000..b8483afa91
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/mul_1.s
@@ -0,0 +1,59 @@
+; mc88110 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; size r4
+; s2_limb r5
+
+ text
+ align 16
+ global ___gmpn_mul_1
+___gmpn_mul_1:
+ ; Make S1_PTR and RES_PTR point at the end of their blocks
+ ; and negate SIZE.
+ lda r3,r3[r4]
+ lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval
+ subu r4,r0,r4
+
+ addu.co r2,r0,r0 ; r2 = cy = 0
+
+ ld r6,r3[r4]
+ addu r4,r4,1
+ mulu.d r10,r6,r5
+ bcnd.n eq0,r4,Lend
+ subu r8,r8,8
+
+Loop: ld r6,r3[r4]
+ addu.cio r9,r11,r2
+ or r2,r10,r0 ; could be avoided if unrolled
+ addu r4,r4,1
+ mulu.d r10,r6,r5
+ bcnd.n ne0,r4,Loop
+ st r9,r8[r4]
+
+Lend: addu.cio r9,r11,r2
+ st r9,r8,4
+ jmp.n r1
+ addu.ci r2,r10,r0
diff --git a/rts/gmp/mpn/m88k/mc88110/sub_n.S b/rts/gmp/mpn/m88k/mc88110/sub_n.S
new file mode 100644
index 0000000000..715a3faf25
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/sub_n.S
@@ -0,0 +1,276 @@
+; mc88110 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+#define res_ptr r2
+#define s1_ptr r3
+#define s2_ptr r4
+#define size r5
+
+#include "sysdep.h"
+
+ text
+ align 16
+ global C_SYMBOL_NAME(__gmpn_sub_n)
+C_SYMBOL_NAME(__gmpn_sub_n):
+ subu.co r0,r0,r0 ; set cy flag
+ xor r12,s2_ptr,res_ptr
+ bb1 2,r12,L1
+; ** V1a **
+L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ subu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+L_v1: cmp r12,size,2
+ bb1 lt,r12,Lend2
+
+ ld r10,s1_ptr,0
+ ld r12,s1_ptr,4
+ ld.d r8,s2_ptr,0
+ subu size,size,10
+ bcnd lt0,size,Lfin1
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop1: subu size,size,8
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,16
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,20
+ ld.d r8,s2_ptr,16
+ st.d r6,res_ptr,8
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,24
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,28
+ ld.d r8,s2_ptr,24
+ st.d r6,res_ptr,16
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,32
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,36
+ addu s1_ptr,s1_ptr,32
+ ld.d r8,s2_ptr,32
+ addu s2_ptr,s2_ptr,32
+ st.d r6,res_ptr,24
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop1
+
+Lfin1: addu size,size,8-2
+ bcnd lt0,size,Lend1
+/* Add blocks of 2 limbs until less than 2 limbs remain */
+Loope1: subu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope1
+Lend1: subu.cio r6,r10,r8
+ subu.cio r7,r12,r9
+ st.d r6,res_ptr,0
+
+ bb0 0,size,Lret1
+/* Add last limb */
+ ld r10,s1_ptr,8
+ ld r8,s2_ptr,8
+ subu.cio r6,r10,r8
+ st r6,res_ptr,8
+
+Lret1: addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
+
+L1: xor r12,s1_ptr,res_ptr
+ bb1 2,r12,L2
+; ** V1b **
+ bb0 2,res_ptr,L_v1b ; branch if res_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s1_ptr */
+ ld r10,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ ld r8,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ subu size,size,1
+ subu.co r6,r8,r10
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+L_v1b: cmp r12,size,2
+ bb1 lt,r12,Lend2
+
+ ld r10,s2_ptr,0
+ ld r12,s2_ptr,4
+ ld.d r8,s1_ptr,0
+ subu size,size,10
+ bcnd lt0,size,Lfin1b
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop1b: subu size,size,8
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,8
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,12
+ ld.d r8,s1_ptr,8
+ st.d r6,res_ptr,0
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,16
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,20
+ ld.d r8,s1_ptr,16
+ st.d r6,res_ptr,8
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,24
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,28
+ ld.d r8,s1_ptr,24
+ st.d r6,res_ptr,16
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,32
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,36
+ addu s2_ptr,s2_ptr,32
+ ld.d r8,s1_ptr,32
+ addu s1_ptr,s1_ptr,32
+ st.d r6,res_ptr,24
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop1b
+
+Lfin1b: addu size,size,8-2
+ bcnd lt0,size,Lend1b
+/* Add blocks of 2 limbs until less than 2 limbs remain */
+Loope1b:subu.cio r6,r8,r10
+ ld r10,s2_ptr,8
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,12
+ ld.d r8,s1_ptr,8
+ st.d r6,res_ptr,0
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope1b
+Lend1b: subu.cio r6,r8,r10
+ subu.cio r7,r9,r12
+ st.d r6,res_ptr,0
+
+ bb0 0,size,Lret1b
+/* Add last limb */
+ ld r10,s2_ptr,8
+ ld r8,s1_ptr,8
+ subu.cio r6,r8,r10
+ st r6,res_ptr,8
+
+Lret1b: addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
+
+; ** V2 **
+/* If we come here, the alignment of s1_ptr and res_ptr as well as the
+ alignment of s2_ptr and res_ptr differ. Since there are only two ways
+ things can be aligned (that we care about) we now know that the alignment
+ of s1_ptr and s2_ptr are the same. */
+
+L2: cmp r12,size,1
+ bb1 eq,r12,Ljone
+ bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ subu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+
+L_v2: subu size,size,8
+ bcnd lt0,size,Lfin2
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop2: subu size,size,8
+ ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ subu.cio r8,r8,r6
+ st r8,res_ptr,0
+ subu.cio r9,r9,r7
+ st r9,res_ptr,4
+ ld.d r8,s1_ptr,8
+ ld.d r6,s2_ptr,8
+ subu.cio r8,r8,r6
+ st r8,res_ptr,8
+ subu.cio r9,r9,r7
+ st r9,res_ptr,12
+ ld.d r8,s1_ptr,16
+ ld.d r6,s2_ptr,16
+ subu.cio r8,r8,r6
+ st r8,res_ptr,16
+ subu.cio r9,r9,r7
+ st r9,res_ptr,20
+ ld.d r8,s1_ptr,24
+ ld.d r6,s2_ptr,24
+ subu.cio r8,r8,r6
+ st r8,res_ptr,24
+ subu.cio r9,r9,r7
+ st r9,res_ptr,28
+ addu s1_ptr,s1_ptr,32
+ addu s2_ptr,s2_ptr,32
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop2
+
+Lfin2: addu size,size,8-2
+ bcnd lt0,size,Lend2
+Loope2: ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ subu.cio r8,r8,r6
+ st r8,res_ptr,0
+ subu.cio r9,r9,r7
+ st r9,res_ptr,4
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope2
+Lend2: bb0 0,size,Lret2
+/* Add last limb */
+Ljone: ld r10,s1_ptr,0
+ ld r8,s2_ptr,0
+ subu.cio r6,r10,r8
+ st r6,res_ptr,0
+
+Lret2: addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
diff --git a/rts/gmp/mpn/m88k/mul_1.s b/rts/gmp/mpn/m88k/mul_1.s
new file mode 100644
index 0000000000..06370837ef
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mul_1.s
@@ -0,0 +1,127 @@
+; mc88100 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; size r4
+; s2_limb r5
+
+; Common overhead is about 11 cycles/invocation.
+
+; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb. (The
+; pipeline stalls 2 cycles due to WB contention.)
+
+; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb. (The
+; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.)
+
+; To enhance speed:
+; 1. Unroll main loop 4-8 times.
+; 2. Schedule code to avoid WB contention. It might be tempting to move the
+; ld instruction in the loops down to save 2 cycles (less WB contention),
+; but that looses because the ultimate value will be read from outside
+; the allocated space. But if we handle the ultimate multiplication in
+; the tail, we can do this.
+; 3. Make the multiplication with less instructions. I think the code for
+; (S2_LIMB >= 0x10000) is not minimal.
+; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
+; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
+; cycles/limb. (Assuming infinite unrolling.)
+
+ text
+ align 16
+ global ___gmpn_mul_1
+___gmpn_mul_1:
+
+ ; Make S1_PTR and RES_PTR point at the end of their blocks
+ ; and negate SIZE.
+ lda r3,r3[r4]
+ lda r6,r2[r4] ; RES_PTR in r6 since r2 is retval
+ subu r4,r0,r4
+
+ addu.co r2,r0,r0 ; r2 = cy = 0
+ ld r9,r3[r4]
+ mask r7,r5,0xffff ; r7 = lo(S2_LIMB)
+ extu r8,r5,16 ; r8 = hi(S2_LIMB)
+ bcnd.n eq0,r8,Lsmall ; jump if (hi(S2_LIMB) == 0)
+ subu r6,r6,4
+
+; General code for any value of S2_LIMB.
+
+ ; Make a stack frame and save r25 and r26
+ subu r31,r31,16
+ st.d r25,r31,8
+
+ ; Enter the loop in the middle
+ br.n L1
+ addu r4,r4,1
+
+Loop: ld r9,r3[r4]
+ st r26,r6[r4]
+; bcnd ne0,r0,0 ; bubble
+ addu r4,r4,1
+L1: mul r26,r9,r5 ; low word of product mul_1 WB ld
+ mask r12,r9,0xffff ; r12 = lo(s1_limb) mask_1
+ mul r11,r12,r7 ; r11 = prod_0 mul_2 WB mask_1
+ mul r10,r12,r8 ; r10 = prod_1a mul_3
+ extu r13,r9,16 ; r13 = hi(s1_limb) extu_1 WB mul_1
+ mul r12,r13,r7 ; r12 = prod_1b mul_4 WB extu_1
+ mul r25,r13,r8 ; r25 = prod_2 mul_5 WB mul_2
+ extu r11,r11,16 ; r11 = hi(prod_0) extu_2 WB mul_3
+ addu r10,r10,r11 ; addu_1 WB extu_2
+; bcnd ne0,r0,0 ; bubble WB addu_1
+ addu.co r10,r10,r12 ; WB mul_4
+ mask.u r10,r10,0xffff ; move the 16 most significant bits...
+ addu.ci r10,r10,r0 ; ...to the low half of the word...
+ rot r10,r10,16 ; ...and put carry in pos 16.
+ addu.co r26,r26,r2 ; add old carry limb
+ bcnd.n ne0,r4,Loop
+ addu.ci r2,r25,r10 ; compute new carry limb
+
+ st r26,r6[r4]
+ ld.d r25,r31,8
+ jmp.n r1
+ addu r31,r31,16
+
+; Fast code for S2_LIMB < 0x10000
+Lsmall:
+ ; Enter the loop in the middle
+ br.n SL1
+ addu r4,r4,1
+
+SLoop: ld r9,r3[r4] ;
+ st r8,r6[r4] ;
+ addu r4,r4,1 ;
+SL1: mul r8,r9,r5 ; low word of product
+ mask r12,r9,0xffff ; r12 = lo(s1_limb)
+ extu r13,r9,16 ; r13 = hi(s1_limb)
+ mul r11,r12,r7 ; r11 = prod_0
+ mul r12,r13,r7 ; r12 = prod_1b
+ addu.cio r8,r8,r2 ; add old carry limb
+ extu r10,r11,16 ; r11 = hi(prod_0)
+ addu r10,r10,r12 ;
+ bcnd.n ne0,r4,SLoop
+ extu r2,r10,16 ; r2 = new carry limb
+
+ jmp.n r1
+ st r8,r6[r4]
diff --git a/rts/gmp/mpn/m88k/sub_n.s b/rts/gmp/mpn/m88k/sub_n.s
new file mode 100644
index 0000000000..2fd345a135
--- /dev/null
+++ b/rts/gmp/mpn/m88k/sub_n.s
@@ -0,0 +1,106 @@
+; mc88100 __gmpn_sub -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; s2_ptr r4
+; size r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention. As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+ text
+ align 16
+ global ___gmpn_sub_n
+___gmpn_sub_n:
+ ld r6,r3,0 ; read first limb from s1_ptr
+ extu r10,r5,3
+ ld r7,r4,0 ; read first limb from s2_ptr
+
+ subu r5,r0,r5
+ mak r5,r5,3<4>
+ bcnd.n eq0,r5,Lzero
+ subu.co r0,r0,r0 ; initialize carry
+
+ or r12,r0,lo16(Lbase)
+ or.u r12,r12,hi16(Lbase)
+ addu r12,r12,r5 ; r12 is address for entering in loop
+
+ extu r5,r5,2 ; divide by 4
+ subu r2,r2,r5 ; adjust res_ptr
+ subu r3,r3,r5 ; adjust s1_ptr
+ subu r4,r4,r5 ; adjust s2_ptr
+
+ or r8,r6,r0
+
+ jmp.n r12
+ or r9,r7,r0
+
+Loop: addu r3,r3,32
+ st r8,r2,28
+ addu r4,r4,32
+ ld r6,r3,0
+ addu r2,r2,32
+ ld r7,r4,0
+Lzero: subu r10,r10,1 ; subtract 0 + 8r limbs (adj loop cnt)
+Lbase: ld r8,r3,4
+ subu.cio r6,r6,r7
+ ld r9,r4,4
+ st r6,r2,0
+ ld r6,r3,8 ; subtract 7 + 8r limbs
+ subu.cio r8,r8,r9
+ ld r7,r4,8
+ st r8,r2,4
+ ld r8,r3,12 ; subtract 6 + 8r limbs
+ subu.cio r6,r6,r7
+ ld r9,r4,12
+ st r6,r2,8
+ ld r6,r3,16 ; subtract 5 + 8r limbs
+ subu.cio r8,r8,r9
+ ld r7,r4,16
+ st r8,r2,12
+ ld r8,r3,20 ; subtract 4 + 8r limbs
+ subu.cio r6,r6,r7
+ ld r9,r4,20
+ st r6,r2,16
+ ld r6,r3,24 ; subtract 3 + 8r limbs
+ subu.cio r8,r8,r9
+ ld r7,r4,24
+ st r8,r2,20
+ ld r8,r3,28 ; subtract 2 + 8r limbs
+ subu.cio r6,r6,r7
+ ld r9,r4,28
+ st r6,r2,24
+ bcnd.n ne0,r10,Loop ; subtract 1 + 8r limbs
+ subu.cio r8,r8,r9
+
+ st r8,r2,28 ; store most significant limb
+
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
diff --git a/rts/gmp/mpn/mips2/add_n.s b/rts/gmp/mpn/mips2/add_n.s
new file mode 100644
index 0000000000..5c3c7fc8a1
--- /dev/null
+++ b/rts/gmp/mpn/mips2/add_n.s
@@ -0,0 +1,120 @@
+ # MIPS2 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_add_n
+ .ent __gmpn_add_n
+__gmpn_add_n:
+ .set noreorder
+ .set nomacro
+
+ lw $10,0($5)
+ lw $11,0($6)
+
+ addiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ subu $7,$7,$9
+
+.Loop0: addiu $9,$9,-1
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,0($4)
+ or $2,$2,$8
+
+ addiu $5,$5,4
+ addiu $6,$6,4
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ addiu $4,$4,4
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: addiu $7,$7,-4
+
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,0($4)
+ or $2,$2,$8
+
+ lw $10,8($5)
+ addu $13,$13,$2
+ lw $11,8($6)
+ sltu $8,$13,$2
+ addu $13,$12,$13
+ sltu $2,$13,$12
+ sw $13,4($4)
+ or $2,$2,$8
+
+ lw $12,12($5)
+ addu $11,$11,$2
+ lw $13,12($6)
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,8($4)
+ or $2,$2,$8
+
+ lw $10,16($5)
+ addu $13,$13,$2
+ lw $11,16($6)
+ sltu $8,$13,$2
+ addu $13,$12,$13
+ sltu $2,$13,$12
+ sw $13,12($4)
+ or $2,$2,$8
+
+ addiu $5,$5,16
+ addiu $6,$6,16
+
+ bne $7,$0,.Loop
+ addiu $4,$4,16
+
+.Lend: addu $11,$11,$2
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_add_n
diff --git a/rts/gmp/mpn/mips2/addmul_1.s b/rts/gmp/mpn/mips2/addmul_1.s
new file mode 100644
index 0000000000..1e5037751b
--- /dev/null
+++ b/rts/gmp/mpn/mips2/addmul_1.s
@@ -0,0 +1,97 @@
+ # MIPS __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_addmul_1
+ .ent __gmpn_addmul_1
+__gmpn_addmul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ lw $8,0($5)
+
+ # warm up phase 1
+ addiu $5,$5,4
+ multu $8,$7
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC1
+ lw $8,0($5) # load new s1 limb as early as possible
+
+Loop: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addiu $5,$5,4
+ addu $3,$3,$2 # add old carry limb to low product limb
+ multu $8,$7
+ lw $8,0($5) # load new s1 limb as early as possible
+ addiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ addu $3,$10,$3
+ sltu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ bne $6,$0,Loop
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ multu $8,$7
+ addu $3,$10,$3
+ sltu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ addu $3,$10,$3
+ sltu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ j $31
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_addmul_1
diff --git a/rts/gmp/mpn/mips2/lshift.s b/rts/gmp/mpn/mips2/lshift.s
new file mode 100644
index 0000000000..2ca3a3c800
--- /dev/null
+++ b/rts/gmp/mpn/mips2/lshift.s
@@ -0,0 +1,95 @@
+ # MIPS2 __gmpn_lshift --
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_lshift
+ .ent __gmpn_lshift
+__gmpn_lshift:
+ .set noreorder
+ .set nomacro
+
+ sll $2,$6,2
+ addu $5,$5,$2 # make r5 point at end of src
+ lw $10,-4($5) # load first limb
+ subu $13,$0,$7
+ addu $4,$4,$2 # make r4 point at end of res
+ addiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ srl $2,$10,$13 # compute function result
+
+ subu $6,$6,$9
+
+.Loop0: lw $3,-8($5)
+ addiu $4,$4,-4
+ addiu $5,$5,-4
+ addiu $9,$9,-1
+ sll $11,$10,$7
+ srl $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sw $8,0($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: lw $3,-8($5)
+ addiu $4,$4,-16
+ addiu $6,$6,-4
+ sll $11,$10,$7
+ srl $12,$3,$13
+
+ lw $10,-12($5)
+ sll $14,$3,$7
+ or $8,$11,$12
+ sw $8,12($4)
+ srl $9,$10,$13
+
+ lw $3,-16($5)
+ sll $11,$10,$7
+ or $8,$14,$9
+ sw $8,8($4)
+ srl $12,$3,$13
+
+ lw $10,-20($5)
+ sll $14,$3,$7
+ or $8,$11,$12
+ sw $8,4($4)
+ srl $9,$10,$13
+
+ addiu $5,$5,-16
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sw $8,0($4)
+
+.Lend: sll $8,$10,$7
+ j $31
+ sw $8,-4($4)
+ .end __gmpn_lshift
diff --git a/rts/gmp/mpn/mips2/mul_1.s b/rts/gmp/mpn/mips2/mul_1.s
new file mode 100644
index 0000000000..ea8aa26809
--- /dev/null
+++ b/rts/gmp/mpn/mips2/mul_1.s
@@ -0,0 +1,85 @@
+ # MIPS __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_mul_1
+ .ent __gmpn_mul_1
+__gmpn_mul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ lw $8,0($5)
+
+ # warm up phase 1
+ addiu $5,$5,4
+ multu $8,$7
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC1
+ lw $8,0($5) # load new s1 limb as early as possible
+
+Loop: mflo $10
+ mfhi $9
+ addiu $5,$5,4
+ addu $10,$10,$2 # add old carry limb to low product limb
+ multu $8,$7
+ lw $8,0($5) # load new s1 limb as early as possible
+ addiu $6,$6,-1 # decrement loop counter
+ sltu $2,$10,$2 # carry from previous addition -> $2
+ sw $10,0($4)
+ addiu $4,$4,4
+ bne $6,$0,Loop
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: mflo $10
+ mfhi $9
+ addu $10,$10,$2
+ sltu $2,$10,$2
+ multu $8,$7
+ sw $10,0($4)
+ addiu $4,$4,4
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: mflo $10
+ mfhi $9
+ addu $10,$10,$2
+ sltu $2,$10,$2
+ sw $10,0($4)
+ j $31
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_mul_1
diff --git a/rts/gmp/mpn/mips2/rshift.s b/rts/gmp/mpn/mips2/rshift.s
new file mode 100644
index 0000000000..37c8f39cb4
--- /dev/null
+++ b/rts/gmp/mpn/mips2/rshift.s
@@ -0,0 +1,92 @@
+ # MIPS2 __gmpn_rshift --
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_rshift
+ .ent __gmpn_rshift
+__gmpn_rshift:
+ .set noreorder
+ .set nomacro
+
+ lw $10,0($5) # load first limb
+ subu $13,$0,$7
+ addiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ sll $2,$10,$13 # compute function result
+
+ subu $6,$6,$9
+
+.Loop0: lw $3,4($5)
+ addiu $4,$4,4
+ addiu $5,$5,4
+ addiu $9,$9,-1
+ srl $11,$10,$7
+ sll $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sw $8,-4($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: lw $3,4($5)
+ addiu $4,$4,16
+ addiu $6,$6,-4
+ srl $11,$10,$7
+ sll $12,$3,$13
+
+ lw $10,8($5)
+ srl $14,$3,$7
+ or $8,$11,$12
+ sw $8,-16($4)
+ sll $9,$10,$13
+
+ lw $3,12($5)
+ srl $11,$10,$7
+ or $8,$14,$9
+ sw $8,-12($4)
+ sll $12,$3,$13
+
+ lw $10,16($5)
+ srl $14,$3,$7
+ or $8,$11,$12
+ sw $8,-8($4)
+ sll $9,$10,$13
+
+ addiu $5,$5,16
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sw $8,-4($4)
+
+.Lend: srl $8,$10,$7
+ j $31
+ sw $8,0($4)
+ .end __gmpn_rshift
diff --git a/rts/gmp/mpn/mips2/sub_n.s b/rts/gmp/mpn/mips2/sub_n.s
new file mode 100644
index 0000000000..51d34f3ac3
--- /dev/null
+++ b/rts/gmp/mpn/mips2/sub_n.s
@@ -0,0 +1,120 @@
+ # MIPS2 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_sub_n
+ .ent __gmpn_sub_n
+__gmpn_sub_n:
+ .set noreorder
+ .set nomacro
+
+ lw $10,0($5)
+ lw $11,0($6)
+
+ addiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ subu $7,$7,$9
+
+.Loop0: addiu $9,$9,-1
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,0($4)
+ or $2,$2,$8
+
+ addiu $5,$5,4
+ addiu $6,$6,4
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ addiu $4,$4,4
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: addiu $7,$7,-4
+
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,0($4)
+ or $2,$2,$8
+
+ lw $10,8($5)
+ addu $13,$13,$2
+ lw $11,8($6)
+ sltu $8,$13,$2
+ subu $13,$12,$13
+ sltu $2,$12,$13
+ sw $13,4($4)
+ or $2,$2,$8
+
+ lw $12,12($5)
+ addu $11,$11,$2
+ lw $13,12($6)
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,8($4)
+ or $2,$2,$8
+
+ lw $10,16($5)
+ addu $13,$13,$2
+ lw $11,16($6)
+ sltu $8,$13,$2
+ subu $13,$12,$13
+ sltu $2,$12,$13
+ sw $13,12($4)
+ or $2,$2,$8
+
+ addiu $5,$5,16
+ addiu $6,$6,16
+
+ bne $7,$0,.Loop
+ addiu $4,$4,16
+
+.Lend: addu $11,$11,$2
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_sub_n
diff --git a/rts/gmp/mpn/mips2/submul_1.s b/rts/gmp/mpn/mips2/submul_1.s
new file mode 100644
index 0000000000..495dea3ba2
--- /dev/null
+++ b/rts/gmp/mpn/mips2/submul_1.s
@@ -0,0 +1,97 @@
+ # MIPS __gmpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_submul_1
+ .ent __gmpn_submul_1
+__gmpn_submul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ lw $8,0($5)
+
+ # warm up phase 1
+ addiu $5,$5,4
+ multu $8,$7
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC1
+ lw $8,0($5) # load new s1 limb as early as possible
+
+Loop: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addiu $5,$5,4
+ addu $3,$3,$2 # add old carry limb to low product limb
+ multu $8,$7
+ lw $8,0($5) # load new s1 limb as early as possible
+ addiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ subu $3,$10,$3
+ sgtu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ bne $6,$0,Loop
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ multu $8,$7
+ subu $3,$10,$3
+ sgtu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ subu $3,$10,$3
+ sgtu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ j $31
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_submul_1
diff --git a/rts/gmp/mpn/mips2/umul.s b/rts/gmp/mpn/mips2/umul.s
new file mode 100644
index 0000000000..40e847614c
--- /dev/null
+++ b/rts/gmp/mpn/mips2/umul.s
@@ -0,0 +1,30 @@
+ # Copyright (C) 1999 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+ .text
+ .align 2
+ .globl __umul_ppmm
+ .ent __umul_ppmm
+__umul_ppmm:
+ multu $5,$6
+ mflo $3
+ mfhi $2
+ sw $3,0($4)
+ j $31
+ .end __umul_ppmm
diff --git a/rts/gmp/mpn/mips3/README b/rts/gmp/mpn/mips3/README
new file mode 100644
index 0000000000..e94b2c7460
--- /dev/null
+++ b/rts/gmp/mpn/mips3/README
@@ -0,0 +1,23 @@
+This directory contains mpn functions optimized for MIPS3. Example of
+processors that implement MIPS3 are R4000, R4400, R4600, R4700, and R8000.
+
+RELEVANT OPTIMIZATION ISSUES
+
+1. On the R4000 and R4400, branches, both the plain and the "likely" ones,
+ take 3 cycles to execute. (The fastest possible loop will take 4 cycles,
+ because of the delay insn.)
+
+ On the R4600, branches take a single cycle
+
+ On the R8000, branches often take no noticeable cycles, as they are
+ executed in a separate functional unit.
+
+2. The R4000 and R4400 have a load latency of 4 cycles.
+
+3. On the R4000 and R4400, multiplies take a data-dependent number of
+ cycles, contrary to the SGI documentation. There seem to be 3 or 4
+ possible latencies.
+
+STATUS
+
+Good...
diff --git a/rts/gmp/mpn/mips3/add_n.s b/rts/gmp/mpn/mips3/add_n.s
new file mode 100644
index 0000000000..adad0beaef
--- /dev/null
+++ b/rts/gmp/mpn/mips3/add_n.s
@@ -0,0 +1,120 @@
+ # MIPS3 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_add_n
+ .ent __gmpn_add_n
+__gmpn_add_n:
+ .set noreorder
+ .set nomacro
+
+ ld $10,0($5)
+ ld $11,0($6)
+
+ daddiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ dsubu $7,$7,$9
+
+.Loop0: daddiu $9,$9,-1
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,0($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,8
+ daddiu $6,$6,8
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ daddiu $4,$4,8
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: daddiu $7,$7,-4
+
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,0($4)
+ or $2,$2,$8
+
+ ld $10,16($5)
+ daddu $13,$13,$2
+ ld $11,16($6)
+ sltu $8,$13,$2
+ daddu $13,$12,$13
+ sltu $2,$13,$12
+ sd $13,8($4)
+ or $2,$2,$8
+
+ ld $12,24($5)
+ daddu $11,$11,$2
+ ld $13,24($6)
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,16($4)
+ or $2,$2,$8
+
+ ld $10,32($5)
+ daddu $13,$13,$2
+ ld $11,32($6)
+ sltu $8,$13,$2
+ daddu $13,$12,$13
+ sltu $2,$13,$12
+ sd $13,24($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,32
+ daddiu $6,$6,32
+
+ bne $7,$0,.Loop
+ daddiu $4,$4,32
+
+.Lend: daddu $11,$11,$2
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_add_n
diff --git a/rts/gmp/mpn/mips3/addmul_1.s b/rts/gmp/mpn/mips3/addmul_1.s
new file mode 100644
index 0000000000..d390e2298e
--- /dev/null
+++ b/rts/gmp/mpn/mips3/addmul_1.s
@@ -0,0 +1,97 @@
+ # MIPS3 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_addmul_1
+ .ent __gmpn_addmul_1
+__gmpn_addmul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ ld $8,0($5)
+
+ # warm up phase 1
+ daddiu $5,$5,8
+ dmultu $8,$7
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC1
+ ld $8,0($5) # load new s1 limb as early as possible
+
+Loop: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddiu $5,$5,8
+ daddu $3,$3,$2 # add old carry limb to low product limb
+ dmultu $8,$7
+ ld $8,0($5) # load new s1 limb as early as possible
+ daddiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ daddu $3,$10,$3
+ sltu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ bne $6,$0,Loop
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ dmultu $8,$7
+ daddu $3,$10,$3
+ sltu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ daddu $3,$10,$3
+ sltu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ j $31
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_addmul_1
diff --git a/rts/gmp/mpn/mips3/gmp-mparam.h b/rts/gmp/mpn/mips3/gmp-mparam.h
new file mode 100644
index 0000000000..656e90c7b0
--- /dev/null
+++ b/rts/gmp/mpn/mips3/gmp-mparam.h
@@ -0,0 +1,58 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the R10000 using the system cc. */
+/* Generated by tuneup.c, 2000-07-25. */
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 16
+#endif
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 32
+#endif
+
+/* Suppressed the TOOM3 values as they looked absolutely crazy
+   (698 and 21 respectively) */
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 58
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 54
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 82
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 159
+#endif
diff --git a/rts/gmp/mpn/mips3/lshift.s b/rts/gmp/mpn/mips3/lshift.s
new file mode 100644
index 0000000000..372606fddf
--- /dev/null
+++ b/rts/gmp/mpn/mips3/lshift.s
@@ -0,0 +1,95 @@
+ # MIPS3 __gmpn_lshift --
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_lshift
+ .ent __gmpn_lshift
+__gmpn_lshift:
+ .set noreorder
+ .set nomacro
+
+ dsll $2,$6,3
+ daddu $5,$5,$2 # make r5 point at end of src
+ ld $10,-8($5) # load first limb
+ dsubu $13,$0,$7
+ daddu $4,$4,$2 # make r4 point at end of res
+ daddiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ dsrl $2,$10,$13 # compute function result
+
+ dsubu $6,$6,$9
+
+.Loop0: ld $3,-16($5)
+ daddiu $4,$4,-8
+ daddiu $5,$5,-8
+ daddiu $9,$9,-1
+ dsll $11,$10,$7
+ dsrl $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sd $8,0($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: ld $3,-16($5)
+ daddiu $4,$4,-32
+ daddiu $6,$6,-4
+ dsll $11,$10,$7
+ dsrl $12,$3,$13
+
+ ld $10,-24($5)
+ dsll $14,$3,$7
+ or $8,$11,$12
+ sd $8,24($4)
+ dsrl $9,$10,$13
+
+ ld $3,-32($5)
+ dsll $11,$10,$7
+ or $8,$14,$9
+ sd $8,16($4)
+ dsrl $12,$3,$13
+
+ ld $10,-40($5)
+ dsll $14,$3,$7
+ or $8,$11,$12
+ sd $8,8($4)
+ dsrl $9,$10,$13
+
+ daddiu $5,$5,-32
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sd $8,0($4)
+
+.Lend: dsll $8,$10,$7
+ j $31
+ sd $8,-8($4)
+ .end __gmpn_lshift
diff --git a/rts/gmp/mpn/mips3/mul_1.s b/rts/gmp/mpn/mips3/mul_1.s
new file mode 100644
index 0000000000..6659e2b4eb
--- /dev/null
+++ b/rts/gmp/mpn/mips3/mul_1.s
@@ -0,0 +1,85 @@
+ # MIPS3 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_mul_1
+ .ent __gmpn_mul_1
+__gmpn_mul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ ld $8,0($5)
+
+ # warm up phase 1
+ daddiu $5,$5,8
+ dmultu $8,$7
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC1
+ ld $8,0($5) # load new s1 limb as early as possible
+
+Loop: mflo $10
+ mfhi $9
+ daddiu $5,$5,8
+ daddu $10,$10,$2 # add old carry limb to low product limb
+ dmultu $8,$7
+ ld $8,0($5) # load new s1 limb as early as possible
+ daddiu $6,$6,-1 # decrement loop counter
+ sltu $2,$10,$2 # carry from previous addition -> $2
+ sd $10,0($4)
+ daddiu $4,$4,8
+ bne $6,$0,Loop
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: mflo $10
+ mfhi $9
+ daddu $10,$10,$2
+ sltu $2,$10,$2
+ dmultu $8,$7
+ sd $10,0($4)
+ daddiu $4,$4,8
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: mflo $10
+ mfhi $9
+ daddu $10,$10,$2
+ sltu $2,$10,$2
+ sd $10,0($4)
+ j $31
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_mul_1
diff --git a/rts/gmp/mpn/mips3/rshift.s b/rts/gmp/mpn/mips3/rshift.s
new file mode 100644
index 0000000000..59c7fd3492
--- /dev/null
+++ b/rts/gmp/mpn/mips3/rshift.s
@@ -0,0 +1,92 @@
+ # MIPS3 __gmpn_rshift --
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_rshift
+ .ent __gmpn_rshift
+__gmpn_rshift:
+ .set noreorder
+ .set nomacro
+
+ ld $10,0($5) # load first limb
+ dsubu $13,$0,$7
+ daddiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ dsll $2,$10,$13 # compute function result
+
+ dsubu $6,$6,$9
+
+.Loop0: ld $3,8($5)
+ daddiu $4,$4,8
+ daddiu $5,$5,8
+ daddiu $9,$9,-1
+ dsrl $11,$10,$7
+ dsll $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sd $8,-8($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: ld $3,8($5)
+ daddiu $4,$4,32
+ daddiu $6,$6,-4
+ dsrl $11,$10,$7
+ dsll $12,$3,$13
+
+ ld $10,16($5)
+ dsrl $14,$3,$7
+ or $8,$11,$12
+ sd $8,-32($4)
+ dsll $9,$10,$13
+
+ ld $3,24($5)
+ dsrl $11,$10,$7
+ or $8,$14,$9
+ sd $8,-24($4)
+ dsll $12,$3,$13
+
+ ld $10,32($5)
+ dsrl $14,$3,$7
+ or $8,$11,$12
+ sd $8,-16($4)
+ dsll $9,$10,$13
+
+ daddiu $5,$5,32
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sd $8,-8($4)
+
+.Lend: dsrl $8,$10,$7
+ j $31
+ sd $8,0($4)
+ .end __gmpn_rshift
diff --git a/rts/gmp/mpn/mips3/sub_n.s b/rts/gmp/mpn/mips3/sub_n.s
new file mode 100644
index 0000000000..c57c824b04
--- /dev/null
+++ b/rts/gmp/mpn/mips3/sub_n.s
@@ -0,0 +1,120 @@
+ # MIPS3 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_sub_n
+ .ent __gmpn_sub_n
+__gmpn_sub_n:
+ .set noreorder
+ .set nomacro
+
+ ld $10,0($5)
+ ld $11,0($6)
+
+ daddiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ dsubu $7,$7,$9
+
+.Loop0: daddiu $9,$9,-1
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,0($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,8
+ daddiu $6,$6,8
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ daddiu $4,$4,8
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: daddiu $7,$7,-4
+
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,0($4)
+ or $2,$2,$8
+
+ ld $10,16($5)
+ daddu $13,$13,$2
+ ld $11,16($6)
+ sltu $8,$13,$2
+ dsubu $13,$12,$13
+ sltu $2,$12,$13
+ sd $13,8($4)
+ or $2,$2,$8
+
+ ld $12,24($5)
+ daddu $11,$11,$2
+ ld $13,24($6)
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,16($4)
+ or $2,$2,$8
+
+ ld $10,32($5)
+ daddu $13,$13,$2
+ ld $11,32($6)
+ sltu $8,$13,$2
+ dsubu $13,$12,$13
+ sltu $2,$12,$13
+ sd $13,24($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,32
+ daddiu $6,$6,32
+
+ bne $7,$0,.Loop
+ daddiu $4,$4,32
+
+.Lend: daddu $11,$11,$2
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_sub_n
diff --git a/rts/gmp/mpn/mips3/submul_1.s b/rts/gmp/mpn/mips3/submul_1.s
new file mode 100644
index 0000000000..531f9705a6
--- /dev/null
+++ b/rts/gmp/mpn/mips3/submul_1.s
@@ -0,0 +1,97 @@
+ # MIPS3 __gmpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_submul_1
+ .ent __gmpn_submul_1
+__gmpn_submul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ ld $8,0($5)
+
+ # warm up phase 1
+ daddiu $5,$5,8
+ dmultu $8,$7
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC1
+ ld $8,0($5) # load new s1 limb as early as possible
+
+Loop: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddiu $5,$5,8
+ daddu $3,$3,$2 # add old carry limb to low product limb
+ dmultu $8,$7
+ ld $8,0($5) # load new s1 limb as early as possible
+ daddiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ dsubu $3,$10,$3
+ sgtu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ bne $6,$0,Loop
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ dmultu $8,$7
+ dsubu $3,$10,$3
+ sgtu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ dsubu $3,$10,$3
+ sgtu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ j $31
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_submul_1
diff --git a/rts/gmp/mpn/mp_bases.c b/rts/gmp/mpn/mp_bases.c
new file mode 100644
index 0000000000..011c328c80
--- /dev/null
+++ b/rts/gmp/mpn/mp_bases.c
@@ -0,0 +1,550 @@
+/* __mp_bases -- Structure for conversion between internal binary
+ format and strings in base 2..255. The fields are explained in
+ gmp-impl.h.
+
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+#if BITS_PER_MP_LIMB == 32
+const struct bases __mp_bases[256] =
+{
+ /* 0 */ {0, 0.0, 0, 0},
+ /* 1 */ {0, 1e38, 0, 0},
+ /* 2 */ {32, 1.0000000000000000, 0x1, 0x0},
+ /* 3 */ {20, 0.6309297535714575, 0xcfd41b91, 0x3b563c24},
+ /* 4 */ {16, 0.5000000000000000, 0x2, 0x0},
+ /* 5 */ {13, 0.4306765580733931, 0x48c27395, 0xc25c2684},
+ /* 6 */ {12, 0.3868528072345416, 0x81bf1000, 0xf91bd1b6},
+ /* 7 */ {11, 0.3562071871080222, 0x75db9c97, 0x1607a2cb},
+ /* 8 */ {10, 0.3333333333333334, 0x3, 0x0},
+ /* 9 */ {10, 0.3154648767857287, 0xcfd41b91, 0x3b563c24},
+ /* 10 */ {9, 0.3010299956639811, 0x3b9aca00, 0x12e0be82},
+ /* 11 */ {9, 0.2890648263178878, 0x8c8b6d2b, 0xd24cde04},
+ /* 12 */ {8, 0.2789429456511298, 0x19a10000, 0x3fa39ab5},
+ /* 13 */ {8, 0.2702381544273197, 0x309f1021, 0x50f8ac5f},
+ /* 14 */ {8, 0.2626495350371936, 0x57f6c100, 0x74843b1e},
+ /* 15 */ {8, 0.2559580248098155, 0x98c29b81, 0xad0326c2},
+ /* 16 */ {8, 0.2500000000000000, 0x4, 0x0},
+ /* 17 */ {7, 0.2446505421182260, 0x18754571, 0x4ef0b6bd},
+ /* 18 */ {7, 0.2398124665681315, 0x247dbc80, 0xc0fc48a1},
+ /* 19 */ {7, 0.2354089133666382, 0x3547667b, 0x33838942},
+ /* 20 */ {7, 0.2313782131597592, 0x4c4b4000, 0xad7f29ab},
+ /* 21 */ {7, 0.2276702486969530, 0x6b5a6e1d, 0x313c3d15},
+ /* 22 */ {7, 0.2242438242175754, 0x94ace180, 0xb8cca9e0},
+ /* 23 */ {7, 0.2210647294575037, 0xcaf18367, 0x42ed6de9},
+ /* 24 */ {6, 0.2181042919855316, 0xb640000, 0x67980e0b},
+ /* 25 */ {6, 0.2153382790366965, 0xe8d4a51, 0x19799812},
+ /* 26 */ {6, 0.2127460535533632, 0x1269ae40, 0xbce85396},
+ /* 27 */ {6, 0.2103099178571525, 0x17179149, 0x62c103a9},
+ /* 28 */ {6, 0.2080145976765095, 0x1cb91000, 0x1d353d43},
+ /* 29 */ {6, 0.2058468324604344, 0x23744899, 0xce1decea},
+ /* 30 */ {6, 0.2037950470905062, 0x2b73a840, 0x790fc511},
+ /* 31 */ {6, 0.2018490865820999, 0x34e63b41, 0x35b865a0},
+ /* 32 */ {6, 0.2000000000000000, 0x5, 0x0},
+ /* 33 */ {6, 0.1982398631705605, 0x4cfa3cc1, 0xa9aed1b3},
+ /* 34 */ {6, 0.1965616322328226, 0x5c13d840, 0x63dfc229},
+ /* 35 */ {6, 0.1949590218937863, 0x6d91b519, 0x2b0fee30},
+ /* 36 */ {6, 0.1934264036172708, 0x81bf1000, 0xf91bd1b6},
+ /* 37 */ {6, 0.1919587200065601, 0x98ede0c9, 0xac89c3a9},
+ /* 38 */ {6, 0.1905514124267734, 0xb3773e40, 0x6d2c32fe},
+ /* 39 */ {6, 0.1892003595168700, 0xd1bbc4d1, 0x387907c9},
+ /* 40 */ {6, 0.1879018247091076, 0xf4240000, 0xc6f7a0b},
+ /* 41 */ {5, 0.1866524112389434, 0x6e7d349, 0x28928154},
+ /* 42 */ {5, 0.1854490234153689, 0x7ca30a0, 0x6e8629d},
+ /* 43 */ {5, 0.1842888331487062, 0x8c32bbb, 0xd373dca0},
+ /* 44 */ {5, 0.1831692509136336, 0x9d46c00, 0xa0b17895},
+ /* 45 */ {5, 0.1820879004699383, 0xaffacfd, 0x746811a5},
+ /* 46 */ {5, 0.1810425967800402, 0xc46bee0, 0x4da6500f},
+ /* 47 */ {5, 0.1800313266566926, 0xdab86ef, 0x2ba23582},
+ /* 48 */ {5, 0.1790522317510414, 0xf300000, 0xdb20a88},
+ /* 49 */ {5, 0.1781035935540111, 0x10d63af1, 0xe68d5ce4},
+ /* 50 */ {5, 0.1771838201355579, 0x12a05f20, 0xb7cdfd9d},
+ /* 51 */ {5, 0.1762914343888821, 0x1490aae3, 0x8e583933},
+ /* 52 */ {5, 0.1754250635819545, 0x16a97400, 0x697cc3ea},
+ /* 53 */ {5, 0.1745834300480449, 0x18ed2825, 0x48a5ca6c},
+ /* 54 */ {5, 0.1737653428714400, 0x1b5e4d60, 0x2b52db16},
+ /* 55 */ {5, 0.1729696904450771, 0x1dff8297, 0x111586a6},
+ /* 56 */ {5, 0.1721954337940981, 0x20d38000, 0xf31d2b36},
+ /* 57 */ {5, 0.1714416005739134, 0x23dd1799, 0xc8d76d19},
+ /* 58 */ {5, 0.1707072796637201, 0x271f35a0, 0xa2cb1eb4},
+ /* 59 */ {5, 0.1699916162869140, 0x2a9ce10b, 0x807c3ec3},
+ /* 60 */ {5, 0.1692938075987814, 0x2e593c00, 0x617ec8bf},
+ /* 61 */ {5, 0.1686130986895011, 0x3257844d, 0x45746cbe},
+ /* 62 */ {5, 0.1679487789570419, 0x369b13e0, 0x2c0aa273},
+ /* 63 */ {5, 0.1673001788101741, 0x3b27613f, 0x14f90805},
+ /* 64 */ {5, 0.1666666666666667, 0x6, 0x0},
+ /* 65 */ {5, 0.1660476462159378, 0x4528a141, 0xd9cf0829},
+ /* 66 */ {5, 0.1654425539190583, 0x4aa51420, 0xb6fc4841},
+ /* 67 */ {5, 0.1648508567221604, 0x50794633, 0x973054cb},
+ /* 68 */ {5, 0.1642720499620502, 0x56a94400, 0x7a1dbe4b},
+ /* 69 */ {5, 0.1637056554452156, 0x5d393975, 0x5f7fcd7f},
+ /* 70 */ {5, 0.1631512196835108, 0x642d7260, 0x47196c84},
+ /* 71 */ {5, 0.1626083122716341, 0x6b8a5ae7, 0x30b43635},
+ /* 72 */ {5, 0.1620765243931223, 0x73548000, 0x1c1fa5f6},
+ /* 73 */ {5, 0.1615554674429964, 0x7b908fe9, 0x930634a},
+ /* 74 */ {5, 0.1610447717564445, 0x84435aa0, 0xef7f4a3c},
+ /* 75 */ {5, 0.1605440854340214, 0x8d71d25b, 0xcf5552d2},
+ /* 76 */ {5, 0.1600530732548213, 0x97210c00, 0xb1a47c8e},
+ /* 77 */ {5, 0.1595714156699382, 0xa1563f9d, 0x9634b43e},
+ /* 78 */ {5, 0.1590988078692941, 0xac16c8e0, 0x7cd3817d},
+ /* 79 */ {5, 0.1586349589155960, 0xb768278f, 0x65536761},
+ /* 80 */ {5, 0.1581795909397823, 0xc3500000, 0x4f8b588e},
+ /* 81 */ {5, 0.1577324383928644, 0xcfd41b91, 0x3b563c24},
+ /* 82 */ {5, 0.1572932473495469, 0xdcfa6920, 0x28928154},
+ /* 83 */ {5, 0.1568617748594410, 0xeac8fd83, 0x1721bfb0},
+ /* 84 */ {5, 0.1564377883420716, 0xf9461400, 0x6e8629d},
+ /* 85 */ {4, 0.1560210650222250, 0x31c84b1, 0x491cc17c},
+ /* 86 */ {4, 0.1556113914024940, 0x342ab10, 0x3a11d83b},
+ /* 87 */ {4, 0.1552085627701551, 0x36a2c21, 0x2be074cd},
+ /* 88 */ {4, 0.1548123827357682, 0x3931000, 0x1e7a02e7},
+ /* 89 */ {4, 0.1544226628011101, 0x3bd5ee1, 0x11d10edd},
+ /* 90 */ {4, 0.1540392219542636, 0x3e92110, 0x5d92c68},
+ /* 91 */ {4, 0.1536618862898642, 0x4165ef1, 0xf50dbfb2},
+ /* 92 */ {4, 0.1532904886526781, 0x4452100, 0xdf9f1316},
+ /* 93 */ {4, 0.1529248683028321, 0x4756fd1, 0xcb52a684},
+ /* 94 */ {4, 0.1525648706011593, 0x4a75410, 0xb8163e97},
+ /* 95 */ {4, 0.1522103467132434, 0x4dad681, 0xa5d8f269},
+ /* 96 */ {4, 0.1518611533308632, 0x5100000, 0x948b0fcd},
+ /* 97 */ {4, 0.1515171524096389, 0x546d981, 0x841e0215},
+ /* 98 */ {4, 0.1511782109217764, 0x57f6c10, 0x74843b1e},
+ /* 99 */ {4, 0.1508442006228941, 0x5b9c0d1, 0x65b11e6e},
+ /* 100 */ {4, 0.1505149978319906, 0x5f5e100, 0x5798ee23},
+ /* 101 */ {4, 0.1501904832236879, 0x633d5f1, 0x4a30b99b},
+ /* 102 */ {4, 0.1498705416319474, 0x673a910, 0x3d6e4d94},
+ /* 103 */ {4, 0.1495550618645152, 0x6b563e1, 0x314825b0},
+ /* 104 */ {4, 0.1492439365274121, 0x6f91000, 0x25b55f2e},
+ /* 105 */ {4, 0.1489370618588283, 0x73eb721, 0x1aadaccb},
+ /* 106 */ {4, 0.1486343375718350, 0x7866310, 0x10294ba2},
+ /* 107 */ {4, 0.1483356667053617, 0x7d01db1, 0x620f8f6},
+ /* 108 */ {4, 0.1480409554829326, 0x81bf100, 0xf91bd1b6},
+ /* 109 */ {4, 0.1477501131786861, 0x869e711, 0xe6d37b2a},
+ /* 110 */ {4, 0.1474630519902391, 0x8ba0a10, 0xd55cff6e},
+ /* 111 */ {4, 0.1471796869179852, 0x90c6441, 0xc4ad2db2},
+ /* 112 */ {4, 0.1468999356504447, 0x9610000, 0xb4b985cf},
+ /* 113 */ {4, 0.1466237184553111, 0x9b7e7c1, 0xa5782bef},
+ /* 114 */ {4, 0.1463509580758620, 0xa112610, 0x96dfdd2a},
+ /* 115 */ {4, 0.1460815796324244, 0xa6cc591, 0x88e7e509},
+ /* 116 */ {4, 0.1458155105286054, 0xacad100, 0x7b8813d3},
+ /* 117 */ {4, 0.1455526803620167, 0xb2b5331, 0x6eb8b595},
+ /* 118 */ {4, 0.1452930208392428, 0xb8e5710, 0x627289db},
+ /* 119 */ {4, 0.1450364656948130, 0xbf3e7a1, 0x56aebc07},
+ /* 120 */ {4, 0.1447829506139581, 0xc5c1000, 0x4b66dc33},
+ /* 121 */ {4, 0.1445324131589439, 0xcc6db61, 0x4094d8a3},
+ /* 122 */ {4, 0.1442847926987864, 0xd345510, 0x3632f7a5},
+ /* 123 */ {4, 0.1440400303421672, 0xda48871, 0x2c3bd1f0},
+ /* 124 */ {4, 0.1437980688733775, 0xe178100, 0x22aa4d5f},
+ /* 125 */ {4, 0.1435588526911310, 0xe8d4a51, 0x19799812},
+ /* 126 */ {4, 0.1433223277500932, 0xf05f010, 0x10a523e5},
+ /* 127 */ {4, 0.1430884415049874, 0xf817e01, 0x828a237},
+ /* 128 */ {4, 0.1428571428571428, 0x7, 0x0},
+ /* 129 */ {4, 0.1426283821033600, 0x10818201, 0xf04ec452},
+ /* 130 */ {4, 0.1424021108869747, 0x11061010, 0xe136444a},
+ /* 131 */ {4, 0.1421782821510107, 0x118db651, 0xd2af9589},
+ /* 132 */ {4, 0.1419568500933153, 0x12188100, 0xc4b42a83},
+ /* 133 */ {4, 0.1417377701235801, 0x12a67c71, 0xb73dccf5},
+ /* 134 */ {4, 0.1415209988221527, 0x1337b510, 0xaa4698c5},
+ /* 135 */ {4, 0.1413064939005528, 0x13cc3761, 0x9dc8f729},
+ /* 136 */ {4, 0.1410942141636095, 0x14641000, 0x91bf9a30},
+ /* 137 */ {4, 0.1408841194731412, 0x14ff4ba1, 0x86257887},
+ /* 138 */ {4, 0.1406761707131039, 0x159df710, 0x7af5c98c},
+ /* 139 */ {4, 0.1404703297561400, 0x16401f31, 0x702c01a0},
+ /* 140 */ {4, 0.1402665594314587, 0x16e5d100, 0x65c3ceb1},
+ /* 141 */ {4, 0.1400648234939879, 0x178f1991, 0x5bb91502},
+ /* 142 */ {4, 0.1398650865947379, 0x183c0610, 0x5207ec23},
+ /* 143 */ {4, 0.1396673142523192, 0x18eca3c1, 0x48ac9c19},
+ /* 144 */ {4, 0.1394714728255649, 0x19a10000, 0x3fa39ab5},
+ /* 145 */ {4, 0.1392775294872041, 0x1a592841, 0x36e98912},
+ /* 146 */ {4, 0.1390854521985406, 0x1b152a10, 0x2e7b3140},
+ /* 147 */ {4, 0.1388952096850913, 0x1bd51311, 0x2655840b},
+ /* 148 */ {4, 0.1387067714131417, 0x1c98f100, 0x1e7596ea},
+ /* 149 */ {4, 0.1385201075671774, 0x1d60d1b1, 0x16d8a20d},
+ /* 150 */ {4, 0.1383351890281539, 0x1e2cc310, 0xf7bfe87},
+ /* 151 */ {4, 0.1381519873525671, 0x1efcd321, 0x85d2492},
+ /* 152 */ {4, 0.1379704747522905, 0x1fd11000, 0x179a9f4},
+ /* 153 */ {4, 0.1377906240751463, 0x20a987e1, 0xf59e80eb},
+ /* 154 */ {4, 0.1376124087861776, 0x21864910, 0xe8b768db},
+ /* 155 */ {4, 0.1374358029495937, 0x226761f1, 0xdc39d6d5},
+ /* 156 */ {4, 0.1372607812113589, 0x234ce100, 0xd021c5d1},
+ /* 157 */ {4, 0.1370873187823978, 0x2436d4d1, 0xc46b5e37},
+ /* 158 */ {4, 0.1369153914223921, 0x25254c10, 0xb912f39c},
+ /* 159 */ {4, 0.1367449754241439, 0x26185581, 0xae150294},
+ /* 160 */ {4, 0.1365760475984821, 0x27100000, 0xa36e2eb1},
+ /* 161 */ {4, 0.1364085852596902, 0x280c5a81, 0x991b4094},
+ /* 162 */ {4, 0.1362425662114337, 0x290d7410, 0x8f19241e},
+ /* 163 */ {4, 0.1360779687331669, 0x2a135bd1, 0x8564e6b7},
+ /* 164 */ {4, 0.1359147715670014, 0x2b1e2100, 0x7bfbb5b4},
+ /* 165 */ {4, 0.1357529539050150, 0x2c2dd2f1, 0x72dadcc8},
+ /* 166 */ {4, 0.1355924953769863, 0x2d428110, 0x69ffc498},
+ /* 167 */ {4, 0.1354333760385373, 0x2e5c3ae1, 0x6167f154},
+ /* 168 */ {4, 0.1352755763596663, 0x2f7b1000, 0x5911016e},
+ /* 169 */ {4, 0.1351190772136599, 0x309f1021, 0x50f8ac5f},
+ /* 170 */ {4, 0.1349638598663645, 0x31c84b10, 0x491cc17c},
+ /* 171 */ {4, 0.1348099059658079, 0x32f6d0b1, 0x417b26d8},
+ /* 172 */ {4, 0.1346571975321549, 0x342ab100, 0x3a11d83b},
+ /* 173 */ {4, 0.1345057169479844, 0x3563fc11, 0x32dee622},
+ /* 174 */ {4, 0.1343554469488779, 0x36a2c210, 0x2be074cd},
+ /* 175 */ {4, 0.1342063706143054, 0x37e71341, 0x2514bb58},
+ /* 176 */ {4, 0.1340584713587980, 0x39310000, 0x1e7a02e7},
+ /* 177 */ {4, 0.1339117329233981, 0x3a8098c1, 0x180ea5d0},
+ /* 178 */ {4, 0.1337661393673756, 0x3bd5ee10, 0x11d10edd},
+ /* 179 */ {4, 0.1336216750601996, 0x3d311091, 0xbbfb88e},
+ /* 180 */ {4, 0.1334783246737591, 0x3e921100, 0x5d92c68},
+ /* 181 */ {4, 0.1333360731748201, 0x3ff90031, 0x1c024c},
+ /* 182 */ {4, 0.1331949058177136, 0x4165ef10, 0xf50dbfb2},
+ /* 183 */ {4, 0.1330548081372441, 0x42d8eea1, 0xea30efa3},
+ /* 184 */ {4, 0.1329157659418126, 0x44521000, 0xdf9f1316},
+ /* 185 */ {4, 0.1327777653067443, 0x45d16461, 0xd555c0c9},
+ /* 186 */ {4, 0.1326407925678156, 0x4756fd10, 0xcb52a684},
+ /* 187 */ {4, 0.1325048343149731, 0x48e2eb71, 0xc193881f},
+ /* 188 */ {4, 0.1323698773862368, 0x4a754100, 0xb8163e97},
+ /* 189 */ {4, 0.1322359088617821, 0x4c0e0f51, 0xaed8b724},
+ /* 190 */ {4, 0.1321029160581950, 0x4dad6810, 0xa5d8f269},
+ /* 191 */ {4, 0.1319708865228925, 0x4f535d01, 0x9d15039d},
+ /* 192 */ {4, 0.1318398080287045, 0x51000000, 0x948b0fcd},
+ /* 193 */ {4, 0.1317096685686114, 0x52b36301, 0x8c394d1d},
+ /* 194 */ {4, 0.1315804563506306, 0x546d9810, 0x841e0215},
+ /* 195 */ {4, 0.1314521597928493, 0x562eb151, 0x7c3784f8},
+ /* 196 */ {4, 0.1313247675185968, 0x57f6c100, 0x74843b1e},
+ /* 197 */ {4, 0.1311982683517524, 0x59c5d971, 0x6d02985d},
+ /* 198 */ {4, 0.1310726513121843, 0x5b9c0d10, 0x65b11e6e},
+ /* 199 */ {4, 0.1309479056113158, 0x5d796e61, 0x5e8e5c64},
+ /* 200 */ {4, 0.1308240206478128, 0x5f5e1000, 0x5798ee23},
+ /* 201 */ {4, 0.1307009860033912, 0x614a04a1, 0x50cf7bde},
+ /* 202 */ {4, 0.1305787914387386, 0x633d5f10, 0x4a30b99b},
+ /* 203 */ {4, 0.1304574268895465, 0x65383231, 0x43bb66bd},
+ /* 204 */ {4, 0.1303368824626505, 0x673a9100, 0x3d6e4d94},
+ /* 205 */ {4, 0.1302171484322746, 0x69448e91, 0x374842ee},
+ /* 206 */ {4, 0.1300982152363760, 0x6b563e10, 0x314825b0},
+ /* 207 */ {4, 0.1299800734730872, 0x6d6fb2c1, 0x2b6cde75},
+ /* 208 */ {4, 0.1298627138972530, 0x6f910000, 0x25b55f2e},
+ /* 209 */ {4, 0.1297461274170591, 0x71ba3941, 0x2020a2c5},
+ /* 210 */ {4, 0.1296303050907487, 0x73eb7210, 0x1aadaccb},
+ /* 211 */ {4, 0.1295152381234257, 0x7624be11, 0x155b891f},
+ /* 212 */ {4, 0.1294009178639407, 0x78663100, 0x10294ba2},
+ /* 213 */ {4, 0.1292873358018581, 0x7aafdeb1, 0xb160fe9},
+ /* 214 */ {4, 0.1291744835645007, 0x7d01db10, 0x620f8f6},
+ /* 215 */ {4, 0.1290623529140715, 0x7f5c3a21, 0x14930ef},
+ /* 216 */ {4, 0.1289509357448472, 0x81bf1000, 0xf91bd1b6},
+ /* 217 */ {4, 0.1288402240804449, 0x842a70e1, 0xefdcb0c7},
+ /* 218 */ {4, 0.1287302100711567, 0x869e7110, 0xe6d37b2a},
+ /* 219 */ {4, 0.1286208859913518, 0x891b24f1, 0xddfeb94a},
+ /* 220 */ {4, 0.1285122442369443, 0x8ba0a100, 0xd55cff6e},
+ /* 221 */ {4, 0.1284042773229231, 0x8e2ef9d1, 0xcceced50},
+ /* 222 */ {4, 0.1282969778809442, 0x90c64410, 0xc4ad2db2},
+ /* 223 */ {4, 0.1281903386569819, 0x93669481, 0xbc9c75f9},
+ /* 224 */ {4, 0.1280843525090381, 0x96100000, 0xb4b985cf},
+ /* 225 */ {4, 0.1279790124049077, 0x98c29b81, 0xad0326c2},
+ /* 226 */ {4, 0.1278743114199984, 0x9b7e7c10, 0xa5782bef},
+ /* 227 */ {4, 0.1277702427352035, 0x9e43b6d1, 0x9e1771a9},
+ /* 228 */ {4, 0.1276667996348261, 0xa1126100, 0x96dfdd2a},
+ /* 229 */ {4, 0.1275639755045533, 0xa3ea8ff1, 0x8fd05c41},
+ /* 230 */ {4, 0.1274617638294791, 0xa6cc5910, 0x88e7e509},
+ /* 231 */ {4, 0.1273601581921741, 0xa9b7d1e1, 0x8225759d},
+ /* 232 */ {4, 0.1272591522708010, 0xacad1000, 0x7b8813d3},
+ /* 233 */ {4, 0.1271587398372755, 0xafac2921, 0x750eccf9},
+ /* 234 */ {4, 0.1270589147554692, 0xb2b53310, 0x6eb8b595},
+ /* 235 */ {4, 0.1269596709794558, 0xb5c843b1, 0x6884e923},
+ /* 236 */ {4, 0.1268610025517973, 0xb8e57100, 0x627289db},
+ /* 237 */ {4, 0.1267629036018709, 0xbc0cd111, 0x5c80c07b},
+ /* 238 */ {4, 0.1266653683442337, 0xbf3e7a10, 0x56aebc07},
+ /* 239 */ {4, 0.1265683910770258, 0xc27a8241, 0x50fbb19b},
+ /* 240 */ {4, 0.1264719661804097, 0xc5c10000, 0x4b66dc33},
+ /* 241 */ {4, 0.1263760881150453, 0xc91209c1, 0x45ef7c7c},
+ /* 242 */ {4, 0.1262807514205999, 0xcc6db610, 0x4094d8a3},
+ /* 243 */ {4, 0.1261859507142915, 0xcfd41b91, 0x3b563c24},
+ /* 244 */ {4, 0.1260916806894653, 0xd3455100, 0x3632f7a5},
+ /* 245 */ {4, 0.1259979361142023, 0xd6c16d31, 0x312a60c3},
+ /* 246 */ {4, 0.1259047118299582, 0xda488710, 0x2c3bd1f0},
+ /* 247 */ {4, 0.1258120027502338, 0xdddab5a1, 0x2766aa45},
+ /* 248 */ {4, 0.1257198038592741, 0xe1781000, 0x22aa4d5f},
+ /* 249 */ {4, 0.1256281102107963, 0xe520ad61, 0x1e06233c},
+ /* 250 */ {4, 0.1255369169267456, 0xe8d4a510, 0x19799812},
+ /* 251 */ {4, 0.1254462191960791, 0xec940e71, 0x15041c33},
+ /* 252 */ {4, 0.1253560122735751, 0xf05f0100, 0x10a523e5},
+ /* 253 */ {4, 0.1252662914786691, 0xf4359451, 0xc5c2749},
+ /* 254 */ {4, 0.1251770521943144, 0xf817e010, 0x828a237},
+ /* 255 */ {4, 0.1250882898658681, 0xfc05fc01, 0x40a1423},
+};
+#endif
+#if BITS_PER_MP_LIMB == 64
+const struct bases __mp_bases[256] =
+{
+ /* 0 */ {0, 0.0, 0, 0},
+ /* 1 */ {0, 1e38, 0, 0},
+ /* 2 */ {64, 1.0000000000000000, CNST_LIMB(0x1), CNST_LIMB(0x0)},
+ /* 3 */ {40, 0.6309297535714574, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 4 */ {32, 0.5000000000000000, CNST_LIMB(0x2), CNST_LIMB(0x0)},
+ /* 5 */ {27, 0.4306765580733931, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)},
+ /* 6 */ {24, 0.3868528072345416, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
+ /* 7 */ {22, 0.3562071871080222, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)},
+ /* 8 */ {21, 0.3333333333333334, CNST_LIMB(0x3), CNST_LIMB(0x0)},
+ /* 9 */ {20, 0.3154648767857287, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 10 */ {19, 0.3010299956639811, CNST_LIMB(0x8ac7230489e80000), CNST_LIMB(0xd83c94fb6d2ac34a)},
+ /* 11 */ {18, 0.2890648263178878, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)},
+ /* 12 */ {17, 0.2789429456511298, CNST_LIMB(0x1eca170c00000000), CNST_LIMB(0xa10c2bec5da8f8f)},
+ /* 13 */ {17, 0.2702381544273197, CNST_LIMB(0x780c7372621bd74d), CNST_LIMB(0x10f4becafe412ec3)},
+ /* 14 */ {16, 0.2626495350371936, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)},
+ /* 15 */ {16, 0.2559580248098155, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)},
+ /* 16 */ {16, 0.2500000000000000, CNST_LIMB(0x4), CNST_LIMB(0x0)},
+ /* 17 */ {15, 0.2446505421182260, CNST_LIMB(0x27b95e997e21d9f1), CNST_LIMB(0x9c71e11bab279323)},
+ /* 18 */ {15, 0.2398124665681315, CNST_LIMB(0x5da0e1e53c5c8000), CNST_LIMB(0x5dfaa697ec6f6a1c)},
+ /* 19 */ {15, 0.2354089133666382, CNST_LIMB(0xd2ae3299c1c4aedb), CNST_LIMB(0x3711783f6be7e9ec)},
+ /* 20 */ {14, 0.2313782131597592, CNST_LIMB(0x16bcc41e90000000), CNST_LIMB(0x6849b86a12b9b01e)},
+ /* 21 */ {14, 0.2276702486969530, CNST_LIMB(0x2d04b7fdd9c0ef49), CNST_LIMB(0x6bf097ba5ca5e239)},
+ /* 22 */ {14, 0.2242438242175754, CNST_LIMB(0x5658597bcaa24000), CNST_LIMB(0x7b8015c8d7af8f08)},
+ /* 23 */ {14, 0.2210647294575037, CNST_LIMB(0xa0e2073737609371), CNST_LIMB(0x975a24b3a3151b38)},
+ /* 24 */ {13, 0.2181042919855316, CNST_LIMB(0xc29e98000000000), CNST_LIMB(0x50bd367972689db1)},
+ /* 25 */ {13, 0.2153382790366965, CNST_LIMB(0x14adf4b7320334b9), CNST_LIMB(0x8c240c4aecb13bb5)},
+ /* 26 */ {13, 0.2127460535533632, CNST_LIMB(0x226ed36478bfa000), CNST_LIMB(0xdbd2e56854e118c9)},
+ /* 27 */ {13, 0.2103099178571525, CNST_LIMB(0x383d9170b85ff80b), CNST_LIMB(0x2351ffcaa9c7c4ae)},
+ /* 28 */ {13, 0.2080145976765095, CNST_LIMB(0x5a3c23e39c000000), CNST_LIMB(0x6b24188ca33b0636)},
+ /* 29 */ {13, 0.2058468324604344, CNST_LIMB(0x8e65137388122bcd), CNST_LIMB(0xcc3dceaf2b8ba99d)},
+ /* 30 */ {13, 0.2037950470905062, CNST_LIMB(0xdd41bb36d259e000), CNST_LIMB(0x2832e835c6c7d6b6)},
+ /* 31 */ {12, 0.2018490865820999, CNST_LIMB(0xaee5720ee830681), CNST_LIMB(0x76b6aa272e1873c5)},
+ /* 32 */ {12, 0.2000000000000000, CNST_LIMB(0x5), CNST_LIMB(0x0)},
+ /* 33 */ {12, 0.1982398631705605, CNST_LIMB(0x172588ad4f5f0981), CNST_LIMB(0x61eaf5d402c7bf4f)},
+ /* 34 */ {12, 0.1965616322328226, CNST_LIMB(0x211e44f7d02c1000), CNST_LIMB(0xeeb658123ffb27ec)},
+ /* 35 */ {12, 0.1949590218937863, CNST_LIMB(0x2ee56725f06e5c71), CNST_LIMB(0x5d5e3762e6fdf509)},
+ /* 36 */ {12, 0.1934264036172708, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
+ /* 37 */ {12, 0.1919587200065601, CNST_LIMB(0x5b5b57f8a98a5dd1), CNST_LIMB(0x66ae7831762efb6f)},
+ /* 38 */ {12, 0.1905514124267734, CNST_LIMB(0x7dcff8986ea31000), CNST_LIMB(0x47388865a00f544)},
+ /* 39 */ {12, 0.1892003595168700, CNST_LIMB(0xabd4211662a6b2a1), CNST_LIMB(0x7d673c33a123b54c)},
+ /* 40 */ {12, 0.1879018247091076, CNST_LIMB(0xe8d4a51000000000), CNST_LIMB(0x19799812dea11197)},
+ /* 41 */ {11, 0.1866524112389434, CNST_LIMB(0x7a32956ad081b79), CNST_LIMB(0xc27e62e0686feae)},
+ /* 42 */ {11, 0.1854490234153689, CNST_LIMB(0x9f49aaff0e86800), CNST_LIMB(0x9b6e7507064ce7c7)},
+ /* 43 */ {11, 0.1842888331487062, CNST_LIMB(0xce583bb812d37b3), CNST_LIMB(0x3d9ac2bf66cfed94)},
+ /* 44 */ {11, 0.1831692509136336, CNST_LIMB(0x109b79a654c00000), CNST_LIMB(0xed46bc50ce59712a)},
+ /* 45 */ {11, 0.1820879004699383, CNST_LIMB(0x1543beff214c8b95), CNST_LIMB(0x813d97e2c89b8d46)},
+ /* 46 */ {11, 0.1810425967800402, CNST_LIMB(0x1b149a79459a3800), CNST_LIMB(0x2e81751956af8083)},
+ /* 47 */ {11, 0.1800313266566926, CNST_LIMB(0x224edfb5434a830f), CNST_LIMB(0xdd8e0a95e30c0988)},
+ /* 48 */ {11, 0.1790522317510413, CNST_LIMB(0x2b3fb00000000000), CNST_LIMB(0x7ad4dd48a0b5b167)},
+ /* 49 */ {11, 0.1781035935540111, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)},
+ /* 50 */ {11, 0.1771838201355579, CNST_LIMB(0x43c33c1937564800), CNST_LIMB(0xe392010175ee5962)},
+ /* 51 */ {11, 0.1762914343888821, CNST_LIMB(0x54411b2441c3cd8b), CNST_LIMB(0x84eaf11b2fe7738e)},
+ /* 52 */ {11, 0.1754250635819545, CNST_LIMB(0x6851455acd400000), CNST_LIMB(0x3a1e3971e008995d)},
+ /* 53 */ {11, 0.1745834300480449, CNST_LIMB(0x80a23b117c8feb6d), CNST_LIMB(0xfd7a462344ffce25)},
+ /* 54 */ {11, 0.1737653428714400, CNST_LIMB(0x9dff7d32d5dc1800), CNST_LIMB(0x9eca40b40ebcef8a)},
+ /* 55 */ {11, 0.1729696904450771, CNST_LIMB(0xc155af6faeffe6a7), CNST_LIMB(0x52fa161a4a48e43d)},
+ /* 56 */ {11, 0.1721954337940981, CNST_LIMB(0xebb7392e00000000), CNST_LIMB(0x1607a2cbacf930c1)},
+ /* 57 */ {10, 0.1714416005739134, CNST_LIMB(0x50633659656d971), CNST_LIMB(0x97a014f8e3be55f1)},
+ /* 58 */ {10, 0.1707072796637201, CNST_LIMB(0x5fa8624c7fba400), CNST_LIMB(0x568df8b76cbf212c)},
+ /* 59 */ {10, 0.1699916162869140, CNST_LIMB(0x717d9faa73c5679), CNST_LIMB(0x20ba7c4b4e6ef492)},
+ /* 60 */ {10, 0.1692938075987814, CNST_LIMB(0x86430aac6100000), CNST_LIMB(0xe81ee46b9ef492f5)},
+ /* 61 */ {10, 0.1686130986895011, CNST_LIMB(0x9e64d9944b57f29), CNST_LIMB(0x9dc0d10d51940416)},
+ /* 62 */ {10, 0.1679487789570419, CNST_LIMB(0xba5ca5392cb0400), CNST_LIMB(0x5fa8ed2f450272a5)},
+ /* 63 */ {10, 0.1673001788101741, CNST_LIMB(0xdab2ce1d022cd81), CNST_LIMB(0x2ba9eb8c5e04e641)},
+ /* 64 */ {10, 0.1666666666666667, CNST_LIMB(0x6), CNST_LIMB(0x0)},
+ /* 65 */ {10, 0.1660476462159378, CNST_LIMB(0x12aeed5fd3e2d281), CNST_LIMB(0xb67759cc00287bf1)},
+ /* 66 */ {10, 0.1654425539190583, CNST_LIMB(0x15c3da1572d50400), CNST_LIMB(0x78621feeb7f4ed33)},
+ /* 67 */ {10, 0.1648508567221604, CNST_LIMB(0x194c05534f75ee29), CNST_LIMB(0x43d55b5f72943bc0)},
+ /* 68 */ {10, 0.1642720499620502, CNST_LIMB(0x1d56299ada100000), CNST_LIMB(0x173decb64d1d4409)},
+ /* 69 */ {10, 0.1637056554452156, CNST_LIMB(0x21f2a089a4ff4f79), CNST_LIMB(0xe29fb54fd6b6074f)},
+ /* 70 */ {10, 0.1631512196835108, CNST_LIMB(0x2733896c68d9a400), CNST_LIMB(0xa1f1f5c210d54e62)},
+ /* 71 */ {10, 0.1626083122716341, CNST_LIMB(0x2d2cf2c33b533c71), CNST_LIMB(0x6aac7f9bfafd57b2)},
+ /* 72 */ {10, 0.1620765243931223, CNST_LIMB(0x33f506e440000000), CNST_LIMB(0x3b563c2478b72ee2)},
+ /* 73 */ {10, 0.1615554674429964, CNST_LIMB(0x3ba43bec1d062211), CNST_LIMB(0x12b536b574e92d1b)},
+ /* 74 */ {10, 0.1610447717564444, CNST_LIMB(0x4455872d8fd4e400), CNST_LIMB(0xdf86c03020404fa5)},
+ /* 75 */ {10, 0.1605440854340214, CNST_LIMB(0x4e2694539f2f6c59), CNST_LIMB(0xa34adf02234eea8e)},
+ /* 76 */ {10, 0.1600530732548213, CNST_LIMB(0x5938006c18900000), CNST_LIMB(0x6f46eb8574eb59dd)},
+ /* 77 */ {10, 0.1595714156699382, CNST_LIMB(0x65ad9912474aa649), CNST_LIMB(0x42459b481df47cec)},
+ /* 78 */ {10, 0.1590988078692941, CNST_LIMB(0x73ae9ff4241ec400), CNST_LIMB(0x1b424b95d80ca505)},
+ /* 79 */ {10, 0.1586349589155960, CNST_LIMB(0x836612ee9c4ce1e1), CNST_LIMB(0xf2c1b982203a0dac)},
+ /* 80 */ {10, 0.1581795909397823, CNST_LIMB(0x9502f90000000000), CNST_LIMB(0xb7cdfd9d7bdbab7d)},
+ /* 81 */ {10, 0.1577324383928644, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 82 */ {10, 0.1572932473495469, CNST_LIMB(0xbebf59a07dab4400), CNST_LIMB(0x57931eeaf85cf64f)},
+ /* 83 */ {10, 0.1568617748594410, CNST_LIMB(0xd7540d4093bc3109), CNST_LIMB(0x305a944507c82f47)},
+ /* 84 */ {10, 0.1564377883420716, CNST_LIMB(0xf2b96616f1900000), CNST_LIMB(0xe007ccc9c22781a)},
+ /* 85 */ {9, 0.1560210650222250, CNST_LIMB(0x336de62af2bca35), CNST_LIMB(0x3e92c42e000eeed4)},
+ /* 86 */ {9, 0.1556113914024940, CNST_LIMB(0x39235ec33d49600), CNST_LIMB(0x1ebe59130db2795e)},
+ /* 87 */ {9, 0.1552085627701551, CNST_LIMB(0x3f674e539585a17), CNST_LIMB(0x268859e90f51b89)},
+ /* 88 */ {9, 0.1548123827357682, CNST_LIMB(0x4645b6958000000), CNST_LIMB(0xd24cde0463108cfa)},
+ /* 89 */ {9, 0.1544226628011101, CNST_LIMB(0x4dcb74afbc49c19), CNST_LIMB(0xa536009f37adc383)},
+ /* 90 */ {9, 0.1540392219542636, CNST_LIMB(0x56064e1d18d9a00), CNST_LIMB(0x7cea06ce1c9ace10)},
+ /* 91 */ {9, 0.1536618862898642, CNST_LIMB(0x5f04fe2cd8a39fb), CNST_LIMB(0x58db032e72e8ba43)},
+ /* 92 */ {9, 0.1532904886526781, CNST_LIMB(0x68d74421f5c0000), CNST_LIMB(0x388cc17cae105447)},
+ /* 93 */ {9, 0.1529248683028321, CNST_LIMB(0x738df1f6ab4827d), CNST_LIMB(0x1b92672857620ce0)},
+ /* 94 */ {9, 0.1525648706011593, CNST_LIMB(0x7f3afbc9cfb5e00), CNST_LIMB(0x18c6a9575c2ade4)},
+ /* 95 */ {9, 0.1522103467132434, CNST_LIMB(0x8bf187fba88f35f), CNST_LIMB(0xd44da7da8e44b24f)},
+ /* 96 */ {9, 0.1518611533308632, CNST_LIMB(0x99c600000000000), CNST_LIMB(0xaa2f78f1b4cc6794)},
+ /* 97 */ {9, 0.1515171524096389, CNST_LIMB(0xa8ce21eb6531361), CNST_LIMB(0x843c067d091ee4cc)},
+ /* 98 */ {9, 0.1511782109217764, CNST_LIMB(0xb92112c1a0b6200), CNST_LIMB(0x62005e1e913356e3)},
+ /* 99 */ {9, 0.1508442006228941, CNST_LIMB(0xcad7718b8747c43), CNST_LIMB(0x4316eed01dedd518)},
+ /* 100 */ {9, 0.1505149978319906, CNST_LIMB(0xde0b6b3a7640000), CNST_LIMB(0x2725dd1d243aba0e)},
+ /* 101 */ {9, 0.1501904832236879, CNST_LIMB(0xf2d8cf5fe6d74c5), CNST_LIMB(0xddd9057c24cb54f)},
+ /* 102 */ {9, 0.1498705416319474, CNST_LIMB(0x1095d25bfa712600), CNST_LIMB(0xedeee175a736d2a1)},
+ /* 103 */ {9, 0.1495550618645152, CNST_LIMB(0x121b7c4c3698faa7), CNST_LIMB(0xc4699f3df8b6b328)},
+ /* 104 */ {9, 0.1492439365274121, CNST_LIMB(0x13c09e8d68000000), CNST_LIMB(0x9ebbe7d859cb5a7c)},
+ /* 105 */ {9, 0.1489370618588283, CNST_LIMB(0x15876ccb0b709ca9), CNST_LIMB(0x7c828b9887eb2179)},
+ /* 106 */ {9, 0.1486343375718350, CNST_LIMB(0x17723c2976da2a00), CNST_LIMB(0x5d652ab99001adcf)},
+ /* 107 */ {9, 0.1483356667053617, CNST_LIMB(0x198384e9c259048b), CNST_LIMB(0x4114f1754e5d7b32)},
+ /* 108 */ {9, 0.1480409554829326, CNST_LIMB(0x1bbde41dfeec0000), CNST_LIMB(0x274b7c902f7e0188)},
+ /* 109 */ {9, 0.1477501131786861, CNST_LIMB(0x1e241d6e3337910d), CNST_LIMB(0xfc9e0fbb32e210c)},
+ /* 110 */ {9, 0.1474630519902391, CNST_LIMB(0x20b91cee9901ee00), CNST_LIMB(0xf4afa3e594f8ea1f)},
+ /* 111 */ {9, 0.1471796869179852, CNST_LIMB(0x237ff9079863dfef), CNST_LIMB(0xcd85c32e9e4437b0)},
+ /* 112 */ {9, 0.1468999356504447, CNST_LIMB(0x267bf47000000000), CNST_LIMB(0xa9bbb147e0dd92a8)},
+ /* 113 */ {9, 0.1466237184553111, CNST_LIMB(0x29b08039fbeda7f1), CNST_LIMB(0x8900447b70e8eb82)},
+ /* 114 */ {9, 0.1463509580758620, CNST_LIMB(0x2d213df34f65f200), CNST_LIMB(0x6b0a92adaad5848a)},
+ /* 115 */ {9, 0.1460815796324244, CNST_LIMB(0x30d201d957a7c2d3), CNST_LIMB(0x4f990ad8740f0ee5)},
+ /* 116 */ {9, 0.1458155105286054, CNST_LIMB(0x34c6d52160f40000), CNST_LIMB(0x3670a9663a8d3610)},
+ /* 117 */ {9, 0.1455526803620167, CNST_LIMB(0x3903f855d8f4c755), CNST_LIMB(0x1f5c44188057be3c)},
+ /* 118 */ {9, 0.1452930208392428, CNST_LIMB(0x3d8de5c8ec59b600), CNST_LIMB(0xa2bea956c4e4977)},
+ /* 119 */ {9, 0.1450364656948130, CNST_LIMB(0x4269541d1ff01337), CNST_LIMB(0xed68b23033c3637e)},
+ /* 120 */ {9, 0.1447829506139581, CNST_LIMB(0x479b38e478000000), CNST_LIMB(0xc99cf624e50549c5)},
+ /* 121 */ {9, 0.1445324131589439, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)},
+ /* 122 */ {9, 0.1442847926987864, CNST_LIMB(0x5317871fa13aba00), CNST_LIMB(0x8a5bc740b1c113e5)},
+ /* 123 */ {9, 0.1440400303421672, CNST_LIMB(0x596d2f44de9fa71b), CNST_LIMB(0x6e6c7efb81cfbb9b)},
+ /* 124 */ {9, 0.1437980688733775, CNST_LIMB(0x602fd125c47c0000), CNST_LIMB(0x54aba5c5cada5f10)},
+ /* 125 */ {9, 0.1435588526911310, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)},
+ /* 126 */ {9, 0.1433223277500932, CNST_LIMB(0x6f15be069b847e00), CNST_LIMB(0x26fb43de2c8cd2a8)},
+ /* 127 */ {9, 0.1430884415049874, CNST_LIMB(0x7746b3e82a77047f), CNST_LIMB(0x12b94793db8486a1)},
+ /* 128 */ {9, 0.1428571428571428, CNST_LIMB(0x7), CNST_LIMB(0x0)},
+ /* 129 */ {9, 0.1426283821033600, CNST_LIMB(0x894953f7ea890481), CNST_LIMB(0xdd5deca404c0156d)},
+ /* 130 */ {9, 0.1424021108869747, CNST_LIMB(0x932abffea4848200), CNST_LIMB(0xbd51373330291de0)},
+ /* 131 */ {9, 0.1421782821510107, CNST_LIMB(0x9dacb687d3d6a163), CNST_LIMB(0x9fa4025d66f23085)},
+ /* 132 */ {9, 0.1419568500933153, CNST_LIMB(0xa8d8102a44840000), CNST_LIMB(0x842530ee2db4949d)},
+ /* 133 */ {9, 0.1417377701235801, CNST_LIMB(0xb4b60f9d140541e5), CNST_LIMB(0x6aa7f2766b03dc25)},
+ /* 134 */ {9, 0.1415209988221527, CNST_LIMB(0xc15065d4856e4600), CNST_LIMB(0x53035ba7ebf32e8d)},
+ /* 135 */ {9, 0.1413064939005528, CNST_LIMB(0xceb1363f396d23c7), CNST_LIMB(0x3d12091fc9fb4914)},
+ /* 136 */ {9, 0.1410942141636095, CNST_LIMB(0xdce31b2488000000), CNST_LIMB(0x28b1cb81b1ef1849)},
+ /* 137 */ {9, 0.1408841194731412, CNST_LIMB(0xebf12a24bca135c9), CNST_LIMB(0x15c35be67ae3e2c9)},
+ /* 138 */ {9, 0.1406761707131039, CNST_LIMB(0xfbe6f8dbf88f4a00), CNST_LIMB(0x42a17bd09be1ff0)},
+ /* 139 */ {8, 0.1404703297561400, CNST_LIMB(0x1ef156c084ce761), CNST_LIMB(0x8bf461f03cf0bbf)},
+ /* 140 */ {8, 0.1402665594314587, CNST_LIMB(0x20c4e3b94a10000), CNST_LIMB(0xf3fbb43f68a32d05)},
+ /* 141 */ {8, 0.1400648234939879, CNST_LIMB(0x22b0695a08ba421), CNST_LIMB(0xd84f44c48564dc19)},
+ /* 142 */ {8, 0.1398650865947379, CNST_LIMB(0x24b4f35d7a4c100), CNST_LIMB(0xbe58ebcce7956abe)},
+ /* 143 */ {8, 0.1396673142523192, CNST_LIMB(0x26d397284975781), CNST_LIMB(0xa5fac463c7c134b7)},
+ /* 144 */ {8, 0.1394714728255649, CNST_LIMB(0x290d74100000000), CNST_LIMB(0x8f19241e28c7d757)},
+ /* 145 */ {8, 0.1392775294872041, CNST_LIMB(0x2b63b3a37866081), CNST_LIMB(0x799a6d046c0ae1ae)},
+ /* 146 */ {8, 0.1390854521985406, CNST_LIMB(0x2dd789f4d894100), CNST_LIMB(0x6566e37d746a9e40)},
+ /* 147 */ {8, 0.1388952096850913, CNST_LIMB(0x306a35e51b58721), CNST_LIMB(0x526887dbfb5f788f)},
+ /* 148 */ {8, 0.1387067714131417, CNST_LIMB(0x331d01712e10000), CNST_LIMB(0x408af3382b8efd3d)},
+ /* 149 */ {8, 0.1385201075671774, CNST_LIMB(0x35f14200a827c61), CNST_LIMB(0x2fbb374806ec05f1)},
+ /* 150 */ {8, 0.1383351890281539, CNST_LIMB(0x38e858b62216100), CNST_LIMB(0x1fe7c0f0afce87fe)},
+ /* 151 */ {8, 0.1381519873525671, CNST_LIMB(0x3c03b2c13176a41), CNST_LIMB(0x11003d517540d32e)},
+ /* 152 */ {8, 0.1379704747522905, CNST_LIMB(0x3f44c9b21000000), CNST_LIMB(0x2f5810f98eff0dc)},
+ /* 153 */ {8, 0.1377906240751463, CNST_LIMB(0x42ad23cef3113c1), CNST_LIMB(0xeb72e35e7840d910)},
+ /* 154 */ {8, 0.1376124087861776, CNST_LIMB(0x463e546b19a2100), CNST_LIMB(0xd27de19593dc3614)},
+ /* 155 */ {8, 0.1374358029495937, CNST_LIMB(0x49f9fc3f96684e1), CNST_LIMB(0xbaf391fd3e5e6fc2)},
+ /* 156 */ {8, 0.1372607812113589, CNST_LIMB(0x4de1c9c5dc10000), CNST_LIMB(0xa4bd38c55228c81d)},
+ /* 157 */ {8, 0.1370873187823978, CNST_LIMB(0x51f77994116d2a1), CNST_LIMB(0x8fc5a8de8e1de782)},
+ /* 158 */ {8, 0.1369153914223921, CNST_LIMB(0x563cd6bb3398100), CNST_LIMB(0x7bf9265bea9d3a3b)},
+ /* 159 */ {8, 0.1367449754241439, CNST_LIMB(0x5ab3bb270beeb01), CNST_LIMB(0x69454b325983dccd)},
+ /* 160 */ {8, 0.1365760475984821, CNST_LIMB(0x5f5e10000000000), CNST_LIMB(0x5798ee2308c39df9)},
+ /* 161 */ {8, 0.1364085852596902, CNST_LIMB(0x643dce0ec16f501), CNST_LIMB(0x46e40ba0fa66a753)},
+ /* 162 */ {8, 0.1362425662114337, CNST_LIMB(0x6954fe21e3e8100), CNST_LIMB(0x3717b0870b0db3a7)},
+ /* 163 */ {8, 0.1360779687331669, CNST_LIMB(0x6ea5b9755f440a1), CNST_LIMB(0x2825e6775d11cdeb)},
+ /* 164 */ {8, 0.1359147715670014, CNST_LIMB(0x74322a1c0410000), CNST_LIMB(0x1a01a1c09d1b4dac)},
+ /* 165 */ {8, 0.1357529539050150, CNST_LIMB(0x79fc8b6ae8a46e1), CNST_LIMB(0xc9eb0a8bebc8f3e)},
+ /* 166 */ {8, 0.1355924953769863, CNST_LIMB(0x80072a66d512100), CNST_LIMB(0xffe357ff59e6a004)},
+ /* 167 */ {8, 0.1354333760385373, CNST_LIMB(0x86546633b42b9c1), CNST_LIMB(0xe7dfd1be05fa61a8)},
+ /* 168 */ {8, 0.1352755763596663, CNST_LIMB(0x8ce6b0861000000), CNST_LIMB(0xd11ed6fc78f760e5)},
+ /* 169 */ {8, 0.1351190772136599, CNST_LIMB(0x93c08e16a022441), CNST_LIMB(0xbb8db609dd29ebfe)},
+ /* 170 */ {8, 0.1349638598663645, CNST_LIMB(0x9ae49717f026100), CNST_LIMB(0xa71aec8d1813d532)},
+ /* 171 */ {8, 0.1348099059658079, CNST_LIMB(0xa25577ae24c1a61), CNST_LIMB(0x93b612a9f20fbc02)},
+ /* 172 */ {8, 0.1346571975321549, CNST_LIMB(0xaa15f068e610000), CNST_LIMB(0x814fc7b19a67d317)},
+ /* 173 */ {8, 0.1345057169479844, CNST_LIMB(0xb228d6bf7577921), CNST_LIMB(0x6fd9a03f2e0a4b7c)},
+ /* 174 */ {8, 0.1343554469488779, CNST_LIMB(0xba91158ef5c4100), CNST_LIMB(0x5f4615a38d0d316e)},
+ /* 175 */ {8, 0.1342063706143054, CNST_LIMB(0xc351ad9aec0b681), CNST_LIMB(0x4f8876863479a286)},
+ /* 176 */ {8, 0.1340584713587980, CNST_LIMB(0xcc6db6100000000), CNST_LIMB(0x4094d8a3041b60eb)},
+ /* 177 */ {8, 0.1339117329233981, CNST_LIMB(0xd5e85d09025c181), CNST_LIMB(0x32600b8ed883a09b)},
+ /* 178 */ {8, 0.1337661393673756, CNST_LIMB(0xdfc4e816401c100), CNST_LIMB(0x24df8c6eb4b6d1f1)},
+ /* 179 */ {8, 0.1336216750601996, CNST_LIMB(0xea06b4c72947221), CNST_LIMB(0x18097a8ee151acef)},
+ /* 180 */ {8, 0.1334783246737591, CNST_LIMB(0xf4b139365210000), CNST_LIMB(0xbd48cc8ec1cd8e3)},
+ /* 181 */ {8, 0.1333360731748201, CNST_LIMB(0xffc80497d520961), CNST_LIMB(0x3807a8d67485fb)},
+ /* 182 */ {8, 0.1331949058177136, CNST_LIMB(0x10b4ebfca1dee100), CNST_LIMB(0xea5768860b62e8d8)},
+ /* 183 */ {8, 0.1330548081372441, CNST_LIMB(0x117492de921fc141), CNST_LIMB(0xd54faf5b635c5005)},
+ /* 184 */ {8, 0.1329157659418126, CNST_LIMB(0x123bb2ce41000000), CNST_LIMB(0xc14a56233a377926)},
+ /* 185 */ {8, 0.1327777653067443, CNST_LIMB(0x130a8b6157bdecc1), CNST_LIMB(0xae39a88db7cd329f)},
+ /* 186 */ {8, 0.1326407925678156, CNST_LIMB(0x13e15dede0e8a100), CNST_LIMB(0x9c10bde69efa7ab6)},
+ /* 187 */ {8, 0.1325048343149731, CNST_LIMB(0x14c06d941c0ca7e1), CNST_LIMB(0x8ac36c42a2836497)},
+ /* 188 */ {8, 0.1323698773862368, CNST_LIMB(0x15a7ff487a810000), CNST_LIMB(0x7a463c8b84f5ef67)},
+ /* 189 */ {8, 0.1322359088617821, CNST_LIMB(0x169859ddc5c697a1), CNST_LIMB(0x6a8e5f5ad090fd4b)},
+ /* 190 */ {8, 0.1321029160581950, CNST_LIMB(0x1791c60f6fed0100), CNST_LIMB(0x5b91a2943596fc56)},
+ /* 191 */ {8, 0.1319708865228925, CNST_LIMB(0x18948e8c0e6fba01), CNST_LIMB(0x4d4667b1c468e8f0)},
+ /* 192 */ {8, 0.1318398080287045, CNST_LIMB(0x19a1000000000000), CNST_LIMB(0x3fa39ab547994daf)},
+ /* 193 */ {8, 0.1317096685686114, CNST_LIMB(0x1ab769203dafc601), CNST_LIMB(0x32a0a9b2faee1e2a)},
+ /* 194 */ {8, 0.1315804563506306, CNST_LIMB(0x1bd81ab557f30100), CNST_LIMB(0x26357ceac0e96962)},
+ /* 195 */ {8, 0.1314521597928493, CNST_LIMB(0x1d0367a69fed1ba1), CNST_LIMB(0x1a5a6f65caa5859e)},
+ /* 196 */ {8, 0.1313247675185968, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)},
+ /* 197 */ {8, 0.1311982683517524, CNST_LIMB(0x1f7b2a18f29ac3e1), CNST_LIMB(0x4383340615612ca)},
+ /* 198 */ {8, 0.1310726513121843, CNST_LIMB(0x20c850694c2aa100), CNST_LIMB(0xf3c77969ee4be5a2)},
+ /* 199 */ {8, 0.1309479056113158, CNST_LIMB(0x222173cc014980c1), CNST_LIMB(0xe00993cc187c5ec9)},
+ /* 200 */ {8, 0.1308240206478128, CNST_LIMB(0x2386f26fc1000000), CNST_LIMB(0xcd2b297d889bc2b6)},
+ /* 201 */ {8, 0.1307009860033912, CNST_LIMB(0x24f92ce8af296d41), CNST_LIMB(0xbb214d5064862b22)},
+ /* 202 */ {8, 0.1305787914387386, CNST_LIMB(0x2678863cd0ece100), CNST_LIMB(0xa9e1a7ca7ea10e20)},
+ /* 203 */ {8, 0.1304574268895465, CNST_LIMB(0x280563f0a9472d61), CNST_LIMB(0x99626e72b39ea0cf)},
+ /* 204 */ {8, 0.1303368824626505, CNST_LIMB(0x29a02e1406210000), CNST_LIMB(0x899a5ba9c13fafd9)},
+ /* 205 */ {8, 0.1302171484322746, CNST_LIMB(0x2b494f4efe6d2e21), CNST_LIMB(0x7a80a705391e96ff)},
+ /* 206 */ {8, 0.1300982152363760, CNST_LIMB(0x2d0134ef21cbc100), CNST_LIMB(0x6c0cfe23de23042a)},
+ /* 207 */ {8, 0.1299800734730872, CNST_LIMB(0x2ec84ef4da2ef581), CNST_LIMB(0x5e377df359c944dd)},
+ /* 208 */ {8, 0.1298627138972530, CNST_LIMB(0x309f102100000000), CNST_LIMB(0x50f8ac5fc8f53985)},
+ /* 209 */ {8, 0.1297461274170591, CNST_LIMB(0x3285ee02a1420281), CNST_LIMB(0x44497266278e35b7)},
+ /* 210 */ {8, 0.1296303050907487, CNST_LIMB(0x347d6104fc324100), CNST_LIMB(0x382316831f7ee175)},
+ /* 211 */ {8, 0.1295152381234257, CNST_LIMB(0x3685e47dade53d21), CNST_LIMB(0x2c7f377833b8946e)},
+ /* 212 */ {8, 0.1294009178639407, CNST_LIMB(0x389ff6bb15610000), CNST_LIMB(0x2157c761ab4163ef)},
+ /* 213 */ {8, 0.1292873358018581, CNST_LIMB(0x3acc1912ebb57661), CNST_LIMB(0x16a7071803cc49a9)},
+ /* 214 */ {8, 0.1291744835645007, CNST_LIMB(0x3d0acff111946100), CNST_LIMB(0xc6781d80f8224fc)},
+ /* 215 */ {8, 0.1290623529140715, CNST_LIMB(0x3f5ca2e692eaf841), CNST_LIMB(0x294092d370a900b)},
+ /* 216 */ {8, 0.1289509357448472, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
+ /* 217 */ {8, 0.1288402240804449, CNST_LIMB(0x443bcb714399a5c1), CNST_LIMB(0xe03b98f103fad6d2)},
+ /* 218 */ {8, 0.1287302100711567, CNST_LIMB(0x46ca406c81af2100), CNST_LIMB(0xcee3d32cad2a9049)},
+ /* 219 */ {8, 0.1286208859913518, CNST_LIMB(0x496e106ac22aaae1), CNST_LIMB(0xbe3f9df9277fdada)},
+ /* 220 */ {8, 0.1285122442369443, CNST_LIMB(0x4c27d39fa5410000), CNST_LIMB(0xae46f0d94c05e933)},
+ /* 221 */ {8, 0.1284042773229231, CNST_LIMB(0x4ef825c296e43ca1), CNST_LIMB(0x9ef2280fb437a33d)},
+ /* 222 */ {8, 0.1282969778809442, CNST_LIMB(0x51dfa61f5ad88100), CNST_LIMB(0x9039ff426d3f284b)},
+ /* 223 */ {8, 0.1281903386569819, CNST_LIMB(0x54def7a6d2f16901), CNST_LIMB(0x82178c6d6b51f8f4)},
+ /* 224 */ {8, 0.1280843525090381, CNST_LIMB(0x57f6c10000000000), CNST_LIMB(0x74843b1ee4c1e053)},
+ /* 225 */ {8, 0.1279790124049077, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)},
+ /* 226 */ {8, 0.1278743114199984, CNST_LIMB(0x5e7268b9bbdf8100), CNST_LIMB(0x5af23c74f9ad9fe9)},
+ /* 227 */ {8, 0.1277702427352035, CNST_LIMB(0x61d7a7932ff3d6a1), CNST_LIMB(0x4ee7eae2acdc617e)},
+ /* 228 */ {8, 0.1276667996348261, CNST_LIMB(0x65581f53c8c10000), CNST_LIMB(0x43556aa2ac262a0b)},
+ /* 229 */ {8, 0.1275639755045533, CNST_LIMB(0x68f48a385b8320e1), CNST_LIMB(0x3835949593b8ddd1)},
+ /* 230 */ {8, 0.1274617638294791, CNST_LIMB(0x6cada69ed07c2100), CNST_LIMB(0x2d837fbe78458762)},
+ /* 231 */ {8, 0.1273601581921741, CNST_LIMB(0x70843718cdbf27c1), CNST_LIMB(0x233a7e150a54a555)},
+ /* 232 */ {8, 0.1272591522708010, CNST_LIMB(0x7479027ea1000000), CNST_LIMB(0x19561984a50ff8fe)},
+ /* 233 */ {8, 0.1271587398372755, CNST_LIMB(0x788cd40268f39641), CNST_LIMB(0xfd211159fe3490f)},
+ /* 234 */ {8, 0.1270589147554692, CNST_LIMB(0x7cc07b437ecf6100), CNST_LIMB(0x6aa563e655033e3)},
+ /* 235 */ {8, 0.1269596709794558, CNST_LIMB(0x8114cc6220762061), CNST_LIMB(0xfbb614b3f2d3b14c)},
+ /* 236 */ {8, 0.1268610025517973, CNST_LIMB(0x858aa0135be10000), CNST_LIMB(0xeac0f8837fb05773)},
+ /* 237 */ {8, 0.1267629036018709, CNST_LIMB(0x8a22d3b53c54c321), CNST_LIMB(0xda6e4c10e8615ca5)},
+ /* 238 */ {8, 0.1266653683442337, CNST_LIMB(0x8ede496339f34100), CNST_LIMB(0xcab755a8d01fa67f)},
+ /* 239 */ {8, 0.1265683910770258, CNST_LIMB(0x93bde80aec3a1481), CNST_LIMB(0xbb95a9ae71aa3e0c)},
+ /* 240 */ {8, 0.1264719661804097, CNST_LIMB(0x98c29b8100000000), CNST_LIMB(0xad0326c296b4f529)},
+ /* 241 */ {8, 0.1263760881150453, CNST_LIMB(0x9ded549671832381), CNST_LIMB(0x9ef9f21eed31b7c1)},
+ /* 242 */ {8, 0.1262807514205999, CNST_LIMB(0xa33f092e0b1ac100), CNST_LIMB(0x91747422be14b0b2)},
+ /* 243 */ {8, 0.1261859507142915, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 244 */ {8, 0.1260916806894653, CNST_LIMB(0xae5b564ac3a10000), CNST_LIMB(0x77df79e9a96c06f6)},
+ /* 245 */ {8, 0.1259979361142023, CNST_LIMB(0xb427f4b3be74c361), CNST_LIMB(0x6bc6019636c7d0c2)},
+ /* 246 */ {8, 0.1259047118299582, CNST_LIMB(0xba1f9a938041e100), CNST_LIMB(0x601c4205aebd9e47)},
+ /* 247 */ {8, 0.1258120027502338, CNST_LIMB(0xc0435871d1110f41), CNST_LIMB(0x54ddc59756f05016)},
+ /* 248 */ {8, 0.1257198038592741, CNST_LIMB(0xc694446f01000000), CNST_LIMB(0x4a0648979c838c18)},
+ /* 249 */ {8, 0.1256281102107963, CNST_LIMB(0xcd137a5b57ac3ec1), CNST_LIMB(0x3f91b6e0bb3a053d)},
+ /* 250 */ {8, 0.1255369169267456, CNST_LIMB(0xd3c21bcecceda100), CNST_LIMB(0x357c299a88ea76a5)},
+ /* 251 */ {8, 0.1254462191960791, CNST_LIMB(0xdaa150410b788de1), CNST_LIMB(0x2bc1e517aecc56e3)},
+ /* 252 */ {8, 0.1253560122735751, CNST_LIMB(0xe1b24521be010000), CNST_LIMB(0x225f56ceb3da9f5d)},
+ /* 253 */ {8, 0.1252662914786691, CNST_LIMB(0xe8f62df12777c1a1), CNST_LIMB(0x1951136d53ad63ac)},
+ /* 254 */ {8, 0.1251770521943144, CNST_LIMB(0xf06e445906fc0100), CNST_LIMB(0x1093d504b3cd7d93)},
+ /* 255 */ {8, 0.1250882898658681, CNST_LIMB(0xf81bc845c81bf801), CNST_LIMB(0x824794d1ec1814f)},
+};
+#endif
diff --git a/rts/gmp/mpn/ns32k/add_n.s b/rts/gmp/mpn/ns32k/add_n.s
new file mode 100644
index 0000000000..bd063d07d9
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/add_n.s
@@ -0,0 +1,46 @@
+# ns32000 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ save [r3,r4,r5]
+ negd 28(sp),r3
+ movd r3,r0
+ lshd 2,r0
+ movd 24(sp),r4
+ subd r0,r4 # r4 -> to end of S2
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r2
+ subd r0,r2 # r2 -> to end of RES
+ subd r0,r0 # cy = 0
+
+Loop: movd r5[r3:d],r0
+ addcd r4[r3:d],r0
+ movd r0,r2[r3:d]
+ acbd 1,r3,Loop
+
+ scsd r0 # r0 = cy.
+ restore [r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/addmul_1.s b/rts/gmp/mpn/ns32k/addmul_1.s
new file mode 100644
index 0000000000..df0dcdd4af
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/addmul_1.s
@@ -0,0 +1,48 @@
+# ns32000 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ save [r3,r4,r5,r6,r7]
+ negd 24(sp),r4
+ movd r4,r0
+ lshd 2,r0
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 28(sp),r7 # r7 = s2_limb
+
+Loop: movd r5[r4:d],r2
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb
+ movd r3,r0 # r0 = new cy_limb
+ addcd 0,r0
+ addd r2,r6[r4:d]
+ acbd 1,r4,Loop
+
+ addcd 0,r0
+ restore [r7,r6,r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/mul_1.s b/rts/gmp/mpn/ns32k/mul_1.s
new file mode 100644
index 0000000000..0a77efba29
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/mul_1.s
@@ -0,0 +1,47 @@
+# ns32000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ save [r3,r4,r5,r6,r7]
+ negd 24(sp),r4
+ movd r4,r0
+ lshd 2,r0
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 28(sp),r7 # r7 = s2_limb
+
+Loop: movd r5[r4:d],r2
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb
+ movd r3,r0 # r0 = new cy_limb
+ movd r2,r6[r4:d]
+ acbd 1,r4,Loop
+
+ addcd 0,r0
+ restore [r7,r6,r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/sub_n.s b/rts/gmp/mpn/ns32k/sub_n.s
new file mode 100644
index 0000000000..cd89f4fd3f
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/sub_n.s
@@ -0,0 +1,46 @@
+# ns32000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ save [r3,r4,r5]
+ negd 28(sp),r3
+ movd r3,r0
+ lshd 2,r0
+ movd 24(sp),r4
+ subd r0,r4 # r4 -> to end of S2
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r2
+ subd r0,r2 # r2 -> to end of RES
+ subd r0,r0 # cy = 0
+
+Loop: movd r5[r3:d],r0
+ subcd r4[r3:d],r0
+ movd r0,r2[r3:d]
+ acbd 1,r3,Loop
+
+ scsd r0 # r0 = cy.
+ restore [r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/submul_1.s b/rts/gmp/mpn/ns32k/submul_1.s
new file mode 100644
index 0000000000..f811aedcf1
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/submul_1.s
@@ -0,0 +1,48 @@
+# ns32000 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_submul_1
+___gmpn_submul_1:
+ save [r3,r4,r5,r6,r7]
+ negd 24(sp),r4
+ movd r4,r0
+ lshd 2,r0
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 28(sp),r7 # r7 = s2_limb
+
+Loop: movd r5[r4:d],r2
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb
+ movd r3,r0 # r0 = new cy_limb
+ addcd 0,r0
+ subd r2,r6[r4:d]
+ acbd 1,r4,Loop
+
+ addcd 0,r0
+ restore [r7,r6,r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/pa64/README b/rts/gmp/mpn/pa64/README
new file mode 100644
index 0000000000..8d2976dabc
--- /dev/null
+++ b/rts/gmp/mpn/pa64/README
@@ -0,0 +1,38 @@
+This directory contains mpn functions for 64-bit PA-RISC 2.0.
+
+RELEVANT OPTIMIZATION ISSUES
+
+The PA8000 has a multi-issue pipeline with large buffers for instructions
+awaiting pending results. Therefore, no latency scheduling is necessary
+(and might actually be harmful).
+
+Two 64-bit loads can be completed per cycle. One 64-bit store can be
+completed per cycle. A store cannot complete in the same cycle as a load.
+
+STATUS
+
+* mpn_lshift, mpn_rshift, mpn_add_n, mpn_sub_n are all well-tuned and run at
+ the peak cache bandwidth; 1.5 cycles/limb for shifting and 2.0 cycles/limb
+ for add/subtract.
+
+* The multiplication functions run at 11 cycles/limb. The cache bandwidth
+ allows 7.5 cycles/limb. Perhaps it would be possible, using unrolling or
+ better scheduling, to get closer to the cache bandwidth limit.
+
+* xaddmul_1.S contains a quicker method for forming the 128 bit product. It
+ uses some fewer operations, and keep the carry flag live across the loop
+ boundary. But it seems hard to make it run more than 1/4 cycle faster
+ than the old code. Perhaps we really ought to unroll this loop be 2x?
+ 2x should suffice since register latency schedling is never needed,
+ but the unrolling would hide the store-load latency. Here is a sketch:
+
+ 1. A multiply and store 64-bit products
+ 2. B sum 64-bit products 128-bit product
+ 3. B load 64-bit products to integer registers
+ 4. B multiply and store 64-bit products
+ 5. A sum 64-bit products 128-bit product
+ 6. A load 64-bit products to integer registers
+ 7. goto 1
+
+ In practice, adjacent groups (1 and 2, 2 and 3, etc) will be interleaved
+ for better instruction mix.
diff --git a/rts/gmp/mpn/pa64/add_n.s b/rts/gmp/mpn/pa64/add_n.s
new file mode 100644
index 0000000000..22ff19c184
--- /dev/null
+++ b/rts/gmp/mpn/pa64/add_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+; store sum in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_add_n,entry
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ sub %r0,%r23,%r22
+ depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ sub %r26,%r22,%r26 ; offset res_ptr
+ blr %r28,%r0 ; branch into loop
+ add %r0,%r0,%r0 ; reset carry
+
+L$loop ldd 0(%r25),%r20
+ ldd 0(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,0(%r26)
+L$7 ldd 8(%r25),%r21
+ ldd 8(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,8(%r26)
+L$6 ldd 16(%r25),%r20
+ ldd 16(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,16(%r26)
+L$5 ldd 24(%r25),%r21
+ ldd 24(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,24(%r26)
+L$4 ldd 32(%r25),%r20
+ ldd 32(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,32(%r26)
+L$3 ldd 40(%r25),%r21
+ ldd 40(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,40(%r26)
+L$2 ldd 48(%r25),%r20
+ ldd 48(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,48(%r26)
+L$1 ldd 56(%r25),%r21
+ ldo 64(%r25),%r25
+ ldd 56(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,56(%r26)
+ ldo 64(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 64(%r26),%r26
+
+ add,dc %r0,%r0,%r29
+ bve (%r2)
+ .exit
+ ldi 0,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/addmul_1.S b/rts/gmp/mpn/pa64/addmul_1.S
new file mode 100644
index 0000000000..b1885b432c
--- /dev/null
+++ b/rts/gmp/mpn/pa64/addmul_1.S
@@ -0,0 +1,167 @@
+; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb -56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publically documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0n
+ .code
+ .export __gmpn_addmul_1,entry
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ fldd -56(%r30),%fr5 ; s2limb passed on stack
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m0
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m0
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,rlimb,rlimb
+ add,dc t2,hi,cylimb
+ add t4,rlimb,t3
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ extrd,u cylimb,31,32,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64/gmp-mparam.h b/rts/gmp/mpn/pa64/gmp-mparam.h
new file mode 100644
index 0000000000..847735b987
--- /dev/null
+++ b/rts/gmp/mpn/pa64/gmp-mparam.h
@@ -0,0 +1,65 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values were measured in a PA8000 using the system compiler version
+ A.10.32.30. Presumably the PA8200 and PA8500 have the same timing
+ characteristic, but GCC might give somewhat different results. */
+/* Generated by tuneup.c, 2000-07-25. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 16
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 105
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 40
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 116
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 72
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 94
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 50
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 1
+#endif
diff --git a/rts/gmp/mpn/pa64/lshift.s b/rts/gmp/mpn/pa64/lshift.s
new file mode 100644
index 0000000000..994bc1c4d6
--- /dev/null
+++ b/rts/gmp/mpn/pa64/lshift.s
@@ -0,0 +1,103 @@
+; HP-PA 2.0 __gmpn_lshift --
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_lshift,entry
+__gmpn_lshift
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ shladd %r24,3,%r25,%r25
+ shladd %r24,3,%r26,%r26
+ subi 64,%r23,%r23
+ mtsar %r23
+ ldd -8(%r25),%r21
+ addib,= -1,%r24,L$end
+ shrpd %r0,%r21,%sar,%r29 ; compute carry out limb
+ depw,z %r24,31,3,%r28 ; r28 = (size & 7)
+ sub %r0,%r24,%r22
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7)
+ add %r25,%r22,%r25 ; offset s1_ptr
+ blr %r28,%r0 ; branch into jump table
+ add %r26,%r22,%r26 ; offset res_ptr
+ b L$0
+ nop
+ b L$1
+ copy %r21,%r20
+ b L$2
+ nop
+ b L$3
+ copy %r21,%r20
+ b L$4
+ nop
+ b L$5
+ copy %r21,%r20
+ b L$6
+ nop
+ b L$7
+ copy %r21,%r20
+
+L$loop
+L$0 ldd -16(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-8(%r26)
+L$7 ldd -24(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-16(%r26)
+L$6 ldd -32(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-24(%r26)
+L$5 ldd -40(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-32(%r26)
+L$4 ldd -48(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-40(%r26)
+L$3 ldd -56(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-48(%r26)
+L$2 ldd -64(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-56(%r26)
+L$1 ldd -72(%r25),%r21
+ ldo -64(%r25),%r25
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-64(%r26)
+ addib,> -8,%r24,L$loop
+ ldo -64(%r26),%r26
+
+L$end shrpd %r21,%r0,%sar,%r21
+ std %r21,-8(%r26)
+ bve (%r2)
+ .exit
+ extrd,u %r29,31,32,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/mul_1.S b/rts/gmp/mpn/pa64/mul_1.S
new file mode 100644
index 0000000000..ab310c1264
--- /dev/null
+++ b/rts/gmp/mpn/pa64/mul_1.S
@@ -0,0 +1,158 @@
+; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and
+; store the result in a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb -56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0n
+ .code
+ .export __gmpn_mul_1,entry
+__gmpn_mul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ fldd -56(%r30),%fr5 ; s2limb passed on stack
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t3
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t3
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+	extrd,u lo,63,32,t4	; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,t4,t3
+ add,dc t2,hi,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ extrd,u cylimb,31,32,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64/rshift.s b/rts/gmp/mpn/pa64/rshift.s
new file mode 100644
index 0000000000..f0730e2a91
--- /dev/null
+++ b/rts/gmp/mpn/pa64/rshift.s
@@ -0,0 +1,100 @@
+; HP-PA 2.0 __gmpn_rshift --
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_rshift,entry
+__gmpn_rshift
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ mtsar %r23
+ ldd 0(%r25),%r21
+ addib,= -1,%r24,L$end
+ shrpd %r21,%r0,%sar,%r29 ; compute carry out limb
+ depw,z %r24,31,3,%r28 ; r28 = (size & 7)
+ sub %r0,%r24,%r22
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ blr %r28,%r0 ; branch into jump table
+ sub %r26,%r22,%r26 ; offset res_ptr
+ b L$0
+ nop
+ b L$1
+ copy %r21,%r20
+ b L$2
+ nop
+ b L$3
+ copy %r21,%r20
+ b L$4
+ nop
+ b L$5
+ copy %r21,%r20
+ b L$6
+ nop
+ b L$7
+ copy %r21,%r20
+
+L$loop
+L$0 ldd 8(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,0(%r26)
+L$7 ldd 16(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,8(%r26)
+L$6 ldd 24(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,16(%r26)
+L$5 ldd 32(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,24(%r26)
+L$4 ldd 40(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,32(%r26)
+L$3 ldd 48(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,40(%r26)
+L$2 ldd 56(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,48(%r26)
+L$1 ldd 64(%r25),%r21
+ ldo 64(%r25),%r25
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,56(%r26)
+ addib,> -8,%r24,L$loop
+ ldo 64(%r26),%r26
+
+L$end shrpd %r0,%r21,%sar,%r21
+ std %r21,0(%r26)
+ bve (%r2)
+ .exit
+ extrd,u %r29,31,32,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/sub_n.s b/rts/gmp/mpn/pa64/sub_n.s
new file mode 100644
index 0000000000..dda1f54b34
--- /dev/null
+++ b/rts/gmp/mpn/pa64/sub_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0
+; and store difference in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_sub_n,entry
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ sub %r0,%r23,%r22
+ depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ blr %r28,%r0 ; branch into loop
+ sub %r26,%r22,%r26 ; offset res_ptr and set carry
+
+L$loop ldd 0(%r25),%r20
+ ldd 0(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,0(%r26)
+L$7 ldd 8(%r25),%r21
+ ldd 8(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,8(%r26)
+L$6 ldd 16(%r25),%r20
+ ldd 16(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,16(%r26)
+L$5 ldd 24(%r25),%r21
+ ldd 24(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,24(%r26)
+L$4 ldd 32(%r25),%r20
+ ldd 32(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,32(%r26)
+L$3 ldd 40(%r25),%r21
+ ldd 40(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,40(%r26)
+L$2 ldd 48(%r25),%r20
+ ldd 48(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,48(%r26)
+L$1 ldd 56(%r25),%r21
+ ldo 64(%r25),%r25
+ ldd 56(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,56(%r26)
+ ldo 64(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 64(%r26),%r26
+
+ add,dc %r0,%r0,%r29
+ subi 1,%r29,%r29
+ bve (%r2)
+ .exit
+ ldi 0,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/submul_1.S b/rts/gmp/mpn/pa64/submul_1.S
new file mode 100644
index 0000000000..27666b99df
--- /dev/null
+++ b/rts/gmp/mpn/pa64/submul_1.S
@@ -0,0 +1,170 @@
+; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb -56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0n
+ .code
+ .export __gmpn_submul_1,entry
+__gmpn_submul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ fldd -56(%r30),%fr5 ; s2limb passed on stack
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t4
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t4
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,t4,t4
+ add,dc t2,hi,cylimb
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ extrd,u cylimb,31,32,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64/udiv_qrnnd.c b/rts/gmp/mpn/pa64/udiv_qrnnd.c
new file mode 100644
index 0000000000..1c9fe084db
--- /dev/null
+++ b/rts/gmp/mpn/pa64/udiv_qrnnd.c
@@ -0,0 +1,111 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#define TWO64 18446744073709551616.0
+
+mp_limb_t
+#if __STDC__
+__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r)
+#else
+__MPN(udiv_qrnnd) (n1, n0, d, r)
+ mp_limb_t n1;
+ mp_limb_t n0;
+ mp_limb_t d;
+ mp_limb_t *r;
+#endif
+{
+ mp_limb_t q1, q2, q;
+ mp_limb_t p1, p0;
+ double di, dq;
+
+ di = 1.0 / d;
+
+ /* Generate upper 53 bits of quotient. Be careful here; the `double'
+ quotient may be rounded to 2^64 which we cannot safely convert back
+ to a 64-bit integer. */
+ dq = (TWO64 * (double) n1 + (double) n0) * di;
+ if (dq >= TWO64)
+ q1 = 0xfffffffffffff800LL;
+ else
+ q1 = (mp_limb_t) dq;
+
+ /* Multiply back in order to compare the product to the dividend. */
+ umul_ppmm (p1, p0, q1, d);
+
+  /* Was the 53-bit quotient greater than our sought quotient?  Test the
+ sign of the partial remainder to find out. */
+ if (n1 < p1 || (n1 == p1 && n0 < p0))
+ {
+ /* 53-bit quotient too large. Partial remainder is negative.
+ Compute the absolute value of the remainder in n1,,n0. */
+ n1 = p1 - (n1 + (p0 < n0));
+ n0 = p0 - n0;
+
+ /* Now use the partial remainder as new dividend to compute more bits of
+ quotient. This is an adjustment for the one we got previously. */
+ q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+ umul_ppmm (p1, p0, q2, d);
+
+ q = q1 - q2;
+ if (n1 < p1 || (n1 == p1 && n0 <= p0))
+ {
+ n0 = p0 - n0;
+ }
+ else
+ {
+ n0 = p0 - n0;
+ n0 += d;
+ q--;
+ }
+ }
+ else
+ {
+ n1 = n1 - (p1 + (n0 < p0));
+ n0 = n0 - p0;
+
+ q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+ umul_ppmm (p1, p0, q2, d);
+
+ q = q1 + q2;
+ if (n1 < p1 || (n1 == p1 && n0 < p0))
+ {
+ n0 = n0 - p0;
+ n0 += d;
+ q--;
+ }
+ else
+ {
+ n0 = n0 - p0;
+ if (n0 >= d)
+ {
+ n0 -= d;
+ q++;
+ }
+ }
+ }
+
+ *r = n0;
+ return q;
+}
diff --git a/rts/gmp/mpn/pa64/umul_ppmm.S b/rts/gmp/mpn/pa64/umul_ppmm.S
new file mode 100644
index 0000000000..ceff2d752f
--- /dev/null
+++ b/rts/gmp/mpn/pa64/umul_ppmm.S
@@ -0,0 +1,74 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+#define p0 %r28
+#define p1 %r29
+#define t32 %r19
+#define t0 %r20
+#define t1 %r21
+#define x %r22
+#define m0 %r23
+#define m1 %r24
+ .level 2.0n
+ .code
+ .export __gmpn_umul_ppmm,entry
+__gmpn_umul_ppmm
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ ldo 128(%r30),%r30
+ depd %r25,31,32,%r26
+ std %r26,-64(%r30)
+ depd %r23,31,32,%r24
+ std %r24,-56(%r30)
+
+ ldw -180(%r30),%r31
+
+ fldd -64(%r30),%fr4
+ fldd -56(%r30),%fr5
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+
+ depdi,z 1,31,1,t32 ; t32 = 2^32
+
+ ldd -128(%r30),p0 ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),p1 ; hi = high 64 bit of product
+
+ add,l,*nuv m0,m1,x ; x = m1+m0
+ add,l t32,p1,p1 ; propagate carry to mid of p1
+ depd,z x,31,32,t0 ; lo32(m1+m0)
+ add t0,p0,p0
+ extrd,u x,31,32,t1 ; hi32(m1+m0)
+ add,dc t1,p1,p1
+
+ std p0,0(%r31) ; store low half of product
+ extrd,u p1,31,32,%r28 ; return high half of product
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64w/README b/rts/gmp/mpn/pa64w/README
new file mode 100644
index 0000000000..cf590a7b98
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/README
@@ -0,0 +1,2 @@
+This directory contains mpn functions for 64-bit PA-RISC 2.0
+using 64-bit pointers (2.0W).
diff --git a/rts/gmp/mpn/pa64w/add_n.s b/rts/gmp/mpn/pa64w/add_n.s
new file mode 100644
index 0000000000..1bb9e8fbc7
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/add_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+; store sum in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .level 2.0w
+ .code
+ .export __gmpn_add_n,entry
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ sub %r0,%r23,%r22
+ depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ sub %r26,%r22,%r26 ; offset res_ptr
+ blr %r28,%r0 ; branch into loop
+ add %r0,%r0,%r0 ; reset carry
+
+L$loop ldd 0(%r25),%r20
+ ldd 0(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,0(%r26)
+L$7 ldd 8(%r25),%r21
+ ldd 8(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,8(%r26)
+L$6 ldd 16(%r25),%r20
+ ldd 16(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,16(%r26)
+L$5 ldd 24(%r25),%r21
+ ldd 24(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,24(%r26)
+L$4 ldd 32(%r25),%r20
+ ldd 32(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,32(%r26)
+L$3 ldd 40(%r25),%r21
+ ldd 40(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,40(%r26)
+L$2 ldd 48(%r25),%r20
+ ldd 48(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,48(%r26)
+L$1 ldd 56(%r25),%r21
+ ldo 64(%r25),%r25
+ ldd 56(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,56(%r26)
+ ldo 64(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 64(%r26),%r26
+
+ add,dc %r0,%r0,%r29
+ bve (%r2)
+ .exit
+ copy %r29,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64w/addmul_1.S b/rts/gmp/mpn/pa64w/addmul_1.S
new file mode 100644
index 0000000000..4799f90fc5
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/addmul_1.S
@@ -0,0 +1,168 @@
+; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb %r23
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0w
+ .code
+ .export __gmpn_addmul_1,entry
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ std s2limb,-56(%r30)
+ fldd -56(%r30),%fr5
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+	add,l,*nuv m1,t1,t1	; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,rlimb,rlimb
+ add,dc t2,hi,cylimb
+ add t4,rlimb,t3
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ copy cylimb,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64w/gmp-mparam.h b/rts/gmp/mpn/pa64w/gmp-mparam.h
new file mode 100644
index 0000000000..ee5a0a3ab7
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/gmp-mparam.h
@@ -0,0 +1,65 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values were measured on a PA8500 using the system compiler version
+ A.11.01.02. Presumably the PA8000 and PA8200 have the same timing
+ characteristic, but GCC might give somewhat different results.. */
+/* Generated by tuneup.c, 2000-07-25. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 18
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 105
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 46
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 83
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 58
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 134
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 56
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 26
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 1
+#endif
diff --git a/rts/gmp/mpn/pa64w/lshift.s b/rts/gmp/mpn/pa64w/lshift.s
new file mode 100644
index 0000000000..84f925a105
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/lshift.s
@@ -0,0 +1,103 @@
+; HP-PA 2.0 __gmpn_lshift --
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+ .level 2.0w
+ .code
+ .export __gmpn_lshift,entry
+__gmpn_lshift
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ shladd %r24,3,%r25,%r25
+ shladd %r24,3,%r26,%r26
+ subi 64,%r23,%r23
+ mtsar %r23
+ ldd -8(%r25),%r21
+ addib,= -1,%r24,L$end
+ shrpd %r0,%r21,%sar,%r29 ; compute carry out limb
+ depw,z %r24,31,3,%r28 ; r28 = (size & 7)
+ sub %r0,%r24,%r22
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7)
+ add %r25,%r22,%r25 ; offset s1_ptr
+ blr %r28,%r0 ; branch into jump table
+ add %r26,%r22,%r26 ; offset res_ptr
+ b L$0
+ nop
+ b L$1
+ copy %r21,%r20
+ b L$2
+ nop
+ b L$3
+ copy %r21,%r20
+ b L$4
+ nop
+ b L$5
+ copy %r21,%r20
+ b L$6
+ nop
+ b L$7
+ copy %r21,%r20
+
+L$loop
+L$0 ldd -16(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-8(%r26)
+L$7 ldd -24(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-16(%r26)
+L$6 ldd -32(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-24(%r26)
+L$5 ldd -40(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-32(%r26)
+L$4 ldd -48(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-40(%r26)
+L$3 ldd -56(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-48(%r26)
+L$2 ldd -64(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-56(%r26)
+L$1 ldd -72(%r25),%r21
+ ldo -64(%r25),%r25
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-64(%r26)
+ addib,> -8,%r24,L$loop
+ ldo -64(%r26),%r26
+
+L$end shrpd %r21,%r0,%sar,%r21
+ std %r21,-8(%r26)
+ bve (%r2)
+ .exit
+ copy %r29,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64w/mul_1.S b/rts/gmp/mpn/pa64w/mul_1.S
new file mode 100644
index 0000000000..48f13fbd1b
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/mul_1.S
@@ -0,0 +1,159 @@
+; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and
+; store the result in a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb %r23
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+	.level	2.0w
+	.code
+	.export	__gmpn_mul_1,entry
+__gmpn_mul_1
+	.proc
+	.callinfo frame=128,no_calls
+	.entry
+	std		s2limb,-56(%r30)	; spill s2limb so FP unit can load it
+	fldd		-56(%r30),%fr5		; s2limb into FP reg for xmpyu
+	ldo		128(%r30),%r30		; allocate stack frame
+	add		%r0,%r0,cylimb	; clear cy and cylimb
+
+	std		%r3,-96(%r30)		; save callee-saved regs
+	std		%r4,-88(%r30)
+	std		%r5,-80(%r30)
+	std		%r6,-72(%r30)
+	depdi,z		1,31,1,%r5		; %r5 = 2^32
+
+	fldd		0(sptr),%fr4
+	ldo		8(sptr),sptr
+
+	xmpyu		%fr5R,%fr4R,%fr6	; 32x32 partial products
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	ldd		-128(%r30),lo	; lo = low 64 bit of product
+	ldd		-120(%r30),m0	; m0 = mid0 64 bit of product
+	ldd		-112(%r30),m1	; m1 = mid1 64 bit of product
+	ldd		-104(%r30),hi	; hi = high 64 bit of product
+	addib,=		-1,%r24,L$end1
+	nop
+	fldd		0(sptr),%fr4
+	ldo		8(sptr),sptr
+	addib,=		-1,%r24,L$end2
+	nop
+L$loop
+	xmpyu		%fr5R,%fr4R,%fr6
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	extrd,u		lo,31,32,t1	; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4	; t4 = lo32(lo)
+	add,l		m0,t1,t1	; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+	add,l		%r5,hi,hi	; propagate carry
+	extrd,u		t1,31,32,t2	; t2 = hi32(t1)
+	depd,z		t1,31,32,t5	; t5 = lo32(t1)
+	add,l		t5,t4,t4	; t4 += lo32(t1)
+	ldd		-128(%r30),lo	; lo = low 64 bit of product
+	add		cylimb,t4,t3
+	ldd		-120(%r30),m0	; m0 = mid0 64 bit of product
+	add,dc		t2,hi,cylimb
+	ldd		-112(%r30),m1	; m1 = mid1 64 bit of product
+	ldd		-104(%r30),hi	; hi = high 64 bit of product
+	fldd		0(sptr),%fr4
+	ldo		8(sptr),sptr
+	std		t3,0(rptr)
+	addib,<>	-1,%r24,L$loop
+	ldo		8(rptr),rptr
+L$end2
+	xmpyu		%fr5R,%fr4R,%fr6
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	extrd,u		lo,31,32,t1	; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4	; t4 = lo32(lo)
+	add,l		m0,t1,t1	; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+	add,l		%r5,hi,hi	; propagate carry
+	extrd,u		t1,31,32,t2	; t2 = hi32(t1)
+	depd,z		t1,31,32,t5	; t5 = lo32(t1)
+	add,l		t5,t4,t4	; t4 += lo32(t1)
+	ldd		-128(%r30),lo	; lo = low 64 bit of product
+	add		cylimb,t4,t3
+	ldd		-120(%r30),m0	; m0 = mid0 64 bit of product
+	add,dc		t2,hi,cylimb
+	ldd		-112(%r30),m1	; m1 = mid1 64 bit of product
+	ldd		-104(%r30),hi	; hi = high 64 bit of product
+	std		t3,0(rptr)
+	ldo		8(rptr),rptr
+L$end1
+	extrd,u		lo,31,32,t1	; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4	; t4 = lo32(lo)
+	add,l		m0,t1,t1	; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+	add,l		%r5,hi,hi	; propagate carry
+	extrd,u		t1,31,32,t2	; t2 = hi32(t1)
+	depd,z		t1,31,32,t5	; t5 = lo32(t1)
+	add,l		t5,t4,t4	; t4 += lo32(t1)
+	add		cylimb,t4,t3
+	add,dc		t2,hi,cylimb
+	std		t3,0(rptr)
+	ldo		8(rptr),rptr
+
+	ldd		-96(%r30),%r3		; restore callee-saved regs
+	ldd		-88(%r30),%r4
+	ldd		-80(%r30),%r5
+	ldd		-72(%r30),%r6
+
+	copy		cylimb,%r28		; return carry-out limb
+	bve		(%r2)
+	.exit
+	ldo		-128(%r30),%r30		; deallocate frame (delay slot)
+	.procend
diff --git a/rts/gmp/mpn/pa64w/rshift.s b/rts/gmp/mpn/pa64w/rshift.s
new file mode 100644
index 0000000000..2517cb1f87
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/rshift.s
@@ -0,0 +1,100 @@
+; HP-PA 2.0 __gmpn_rshift --
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+	.level	2.0w
+	.code
+	.export	__gmpn_rshift,entry
+__gmpn_rshift
+	.proc
+	.callinfo frame=0,args_saved
+	.entry
+
+	mtsar		%r23			; shift count into SAR
+	ldd		0(%r25),%r21		; load least significant limb
+	addib,=		-1,%r24,L$end		; size--; branch if single-limb case
+	shrpd		%r21,%r0,%sar,%r29	; compute carry out limb
+	depw,z		%r24,31,3,%r28		; r28 = (size & 7)
+	sub		%r0,%r24,%r22
+	depw,z		%r22,28,3,%r22		; r22 = 8 * (-size & 7)
+	sub		%r25,%r22,%r25		; offset s1_ptr
+	blr		%r28,%r0		; branch into jump table
+	sub		%r26,%r22,%r26		; offset res_ptr
+	b		L$0
+	nop					; delay slot
+	b		L$1
+	copy		%r21,%r20		; delay slot
+	b		L$2
+	nop
+	b		L$3
+	copy		%r21,%r20
+	b		L$4
+	nop
+	b		L$5
+	copy		%r21,%r20
+	b		L$6
+	nop
+	b		L$7
+	copy		%r21,%r20
+
+L$loop						; 8-way unrolled shift loop
+L$0	ldd		8(%r25),%r20
+	shrpd		%r20,%r21,%sar,%r21
+	std		%r21,0(%r26)
+L$7	ldd		16(%r25),%r21
+	shrpd		%r21,%r20,%sar,%r20
+	std		%r20,8(%r26)
+L$6	ldd		24(%r25),%r20
+	shrpd		%r20,%r21,%sar,%r21
+	std		%r21,16(%r26)
+L$5	ldd		32(%r25),%r21
+	shrpd		%r21,%r20,%sar,%r20
+	std		%r20,24(%r26)
+L$4	ldd		40(%r25),%r20
+	shrpd		%r20,%r21,%sar,%r21
+	std		%r21,32(%r26)
+L$3	ldd		48(%r25),%r21
+	shrpd		%r21,%r20,%sar,%r20
+	std		%r20,40(%r26)
+L$2	ldd		56(%r25),%r20
+	shrpd		%r20,%r21,%sar,%r21
+	std		%r21,48(%r26)
+L$1	ldd		64(%r25),%r21
+	ldo		64(%r25),%r25
+	shrpd		%r21,%r20,%sar,%r20
+	std		%r20,56(%r26)
+	addib,>		-8,%r24,L$loop		; size -= 8; loop while limbs remain
+	ldo		64(%r26),%r26		; advance res_ptr (delay slot)
+
+L$end	shrpd		%r0,%r21,%sar,%r21	; most significant result limb
+	std		%r21,0(%r26)
+	bve		(%r2)			; return
+	.exit
+	copy		%r29,%r28		; return carry-out limb (delay slot)
+	.procend
diff --git a/rts/gmp/mpn/pa64w/sub_n.s b/rts/gmp/mpn/pa64w/sub_n.s
new file mode 100644
index 0000000000..ad01e24aa7
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/sub_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0
+; and store difference in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+	.level	2.0w
+	.code
+	.export	__gmpn_sub_n,entry
+__gmpn_sub_n
+	.proc
+	.callinfo frame=0,args_saved
+	.entry
+
+	sub		%r0,%r23,%r22		; r22 = -size
+	depw,z		%r22,30,3,%r28		; r28 = 2 * (-n & 7)
+	depw,z		%r22,28,3,%r22		; r22 = 8 * (-n & 7)
+	sub		%r25,%r22,%r25		; offset s1_ptr
+	sub		%r24,%r22,%r24		; offset s2_ptr
+	blr		%r28,%r0		; branch into loop
+	sub		%r26,%r22,%r26		; offset res_ptr and set carry
+
+L$loop	ldd		0(%r25),%r20		; 8-way unrolled subtract loop
+	ldd		0(%r24),%r31
+	sub,db		%r20,%r31,%r20		; subtract with borrow
+	std		%r20,0(%r26)
+L$7	ldd		8(%r25),%r21
+	ldd		8(%r24),%r19
+	sub,db		%r21,%r19,%r21
+	std		%r21,8(%r26)
+L$6	ldd		16(%r25),%r20
+	ldd		16(%r24),%r31
+	sub,db		%r20,%r31,%r20
+	std		%r20,16(%r26)
+L$5	ldd		24(%r25),%r21
+	ldd		24(%r24),%r19
+	sub,db		%r21,%r19,%r21
+	std		%r21,24(%r26)
+L$4	ldd		32(%r25),%r20
+	ldd		32(%r24),%r31
+	sub,db		%r20,%r31,%r20
+	std		%r20,32(%r26)
+L$3	ldd		40(%r25),%r21
+	ldd		40(%r24),%r19
+	sub,db		%r21,%r19,%r21
+	std		%r21,40(%r26)
+L$2	ldd		48(%r25),%r20
+	ldd		48(%r24),%r31
+	sub,db		%r20,%r31,%r20
+	std		%r20,48(%r26)
+L$1	ldd		56(%r25),%r21
+	ldo		64(%r25),%r25		; advance s1_ptr
+	ldd		56(%r24),%r19
+	sub,db		%r21,%r19,%r21
+	std		%r21,56(%r26)
+	ldo		64(%r24),%r24		; advance s2_ptr
+	addib,>		-8,%r23,L$loop		; size -= 8; loop while limbs remain
+	ldo		64(%r26),%r26		; advance res_ptr (delay slot)
+
+	add,dc		%r0,%r0,%r29		; r29 = final carry (0 or 1)
+	subi		1,%r29,%r29		; borrow-out = 1 - carry
+	bve		(%r2)			; return
+	.exit
+	copy		%r29,%r28		; return borrow (delay slot)
+	.procend
diff --git a/rts/gmp/mpn/pa64w/submul_1.S b/rts/gmp/mpn/pa64w/submul_1.S
new file mode 100644
index 0000000000..294f6239b2
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/submul_1.S
@@ -0,0 +1,171 @@
+; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb %r23
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+	.level	2.0w
+	.code
+	.export	__gmpn_submul_1,entry
+__gmpn_submul_1
+	.proc
+	.callinfo frame=128,no_calls
+	.entry
+	std		s2limb,-56(%r30)	; spill s2limb so FP unit can load it
+	fldd		-56(%r30),%fr5		; s2limb into FP reg for xmpyu
+	ldo		128(%r30),%r30		; allocate stack frame
+	add		%r0,%r0,cylimb	; clear cy and cylimb
+
+	std		%r3,-96(%r30)		; save callee-saved regs
+	std		%r4,-88(%r30)
+	std		%r5,-80(%r30)
+	std		%r6,-72(%r30)
+	depdi,z		1,31,1,%r5		; %r5 = 2^32
+
+	fldd		0(sptr),%fr4
+	ldo		8(sptr),sptr
+
+	xmpyu		%fr5R,%fr4R,%fr6	; 32x32 partial products
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	ldd		-128(%r30),lo	; lo = low 64 bit of product
+	ldd		-120(%r30),m0	; m0 = mid0 64 bit of product
+	ldd		-112(%r30),m1	; m1 = mid1 64 bit of product
+	ldd		-104(%r30),hi	; hi = high 64 bit of product
+	addib,=		-1,%r24,L$end1
+	nop
+	fldd		0(sptr),%fr4
+	ldo		8(sptr),sptr
+	addib,=		-1,%r24,L$end2
+	nop
+L$loop
+	xmpyu		%fr5R,%fr4R,%fr6
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	ldd		0(rptr),rlimb
+	extrd,u		lo,31,32,t1	; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4	; t4 = lo32(lo)
+	add,l		m0,t1,t1	; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+	add,l		%r5,hi,hi	; propagate carry
+	extrd,u		t1,31,32,t2	; t2 = hi32(t1)
+	depd,z		t1,31,32,t5	; t5 = lo32(t1)
+	add,l		t5,t4,t4	; t4 += lo32(t1)
+	ldd		-128(%r30),lo	; lo = low 64 bit of product
+	add		cylimb,t4,t4
+	ldd		-120(%r30),m0	; m0 = mid0 64 bit of product
+	add,dc		t2,hi,cylimb
+	ldd		-112(%r30),m1	; m1 = mid1 64 bit of product
+	sub		rlimb,t4,t3	; t3 = rlimb - t4
+	add		t4,t3,%r0	; regenerate borrow as carry (result discarded)
+	ldd		-104(%r30),hi	; hi = high 64 bit of product
+	add,dc		%r0,cylimb,cylimb	; fold borrow into cylimb
+	fldd		0(sptr),%fr4
+	ldo		8(sptr),sptr
+	std		t3,0(rptr)
+	addib,<>	-1,%r24,L$loop
+	ldo		8(rptr),rptr
+L$end2
+	xmpyu		%fr5R,%fr4R,%fr6
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+	ldd		0(rptr),rlimb
+	extrd,u		lo,31,32,t1	; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4	; t4 = lo32(lo)
+	add,l		m0,t1,t1	; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+	add,l		%r5,hi,hi	; propagate carry
+	extrd,u		t1,31,32,t2	; t2 = hi32(t1)
+	depd,z		t1,31,32,t5	; t5 = lo32(t1)
+	add,l		t5,t4,t4	; t4 += lo32(t1)
+	ldd		-128(%r30),lo	; lo = low 64 bit of product
+	add		cylimb,t4,t4
+	ldd		-120(%r30),m0	; m0 = mid0 64 bit of product
+	add,dc		t2,hi,cylimb
+	ldd		-112(%r30),m1	; m1 = mid1 64 bit of product
+	sub		rlimb,t4,t3	; t3 = rlimb - t4
+	add		t4,t3,%r0	; regenerate borrow as carry (result discarded)
+	ldd		-104(%r30),hi	; hi = high 64 bit of product
+	add,dc		%r0,cylimb,cylimb	; fold borrow into cylimb
+	std		t3,0(rptr)
+	ldo		8(rptr),rptr
+L$end1
+	ldd		0(rptr),rlimb
+	extrd,u		lo,31,32,t1	; t1 = hi32(lo)
+	extrd,u		lo,63,32,t4	; t4 = lo32(lo)
+	add,l		m0,t1,t1	; t1 += m0
+	add,l,*nuv	m1,t1,t1	; t1 += m1
+	add,l		%r5,hi,hi	; propagate carry
+	extrd,u		t1,31,32,t2	; t2 = hi32(t1)
+	depd,z		t1,31,32,t5	; t5 = lo32(t1)
+	add,l		t5,t4,t4	; t4 += lo32(t1)
+	add		cylimb,t4,t4
+	add,dc		t2,hi,cylimb
+	sub		rlimb,t4,t3	; t3 = rlimb - t4
+	add		t4,t3,%r0	; regenerate borrow as carry (result discarded)
+	add,dc		%r0,cylimb,cylimb	; fold borrow into cylimb
+	std		t3,0(rptr)
+	ldo		8(rptr),rptr
+
+	ldd		-96(%r30),%r3		; restore callee-saved regs
+	ldd		-88(%r30),%r4
+	ldd		-80(%r30),%r5
+	ldd		-72(%r30),%r6
+
+	copy		cylimb,%r28		; return borrow-out limb
+	bve		(%r2)
+	.exit
+	ldo		-128(%r30),%r30		; deallocate frame (delay slot)
+	.procend
diff --git a/rts/gmp/mpn/pa64w/udiv_qrnnd.c b/rts/gmp/mpn/pa64w/udiv_qrnnd.c
new file mode 100644
index 0000000000..1852913000
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/udiv_qrnnd.c
@@ -0,0 +1,117 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#define TWO64 18446744073709551616.0
+#define TWO63 9223372036854775808.0
+
+mp_limb_t
+#if __STDC__
+__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r)
+#else
+__MPN(udiv_qrnnd) (n1, n0, d, r)
+     mp_limb_t n1;
+     mp_limb_t n0;
+     mp_limb_t d;
+     mp_limb_t *r;
+#endif
+{
+  mp_limb_t q1, q2, q;
+  mp_limb_t p1, p0;
+  double di, dq;
+
+  di = 1.0 / d;  /* approximate reciprocal of the divisor */
+
+  /* Generate upper 53 bits of quotient.  Be careful here; the `double'
+     quotient may be rounded to 2^64 which we cannot safely convert back
+     to a 64-bit integer.  */
+  dq = (TWO64 * (double) n1 + (double) n0) * di;
+  if (dq >= TWO64)
+    q1 = 0xfffffffffffff800L;  /* clamp: 2^64 - 2^11, exactly representable */
+#ifndef __GNUC__
+  /* Work around HP compiler bug.  */
+  else if (dq > TWO63)
+    q1 = (mp_limb_t) (dq - TWO63) + 0x8000000000000000L;
+#endif
+  else
+    q1 = (mp_limb_t) dq;
+
+  /* Multiply back in order to compare the product to the dividend.  */
+  umul_ppmm (p1, p0, q1, d);
+
+  /* Was the 53-bit quotient greater that our sought quotient?  Test the
+     sign of the partial remainder to find out.  */
+  if (n1 < p1 || (n1 == p1 && n0 < p0))
+    {
+      /* 53-bit quotient too large.  Partial remainder is negative.
+	 Compute the absolute value of the remainder in n1,,n0.  */
+      n1 = p1 - (n1 + (p0 < n0));
+      n0 = p0 - n0;
+
+      /* Now use the partial remainder as new dividend to compute more bits of
+	 quotient.  This is an adjustment for the one we got previously.  */
+      q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+      umul_ppmm (p1, p0, q2, d);
+
+      q = q1 - q2;  /* adjust first estimate downward */
+      if (n1 < p1 || (n1 == p1 && n0 <= p0))
+	{
+	  n0 = p0 - n0;
+	}
+      else
+	{
+	  n0 = p0 - n0;
+	  n0 += d;  /* remainder went negative: undo one quotient step */
+	  q--;
+	}
+    }
+  else
+    {
+      n1 = n1 - (p1 + (n0 < p0));
+      n0 = n0 - p0;
+
+      q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+      umul_ppmm (p1, p0, q2, d);
+
+      q = q1 + q2;  /* adjust first estimate upward */
+      if (n1 < p1 || (n1 == p1 && n0 < p0))
+	{
+	  n0 = n0 - p0;
+	  n0 += d;  /* remainder went negative: undo one quotient step */
+	  q--;
+	}
+      else
+	{
+	  n0 = n0 - p0;
+	  if (n0 >= d)  /* estimate one too small: take a final step */
+	    {
+	      n0 -= d;
+	      q++;
+	    }
+	}
+    }
+
+  *r = n0;  /* remainder */
+  return q;
+}
diff --git a/rts/gmp/mpn/pa64w/umul_ppmm.S b/rts/gmp/mpn/pa64w/umul_ppmm.S
new file mode 100644
index 0000000000..d9fb92be8c
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/umul_ppmm.S
@@ -0,0 +1,72 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+#define p0 %r28
+#define p1 %r29
+#define t32 %r19
+#define t0 %r20
+#define t1 %r21
+#define x %r22
+#define m0 %r23
+#define m1 %r24
+ .level 2.0w
+ .code
+ .export __gmpn_umul_ppmm,entry
+__gmpn_umul_ppmm
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ ldo 128(%r30),%r30
+ std %r26,-64(%r30)
+ std %r25,-56(%r30)
+
+ copy %r24,%r31
+
+ fldd -64(%r30),%fr4
+ fldd -56(%r30),%fr5
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+
+ depdi,z 1,31,1,t32 ; t32 = 2^32
+
+ ldd -128(%r30),p0 ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),p1 ; hi = high 64 bit of product
+
+ add,l,*nuv m0,m1,x ; x = m1+m0
+ add,l t32,p1,p1 ; propagate carry to mid of p1
+ depd,z x,31,32,t0 ; lo32(m1+m0)
+ add t0,p0,p0
+ extrd,u x,31,32,t1 ; hi32(m1+m0)
+ add,dc t1,p1,p1
+
+ std p0,0(%r31) ; store low half of product
+ copy p1,%r28 ; return high half of product
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/power/add_n.s b/rts/gmp/mpn/power/add_n.s
new file mode 100644
index 0000000000..0f9f48f1cc
--- /dev/null
+++ b/rts/gmp/mpn/power/add_n.s
@@ -0,0 +1,79 @@
+# IBM POWER __gmpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation,
+# Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+	.toc
+	.globl	__gmpn_add_n
+	.globl	.__gmpn_add_n
+	.csect	__gmpn_add_n[DS]
+__gmpn_add_n:
+	.long	.__gmpn_add_n, TOC[tc0], 0
+	.csect	.text[PR]
+	.align	2
+.__gmpn_add_n:
+	andil.	10,6,1		# odd or even number of limbs?
+	l	8,0(4)		# load least significant s1 limb
+	l	0,0(5)		# load least significant s2 limb
+	cal	3,-4(3)		# offset res_ptr, it's updated before it's used
+	sri	10,6,1		# count for unrolled loop
+	a	7,0,8		# add least significant limbs, set cy
+	mtctr	10		# copy count into CTR
+	beq	0,Leven		# branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs.  Add the first limbs separately.
+	cmpi	1,10,0		# is count for unrolled loop zero?
+	bc	4,6,L1		# bne cr1,L1 (misassembled by gas)
+	st	7,4(3)		# store single result limb
+	aze	3,10		# use the fact that r10 is zero...
+	br			# return
+
+# We added least significant limbs.  Now reload the next limbs to enter loop.
+L1:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	stu	7,4(3)		# store result limb
+	ae	7,0,8		# add limbs, set cy
+Leven:	lu	9,4(4)		# load s1 limb and update s1_ptr
+	lu	10,4(5)		# load s2 limb and update s2_ptr
+	bdz	Lend		# If done, skip loop
+
+Loop:	lu	8,4(4)		# load s1 limb and update s1_ptr
+	lu	0,4(5)		# load s2 limb and update s2_ptr
+	ae	11,9,10		# add previous limbs with cy, set cy
+	stu	7,4(3)		# store previous result limb
+	lu	9,4(4)		# load s1 limb and update s1_ptr
+	lu	10,4(5)		# load s2 limb and update s2_ptr
+	ae	7,0,8		# add previous limbs with cy, set cy
+	stu	11,4(3)		# store previous result limb
+	bdn	Loop		# decrement CTR and loop back
+
+Lend:	ae	11,9,10		# add limbs with cy, set cy
+	st	7,4(3)		# store 2:nd last result limb
+	st	11,8(3)		# store last result limb
+	lil	3,0		# load cy into ...
+	aze	3,3		# ... return value register
+	br			# return carry
diff --git a/rts/gmp/mpn/power/addmul_1.s b/rts/gmp/mpn/power/addmul_1.s
new file mode 100644
index 0000000000..8ecc651579
--- /dev/null
+++ b/rts/gmp/mpn/power/addmul_1.s
@@ -0,0 +1,122 @@
+# IBM POWER __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The POWER architecture has no unsigned 32x32->64 bit multiplication
+# instruction. To obtain that operation, we have to use the 32x32->64 signed
+# multiplication instruction, and add the appropriate compensation to the high
+# limb of the result. We add the multiplicand if the multiplier has its most
+# significant bit set, and we add the multiplier if the multiplicand has its
+# most significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit we
+# can branch in zero cycles, so that's how we perform the additions.
+
+	.toc
+	.globl	__gmpn_addmul_1
+	.globl	.__gmpn_addmul_1
+	.csect	__gmpn_addmul_1[DS]
+__gmpn_addmul_1:
+	.long	.__gmpn_addmul_1, TOC[tc0], 0
+	.csect	.text[PR]
+	.align	2
+.__gmpn_addmul_1:
+
+	cal	3,-4(3)		# offset res_ptr, it's updated before it's used
+	l	0,0(4)		# load first s1 limb
+	cmpi	0,6,0		# test sign of s2_limb
+	mtctr	5		# limb count into CTR
+	mul	9,0,6		# signed 32x32 multiply, high part
+	srai	7,0,31		# r7 = sign mask of s1 limb
+	and	7,7,6		# compensation: s2_limb if s1 limb < 0
+	mfmq	8		# low part of product
+	cax	9,9,7		# apply compensation to high limb
+	l	7,4(3)
+	a	8,8,7		# add res_limb
+	blt	Lneg		# separate loop for negative s2_limb
+Lpos:	bdz	Lend
+
+Lploop:	lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb
+	cmpi	0,0,0		# test sign of s1 limb
+	mul	10,0,6
+	mfmq	0
+	ae	8,0,9		# low limb + old_cy_limb + old cy
+	l	7,4(3)
+	aze	10,10		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Lp0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10
+	l	7,4(3)
+	aze	9,9
+	a	8,8,7
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+
+	b	Lend
+
+Lneg:	cax	9,9,0		# adjust high limb for negative s2_limb
+	bdz	Lend
+Lnloop:	lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb
+	cmpi	0,0,0		# test sign of s1 limb
+	mul	10,0,6
+	mfmq	7
+	ae	8,7,9
+	l	7,4(3)
+	ae	10,10,0		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Ln0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	7
+	ae	8,7,10
+	l	7,4(3)
+	ae	9,9,0		# propagate cy to new cy_limb
+	a	8,8,7		# add res_limb
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)		# move cy_limb into r9
+Lend:	st	8,4(3)		# store last result limb
+	aze	3,9		# return cy_limb + carry
+	br			# return
diff --git a/rts/gmp/mpn/power/lshift.s b/rts/gmp/mpn/power/lshift.s
new file mode 100644
index 0000000000..ab71fb7727
--- /dev/null
+++ b/rts/gmp/mpn/power/lshift.s
@@ -0,0 +1,56 @@
+# IBM POWER __gmpn_lshift --
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s_ptr r4
+# size r5
+# cnt r6
+
+	.toc
+	.globl	__gmpn_lshift
+	.globl	.__gmpn_lshift
+	.csect	__gmpn_lshift[DS]
+__gmpn_lshift:
+	.long	.__gmpn_lshift, TOC[tc0], 0
+	.csect	.text[PR]
+	.align	2
+.__gmpn_lshift:
+	sli	0,5,2		# r0 = size * 4 (byte count)
+	cax	9,3,0		# point res_ptr past last limb
+	cax	4,4,0		# point s_ptr past last limb
+	sfi	8,6,32		# r8 = 32 - cnt
+	mtctr	5		# put limb count in CTR loop register
+	lu	0,-4(4)		# read most significant limb
+	sre	3,0,8		# compute carry out limb, and init MQ register
+	bdz	Lend2		# if just one limb, skip loop
+	lu	0,-4(4)		# read 2:nd most significant limb
+	sreq	7,0,8		# compute most significant limb of result
+	bdz	Lend		# if just two limb, skip loop
+Loop:	lu	0,-4(4)		# load next lower limb
+	stu	7,-4(9)		# store previous result during read latency
+	sreq	7,0,8		# compute result limb
+	bdn	Loop		# loop back until CTR is zero
+Lend:	stu	7,-4(9)		# store 2:nd least significant limb
+Lend2:	sle	7,0,6		# compute least significant limb
+	st	7,-4(9)		# store it
+	br			# return (carry limb already in r3)
diff --git a/rts/gmp/mpn/power/mul_1.s b/rts/gmp/mpn/power/mul_1.s
new file mode 100644
index 0000000000..4e08ade583
--- /dev/null
+++ b/rts/gmp/mpn/power/mul_1.s
@@ -0,0 +1,109 @@
+# IBM POWER __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The POWER architecture has no unsigned 32x32->64 bit multiplication
+# instruction. To obtain that operation, we have to use the 32x32->64 signed
+# multiplication instruction, and add the appropriate compensation to the high
+# limb of the result. We add the multiplicand if the multiplier has its most
+# significant bit set, and we add the multiplier if the multiplicand has its
+# most significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit we
+# can branch in zero cycles, so that's how we perform the additions.
+
+	.toc
+	.globl	__gmpn_mul_1
+	.globl	.__gmpn_mul_1
+	.csect	__gmpn_mul_1[DS]
+__gmpn_mul_1:
+	.long	.__gmpn_mul_1, TOC[tc0], 0
+	.csect	.text[PR]
+	.align	2
+.__gmpn_mul_1:
+
+	cal	3,-4(3)		# offset res_ptr, it's updated before it's used
+	l	0,0(4)		# load first s1 limb
+	cmpi	0,6,0		# test sign of s2_limb
+	mtctr	5		# limb count into CTR
+	mul	9,0,6		# signed 32x32 multiply, high part
+	srai	7,0,31		# r7 = sign mask of s1 limb
+	and	7,7,6		# compensation: s2_limb if s1 limb < 0
+	mfmq	8		# low part of product
+	ai	0,0,0		# reset carry
+	cax	9,9,7		# apply compensation to high limb
+	blt	Lneg		# separate loop for negative s2_limb
+Lpos:	bdz	Lend
+Lploop:	lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb
+	cmpi	0,0,0		# test sign of s1 limb
+	mul	10,0,6
+	mfmq	0
+	ae	8,0,9		# low limb + cy_limb + cy
+	bge	Lp0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10
+	bge	Lp1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+	b	Lend
+
+Lneg:	cax	9,9,0		# adjust high limb for negative s2_limb
+	bdz	Lend
+Lnloop:	lu	0,4(4)		# load next s1 limb and update s1_ptr
+	stu	8,4(3)		# store previous result limb
+	cmpi	0,0,0		# test sign of s1 limb
+	mul	10,0,6
+	cax	10,10,0		# adjust high limb for negative s2_limb
+	mfmq	0
+	ae	8,0,9		# low limb + cy_limb + cy
+	bge	Ln0
+	cax	10,10,6		# adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	cax	9,9,0		# adjust high limb for negative s2_limb
+	mfmq	0
+	ae	8,0,10
+	bge	Ln1
+	cax	9,9,6		# adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)		# move cy_limb into r9
+Lend:	st	8,4(3)		# store last result limb
+	aze	3,9		# return cy_limb + carry
+	br			# return
diff --git a/rts/gmp/mpn/power/rshift.s b/rts/gmp/mpn/power/rshift.s
new file mode 100644
index 0000000000..65b3945f8a
--- /dev/null
+++ b/rts/gmp/mpn/power/rshift.s
@@ -0,0 +1,54 @@
+# IBM POWER __gmpn_rshift --
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s_ptr r4
+# size r5
+# cnt r6
+
+ .toc
+ .globl __gmpn_rshift
+ .globl .__gmpn_rshift
+ .csect __gmpn_rshift[DS]
+__gmpn_rshift:
+ .long .__gmpn_rshift, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_rshift:
+ sfi 8,6,32
+ mtctr 5 # put limb count in CTR loop register
+ l 0,0(4) # read least significant limb
+ ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s
+ sle 3,0,8 # compute carry limb, and init MQ register
+ bdz Lend2 # if just one limb, skip loop
+ lu 0,4(4) # read 2:nd least significant limb
+ sleq 7,0,8 # compute least significant limb of result
+ bdz Lend # if just two limb, skip loop
+Loop: lu 0,4(4) # load next higher limb
+ stu 7,4(9) # store previous result during read latency
+ sleq 7,0,8 # compute result limb
+ bdn Loop # loop back until CTR is zero
+Lend: stu 7,4(9) # store 2:nd most significant limb
+Lend2: sre 7,0,6 # compute most significant limb
+	st	7,4(9)		# store it
+ br
diff --git a/rts/gmp/mpn/power/sdiv.s b/rts/gmp/mpn/power/sdiv.s
new file mode 100644
index 0000000000..81da622fbc
--- /dev/null
+++ b/rts/gmp/mpn/power/sdiv.s
@@ -0,0 +1,34 @@
+# Copyright (C) 1999 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+ .toc
+ .globl __sdiv_qrnnd
+ .globl .__sdiv_qrnnd
+ .csect __sdiv_qrnnd[DS]
+__sdiv_qrnnd:
+ .long .__sdiv_qrnnd, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__sdiv_qrnnd:
+ mtmq 5
+ div 0,4,6
+ mfmq 9
+ st 9,0(3)
+ mr 3,0
+ br
diff --git a/rts/gmp/mpn/power/sub_n.s b/rts/gmp/mpn/power/sub_n.s
new file mode 100644
index 0000000000..aa09cf5bc1
--- /dev/null
+++ b/rts/gmp/mpn/power/sub_n.s
@@ -0,0 +1,80 @@
+# IBM POWER __gmpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
+
+# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation,
+# Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+ .toc
+ .globl __gmpn_sub_n
+ .globl .__gmpn_sub_n
+ .csect __gmpn_sub_n[DS]
+__gmpn_sub_n:
+ .long .__gmpn_sub_n, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_sub_n:
+ andil. 10,6,1 # odd or even number of limbs?
+ l 8,0(4) # load least significant s1 limb
+ l 0,0(5) # load least significant s2 limb
+ cal 3,-4(3) # offset res_ptr, it's updated before it's used
+ sri 10,6,1 # count for unrolled loop
+ sf 7,0,8 # subtract least significant limbs, set cy
+ mtctr 10 # copy count into CTR
+ beq 0,Leven # branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs. Add the first limbs separately.
+ cmpi 1,10,0 # is count for unrolled loop zero?
+ bc 4,6,L1 # bne cr1,L1 (misassembled by gas)
+ st 7,4(3)
+ sfe 3,0,0 # load !cy into ...
+ sfi 3,3,0 # ... return value register
+ br # return
+
+# We added least significant limbs. Now reload the next limbs to enter loop.
+L1: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ stu 7,4(3)
+ sfe 7,0,8 # subtract limbs, set cy
+Leven: lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ bdz Lend # If done, skip loop
+
+Loop: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ sfe 11,10,9 # subtract previous limbs with cy, set cy
+ stu 7,4(3) #
+ lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ sfe 7,0,8 # subtract previous limbs with cy, set cy
+ stu 11,4(3) #
+ bdn Loop # decrement CTR and loop back
+
+Lend: sfe 11,10,9 # subtract limbs with cy, set cy
+ st 7,4(3) #
+ st 11,8(3) #
+ sfe 3,0,0 # load !cy into ...
+ sfi 3,3,0 # ... return value register
+ br
diff --git a/rts/gmp/mpn/power/submul_1.s b/rts/gmp/mpn/power/submul_1.s
new file mode 100644
index 0000000000..bc01b7c95d
--- /dev/null
+++ b/rts/gmp/mpn/power/submul_1.s
@@ -0,0 +1,127 @@
+# IBM POWER __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The POWER architecture has no unsigned 32x32->64 bit multiplication
+# instruction. To obtain that operation, we have to use the 32x32->64 signed
+# multiplication instruction, and add the appropriate compensation to the high
+# limb of the result. We add the multiplicand if the multiplier has its most
+# significant bit set, and we add the multiplier if the multiplicand has its
+# most significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit we
+# can branch in zero cycles, so that's how we perform the additions.
+
+ .toc
+ .globl __gmpn_submul_1
+ .globl .__gmpn_submul_1
+ .csect __gmpn_submul_1[DS]
+__gmpn_submul_1:
+ .long .__gmpn_submul_1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_submul_1:
+
+ cal 3,-4(3)
+ l 0,0(4)
+ cmpi 0,6,0
+ mtctr 5
+ mul 9,0,6
+ srai 7,0,31
+ and 7,7,6
+ mfmq 11
+ cax 9,9,7
+ l 7,4(3)
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ blt Lneg
+Lpos: bdz Lend
+
+Lploop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 0
+ ae 11,0,9 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ aze 10,10 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 11,0,10
+ l 7,4(3)
+ aze 9,9
+ sf 8,11,7
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+
+ b Lend
+
+Lneg: cax 9,9,0
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 7
+ ae 11,7,9
+ l 7,4(3)
+ ae 10,10,0 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 7
+ ae 11,7,10
+ l 7,4(3)
+ ae 9,9,0 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10)
+Lend: st 8,4(3)
+ aze 3,9
+ br
diff --git a/rts/gmp/mpn/power/umul.s b/rts/gmp/mpn/power/umul.s
new file mode 100644
index 0000000000..8c77496380
--- /dev/null
+++ b/rts/gmp/mpn/power/umul.s
@@ -0,0 +1,38 @@
+# Copyright (C) 1999 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+ .toc
+ .globl __umul_ppmm
+ .globl .__umul_ppmm
+ .csect __umul_ppmm[DS]
+__umul_ppmm:
+ .long .__umul_ppmm, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__umul_ppmm:
+ mul 9,4,5
+ srai 0,4,31
+ and 0,0,5
+ srai 5,5,31
+ and 5,5,4
+ cax 0,0,5
+ mfmq 11
+ st 11,0(3)
+ cax 3,9,0
+ br
diff --git a/rts/gmp/mpn/powerpc32/add_n.asm b/rts/gmp/mpn/powerpc32/add_n.asm
new file mode 100644
index 0000000000..81ed04b162
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/add_n.asm
@@ -0,0 +1,61 @@
+dnl PowerPC-32 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl s2_ptr r5
+dnl size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ mtctr r6 C copy size into CTR
+ addic r0,r0,0 C clear cy
+ lwz r8,0(r4) C load least significant s1 limb
+ lwz r0,0(r5) C load least significant s2 limb
+ addi r3,r3,-4 C offset res_ptr, it's updated before it's used
+ bdz .Lend C If done, skip loop
+.Loop: lwz r9,4(r4) C load s1 limb
+ lwz r10,4(r5) C load s2 limb
+ adde r7,r0,r8 C add limbs with cy, set cy
+ stw r7,4(r3) C store result limb
+ bdz .Lexit C decrement CTR and exit if done
+ lwzu r8,8(r4) C load s1 limb and update s1_ptr
+ lwzu r0,8(r5) C load s2 limb and update s2_ptr
+ adde r7,r10,r9 C add limbs with cy, set cy
+ stwu r7,8(r3) C store result limb and update res_ptr
+ bdnz .Loop C decrement CTR and loop back
+
+.Lend: adde r7,r0,r8
+ stw r7,4(r3) C store ultimate result limb
+ li r3,0 C load cy into ...
+ addze r3,r3 C ... return value register
+ blr
+.Lexit: adde r7,r10,r9
+ stw r7,8(r3)
+ li r3,0 C load cy into ...
+ addze r3,r3 C ... return value register
+ blr
+EPILOGUE(mpn_add_n)
diff --git a/rts/gmp/mpn/powerpc32/addmul_1.asm b/rts/gmp/mpn/powerpc32/addmul_1.asm
new file mode 100644
index 0000000000..3ef75b1532
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/addmul_1.asm
@@ -0,0 +1,124 @@
+dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl s2_limb r6
+
+dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603
+dnl or PPC750 since I don't have access to any such machines.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ cmpi cr0,r5,9 C more than 9 limbs?
+ bgt cr0,.Lbig C branch if more than 9 limbs
+
+ mtctr r5
+ lwz r0,0(r4)
+ mullw r7,r0,r6
+ mulhwu r10,r0,r6
+ lwz r9,0(r3)
+ addc r8,r7,r9
+ addi r3,r3,-4
+ bdz .Lend
+.Lloop:
+ lwzu r0,4(r4)
+ stwu r8,4(r3)
+ mullw r8,r0,r6
+ adde r7,r8,r10
+ mulhwu r10,r0,r6
+ lwz r9,4(r3)
+ addze r10,r10
+ addc r8,r7,r9
+ bdnz .Lloop
+.Lend: stw r8,4(r3)
+ addze r3,r10
+ blr
+
+.Lbig: stmw r30,-32(r1)
+ addi r5,r5,-1
+ srwi r0,r5,2
+ mtctr r0
+
+ lwz r7,0(r4)
+ mullw r8,r7,r6
+ mulhwu r0,r7,r6
+ lwz r7,0(r3)
+ addc r8,r8,r7
+ stw r8,0(r3)
+
+.LloopU:
+ lwz r7,4(r4)
+ lwz r12,8(r4)
+ lwz r30,12(r4)
+ lwzu r31,16(r4)
+ mullw r8,r7,r6
+ mullw r9,r12,r6
+ mullw r10,r30,r6
+ mullw r11,r31,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ adde r9,r9,r0
+ mulhwu r0,r12,r6
+ lwz r12,8(r3)
+ adde r10,r10,r0
+ mulhwu r0,r30,r6
+ lwz r30,12(r3)
+ adde r11,r11,r0
+ mulhwu r0,r31,r6
+ lwz r31,16(r3)
+ addze r0,r0 C new cy_limb
+ addc r8,r8,r7
+ stw r8,4(r3)
+ adde r9,r9,r12
+ stw r9,8(r3)
+ adde r10,r10,r30
+ stw r10,12(r3)
+ adde r11,r11,r31
+ stwu r11,16(r3)
+ bdnz .LloopU
+
+ andi. r31,r5,3
+ mtctr r31
+ beq cr0,.Lendx
+
+.LloopE:
+ lwzu r7,4(r4)
+ mullw r8,r7,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ addze r0,r0 C new cy_limb
+ addc r8,r8,r7
+ stwu r8,4(r3)
+ bdnz .LloopE
+.Lendx:
+ addze r3,r0
+ lmw r30,-32(r1)
+ blr
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/powerpc32/aix.m4 b/rts/gmp/mpn/powerpc32/aix.m4
new file mode 100644
index 0000000000..2bd8425817
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/aix.m4
@@ -0,0 +1,39 @@
+divert(-1)
+dnl m4 macros for AIX 32-bit assembly.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+define(`ASM_START',
+ `.toc')
+
+define(`PROLOGUE',
+ `
+ .globl $1
+ .globl .$1
+ .csect $1[DS],2
+$1:
+ .long .$1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.$1:')
+
+define(`EPILOGUE', `')
+
+divert
diff --git a/rts/gmp/mpn/powerpc32/gmp-mparam.h b/rts/gmp/mpn/powerpc32/gmp-mparam.h
new file mode 100644
index 0000000000..b283185789
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/gmp-mparam.h
@@ -0,0 +1,66 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the 604. Presumably, these should be considerably
+ different for the 603 and 750 that have much slower multiply
+ instructions. */
+
+/* Generated by tuneup.c, 2000-05-26. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 26 /* tuneup says 20 */
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 228
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 46 /* tuneup says 44 */
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 262
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 52
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 86
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 23
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 7
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 53
+#endif
diff --git a/rts/gmp/mpn/powerpc32/lshift.asm b/rts/gmp/mpn/powerpc32/lshift.asm
new file mode 100644
index 0000000000..73a85430ab
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/lshift.asm
@@ -0,0 +1,145 @@
+dnl PowerPC-32 mpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1995, 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ cmpi cr0,r5,12 C more than 12 limbs?
+ slwi r0,r5,2
+ add r4,r4,r0 C make r4 point at end of s1
+ add r7,r3,r0 C make r7 point at end of res
+ bgt .LBIG C branch if more than 12 limbs
+
+ mtctr r5 C copy size into CTR
+ subfic r8,r6,32
+ lwzu r11,-4(r4) C load first s1 limb
+ srw r3,r11,r8 C compute function return value
+ bdz .Lend1
+
+.Loop: lwzu r10,-4(r4)
+ slw r9,r11,r6
+ srw r12,r10,r8
+ or r9,r9,r12
+ stwu r9,-4(r7)
+ bdz .Lend2
+ lwzu r11,-4(r4)
+ slw r9,r10,r6
+ srw r12,r11,r8
+ or r9,r9,r12
+ stwu r9,-4(r7)
+ bdnz .Loop
+
+.Lend1: slw r0,r11,r6
+ stw r0,-4(r7)
+ blr
+.Lend2: slw r0,r10,r6
+ stw r0,-4(r7)
+ blr
+
+.LBIG:
+ stmw r24,-32(r1) C save registers we are supposed to preserve
+ lwzu r9,-4(r4)
+ subfic r8,r6,32
+ srw r3,r9,r8 C compute function return value
+ slw r0,r9,r6
+ addi r5,r5,-1
+
+ andi. r10,r5,3 C count for spill loop
+ beq .Le
+ mtctr r10
+ lwzu r28,-4(r4)
+ bdz .Lxe0
+
+.Loop0: slw r12,r28,r6
+ srw r24,r28,r8
+ lwzu r28,-4(r4)
+ or r24,r0,r24
+ stwu r24,-4(r7)
+ mr r0,r12
+ bdnz .Loop0 C taken at most once!
+
+.Lxe0: slw r12,r28,r6
+ srw r24,r28,r8
+ or r24,r0,r24
+ stwu r24,-4(r7)
+ mr r0,r12
+
+.Le: srwi r5,r5,2 C count for unrolled loop
+ addi r5,r5,-1
+ mtctr r5
+ lwz r28,-4(r4)
+ lwz r29,-8(r4)
+ lwz r30,-12(r4)
+ lwzu r31,-16(r4)
+
+.LoopU: slw r9,r28,r6
+ srw r24,r28,r8
+ lwz r28,-4(r4)
+ slw r10,r29,r6
+ srw r25,r29,r8
+ lwz r29,-8(r4)
+ slw r11,r30,r6
+ srw r26,r30,r8
+ lwz r30,-12(r4)
+ slw r12,r31,r6
+ srw r27,r31,r8
+ lwzu r31,-16(r4)
+ or r24,r0,r24
+ stw r24,-4(r7)
+ or r25,r9,r25
+ stw r25,-8(r7)
+ or r26,r10,r26
+ stw r26,-12(r7)
+ or r27,r11,r27
+ stwu r27,-16(r7)
+ mr r0,r12
+ bdnz .LoopU
+
+ slw r9,r28,r6
+ srw r24,r28,r8
+ slw r10,r29,r6
+ srw r25,r29,r8
+ slw r11,r30,r6
+ srw r26,r30,r8
+ slw r12,r31,r6
+ srw r27,r31,r8
+ or r24,r0,r24
+ stw r24,-4(r7)
+ or r25,r9,r25
+ stw r25,-8(r7)
+ or r26,r10,r26
+ stw r26,-12(r7)
+ or r27,r11,r27
+ stwu r27,-16(r7)
+ mr r0,r12
+
+ stw r0,-4(r7)
+ lmw r24,-32(r1) C restore registers
+ blr
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/powerpc32/mul_1.asm b/rts/gmp/mpn/powerpc32/mul_1.asm
new file mode 100644
index 0000000000..ec878b54d5
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/mul_1.asm
@@ -0,0 +1,86 @@
+dnl PowerPC-32 mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl s2_limb r6
+
+dnl This is optimized for the PPC604 but it runs decently even on PPC601. It
+dnl has not been tested on a PPC603 since I don't have access to any such
+dnl machines.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ mtctr r5
+ addi r3,r3,-4 C adjust res_ptr, it's offset before it's used
+ li r12,0 C clear upper product reg
+ addic r0,r0,0 C clear cy
+C Start software pipeline
+ lwz r8,0(r4)
+ bdz .Lend3
+ stmw r30,-8(r1) C save registers we are supposed to preserve
+ lwzu r9,4(r4)
+ mullw r11,r8,r6
+ mulhwu r0,r8,r6
+ bdz .Lend1
+C Software pipelined main loop
+.Loop: lwz r8,4(r4)
+ mullw r10,r9,r6
+ adde r30,r11,r12
+ mulhwu r12,r9,r6
+ stw r30,4(r3)
+ bdz .Lend2
+ lwzu r9,8(r4)
+ mullw r11,r8,r6
+ adde r31,r10,r0
+ mulhwu r0,r8,r6
+ stwu r31,8(r3)
+ bdnz .Loop
+C Finish software pipeline
+.Lend1: mullw r10,r9,r6
+ adde r30,r11,r12
+ mulhwu r12,r9,r6
+ stw r30,4(r3)
+ adde r31,r10,r0
+ stwu r31,8(r3)
+ addze r3,r12
+ lmw r30,-8(r1) C restore registers from stack
+ blr
+.Lend2: mullw r11,r8,r6
+ adde r31,r10,r0
+ mulhwu r0,r8,r6
+ stwu r31,8(r3)
+ adde r30,r11,r12
+ stw r30,4(r3)
+ addze r3,r0
+ lmw r30,-8(r1) C restore registers from stack
+ blr
+.Lend3: mullw r11,r8,r6
+ stw r11,4(r3)
+ mulhwu r3,r8,r6
+ blr
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/powerpc32/regmap.m4 b/rts/gmp/mpn/powerpc32/regmap.m4
new file mode 100644
index 0000000000..978f18902a
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/regmap.m4
@@ -0,0 +1,34 @@
+divert(-1)
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Map register names r0, r1, etc, to just `0', `1', etc.
+dnl This is needed on all systems but NeXT, Rhapsody, and MacOS-X
+forloop(i,0,31,
+`define(`r'i,i)'
+)
+
+dnl Likewise for cr0, cr1, etc.
+forloop(i,0,7,
+`define(`cr'i,i)'
+)
+
+divert
diff --git a/rts/gmp/mpn/powerpc32/rshift.asm b/rts/gmp/mpn/powerpc32/rshift.asm
new file mode 100644
index 0000000000..a09ba04938
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/rshift.asm
@@ -0,0 +1,60 @@
+dnl PowerPC-32 mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ mtctr r5 C copy size into CTR
+ addi r7,r3,-4 C move adjusted res_ptr to free return reg
+ subfic r8,r6,32
+ lwz r11,0(r4) C load first s1 limb
+ slw r3,r11,r8 C compute function return value
+ bdz .Lend1
+
+.Loop: lwzu r10,4(r4)
+ srw r9,r11,r6
+ slw r12,r10,r8
+ or r9,r9,r12
+ stwu r9,4(r7)
+ bdz .Lend2
+ lwzu r11,4(r4)
+ srw r9,r10,r6
+ slw r12,r11,r8
+ or r9,r9,r12
+ stwu r9,4(r7)
+ bdnz .Loop
+
+.Lend1: srw r0,r11,r6
+ stw r0,4(r7)
+ blr
+
+.Lend2: srw r0,r10,r6
+ stw r0,4(r7)
+ blr
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/powerpc32/sub_n.asm b/rts/gmp/mpn/powerpc32/sub_n.asm
new file mode 100644
index 0000000000..b04b4192ef
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/sub_n.asm
@@ -0,0 +1,61 @@
+dnl PowerPC-32 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl s2_ptr r5
+dnl size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ mtctr r6 C copy size into CTR
+ addic r0,r6,-1 C set cy
+ lwz r8,0(r4) C load least significant s1 limb
+ lwz r0,0(r5) C load least significant s2 limb
+ addi r3,r3,-4 C offset res_ptr, it's updated before it's used
+ bdz .Lend C If done, skip loop
+.Loop: lwz r9,4(r4) C load s1 limb
+ lwz r10,4(r5) C load s2 limb
+ subfe r7,r0,r8 C subtract limbs with cy, set cy
+ stw r7,4(r3) C store result limb
+ bdz .Lexit C decrement CTR and exit if done
+ lwzu r8,8(r4) C load s1 limb and update s1_ptr
+ lwzu r0,8(r5) C load s2 limb and update s2_ptr
+ subfe r7,r10,r9 C subtract limbs with cy, set cy
+ stwu r7,8(r3) C store result limb and update res_ptr
+ bdnz .Loop C decrement CTR and loop back
+
+.Lend: subfe r7,r0,r8
+ stw r7,4(r3) C store ultimate result limb
+ subfe r3,r0,r0 C load !cy into ...
+ subfic r3,r3,0 C ... return value register
+ blr
+.Lexit: subfe r7,r10,r9
+ stw r7,8(r3)
+ subfe r3,r0,r0 C load !cy into ...
+ subfic r3,r3,0 C ... return value register
+ blr
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/powerpc32/submul_1.asm b/rts/gmp/mpn/powerpc32/submul_1.asm
new file mode 100644
index 0000000000..a129e9f9ea
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/submul_1.asm
@@ -0,0 +1,130 @@
+dnl PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl the result from a second limb vector.
+
+dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl s2_limb r6
+
+dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603
+dnl or PPC750 since I don't have access to any such machines.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ cmpi cr0,r5,9 C more than 9 limbs?
+ bgt cr0,.Lbig C branch if more than 9 limbs
+
+ mtctr r5
+ lwz r0,0(r4)
+ mullw r7,r0,r6
+ mulhwu r10,r0,r6
+ lwz r9,0(r3)
+ subfc r8,r7,r9
+ addc r7,r7,r8 C invert cy (r7 is junk)
+ addi r3,r3,-4
+ bdz .Lend
+.Lloop:
+ lwzu r0,4(r4)
+ stwu r8,4(r3)
+ mullw r8,r0,r6
+ adde r7,r8,r10
+ mulhwu r10,r0,r6
+ lwz r9,4(r3)
+ addze r10,r10
+ subfc r8,r7,r9
+ addc r7,r7,r8 C invert cy (r7 is junk)
+ bdnz .Lloop
+.Lend: stw r8,4(r3)
+ addze r3,r10
+ blr
+
+.Lbig: stmw r30,-32(r1)
+ addi r5,r5,-1
+ srwi r0,r5,2
+ mtctr r0
+
+ lwz r7,0(r4)
+ mullw r8,r7,r6
+ mulhwu r0,r7,r6
+ lwz r7,0(r3)
+ subfc r7,r8,r7
+ addc r8,r8,r7
+ stw r7,0(r3)
+
+.LloopU:
+ lwz r7,4(r4)
+ lwz r12,8(r4)
+ lwz r30,12(r4)
+ lwzu r31,16(r4)
+ mullw r8,r7,r6
+ mullw r9,r12,r6
+ mullw r10,r30,r6
+ mullw r11,r31,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ adde r9,r9,r0
+ mulhwu r0,r12,r6
+ lwz r12,8(r3)
+ adde r10,r10,r0
+ mulhwu r0,r30,r6
+ lwz r30,12(r3)
+ adde r11,r11,r0
+ mulhwu r0,r31,r6
+ lwz r31,16(r3)
+ addze r0,r0 C new cy_limb
+ subfc r7,r8,r7
+ stw r7,4(r3)
+ subfe r12,r9,r12
+ stw r12,8(r3)
+ subfe r30,r10,r30
+ stw r30,12(r3)
+ subfe r31,r11,r31
+ stwu r31,16(r3)
+ subfe r11,r11,r11 C invert ...
+ addic r11,r11,1 C ... carry
+ bdnz .LloopU
+
+ andi. r31,r5,3
+ mtctr r31
+ beq cr0,.Lendx
+
+.LloopE:
+ lwzu r7,4(r4)
+ mullw r8,r7,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ addze r0,r0 C new cy_limb
+ subfc r7,r8,r7
+ addc r8,r8,r7
+ stwu r7,4(r3)
+ bdnz .LloopE
+.Lendx:
+ addze r3,r0
+ lmw r30,-32(r1)
+ blr
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/powerpc32/umul.asm b/rts/gmp/mpn/powerpc32/umul.asm
new file mode 100644
index 0000000000..eeaa0a4dc8
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/umul.asm
@@ -0,0 +1,32 @@
+dnl PowerPC-32 umul_ppmm -- support for longlong.h
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+ mullw 0,4,5
+ mulhwu 9,4,5
+ stw 0,0(3)
+ mr 3,9
+ blr
+EPILOGUE(mpn_umul_ppmm)
diff --git a/rts/gmp/mpn/powerpc64/README b/rts/gmp/mpn/powerpc64/README
new file mode 100644
index 0000000000..c779276917
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/README
@@ -0,0 +1,36 @@
+PPC630 (aka Power3) pipeline information:
+
+Decoding is 4-way and issue is 8-way with some out-of-order capability.
+LS1 - ld/st unit 1
+LS2 - ld/st unit 2
+FXU1 - integer unit 1, handles any simple integer instructions
+FXU2 - integer unit 2, handles any simple integer instructions
+FXU3 - integer unit 3, handles integer multiply and divide
+FPU1 - floating-point unit 1
+FPU2 - floating-point unit 2
+
+Memory: Any two memory operations can issue, but memory subsystem
+ can sustain just one store per cycle.
+Simple integer: 2 operations (such as add, rl*)
+Integer multiply: 1 operation every 9th cycle worst case; exact timing depends
+ on 2nd operand most significant bit position (10 bits per
+ cycle). Multiply unit is not pipelined, only one multiply
+ operation in progress is allowed.
+Integer divide: ?
+Floating-point: Any plain 2 arithmetic instructions (such as fmul, fadd, fmadd)
+ Latency = 4.
+Floating-point divide:
+ ?
+Floating-point square root:
+ ?
+
+Best possible times for the main loops:
+shift: 1.5 cycles limited by integer unit contention.
+ With 63 special loops, one for each shift count, we could
+ reduce the needed integer instructions to 2, which would
+ reduce the best possible time to 1 cycle.
+add/sub: 1.5 cycles, limited by ld/st unit contention.
+mul: 18 cycles (average) unless floating-point operations are used,
+ but that would only help for multiplies of perhaps 10 and more
+ limbs.
+addmul/submul:Same situation as for mul.
diff --git a/rts/gmp/mpn/powerpc64/add_n.asm b/rts/gmp/mpn/powerpc64/add_n.asm
new file mode 100644
index 0000000000..c3325376dc
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/add_n.asm
@@ -0,0 +1,61 @@
+# PowerPC-64 mpn_add_n -- Add two limb vectors of the same length > 0 and
+# store sum in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ mtctr r6 # copy size into CTR
+ addic r0,r0,0 # clear cy
+ ld r8,0(r4) # load least significant s1 limb
+ ld r0,0(r5) # load least significant s2 limb
+ addi r3,r3,-8 # offset res_ptr, it's updated before it's used
+ bdz .Lend # If done, skip loop
+.Loop: ld r9,8(r4) # load s1 limb
+ ld r10,8(r5) # load s2 limb
+ adde r7,r0,r8 # add limbs with cy, set cy
+ std r7,8(r3) # store result limb
+ bdz .Lexit # decrement CTR and exit if done
+ ldu r8,16(r4) # load s1 limb and update s1_ptr
+ ldu r0,16(r5) # load s2 limb and update s2_ptr
+ adde r7,r10,r9 # add limbs with cy, set cy
+ stdu r7,16(r3) # store result limb and update res_ptr
+ bdnz .Loop # decrement CTR and loop back
+
+.Lend: adde r7,r0,r8
+ std r7,8(r3) # store ultimate result limb
+ li r3,0 # load cy into ...
+ addze r3,r3 # ... return value register
+ blr
+.Lexit: adde r7,r10,r9
+ std r7,16(r3)
+ li r3,0 # load cy into ...
+ addze r3,r3 # ... return value register
+ blr
+EPILOGUE(mpn_add_n)
diff --git a/rts/gmp/mpn/powerpc64/addmul_1.asm b/rts/gmp/mpn/powerpc64/addmul_1.asm
new file mode 100644
index 0000000000..81774482fe
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/addmul_1.asm
@@ -0,0 +1,52 @@
+# PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ mtctr 5
+ li 9,0 # cy_limb = 0
+ addic 0,0,0
+ cal 3,-8(3)
+ cal 4,-8(4)
+.Loop:
+ ldu 0,8(4)
+ ld 10,8(3)
+ mulld 7,0,6
+ adde 7,7,9
+ mulhdu 9,0,6
+ addze 9,9
+ addc 7,7,10
+ stdu 7,8(3)
+ bdnz .Loop
+
+ addze 3,9
+ blr
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/powerpc64/addsub_n.asm b/rts/gmp/mpn/powerpc64/addsub_n.asm
new file mode 100644
index 0000000000..4ed40d71ae
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/addsub_n.asm
@@ -0,0 +1,107 @@
+# PowerPC-64 mpn_addsub_n -- Simultaneous add and sub.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+include(`asm-syntax.m4')
+
+define(SAVE_BORROW_RESTORE_CARRY,
+ `sldi $1,$1,63
+ adde $1,$1,$1')
+define(SAVE_CARRY_RESTORE_BORROW,
+ `sldi $1,$1,63
+ adde $1,$1,$1')
+
+# 19991117
+
+# This is just crafted for testing some ideas, and verifying that we can make
+# it run fast. It runs at 2.55 cycles/limb on the 630, which is very good.
+# We should play a little with the schedule. No time has been spent on that.
+
+# To finish this, the loop warm up and cool down code needs to be written,
+# and the result need to be tested. Also, the proper calling sequence should
+# be used.
+
+# r1p r2p s1p s2p n
+# Use reg r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12
+
+ASM_START()
+PROLOGUE(mpn_addsub_n)
+ std r14,-64(1)
+ std r15,-56(1)
+ std r16,-48(1)
+ std r17,-40(1)
+ std r18,-32(1)
+ std r19,-24(1)
+
+ srdi r7,r7,2
+ mtctr r7 # copy size into CTR
+ addic r0,r0,0 # clear cy
+ addi r3,r3,-8 # offset res_ptr, it's updated before it's used
+ addi r4,r4,-8 # offset res_ptr, it's updated before it's used
+
+.Loop:
+ adde r12,r8,r9
+ std r12,8(r3)
+ adde r12,r10,r11
+ std r12,16(r3)
+
+ SAVE_CARRY_RESTORE_BORROW(r0)
+
+ subfe r12,r8,r9
+ std r12,8(r4)
+ ld r8,8(r5) # s1 L 1
+ ld r9,8(r6) # s2 L 1
+ subfe r12,r10,r11
+ std r12,16(r4)
+ ld r10,16(r5) # s1 L 2
+ ld r11,16(r6) # s2 L 2
+# pair -------------------------
+ subfe r12,r14,r15
+ std r12,24(r4)
+ subfe r12,r16,r17
+ stdu r12,32(r4)
+
+ SAVE_BORROW_RESTORE_CARRY(r0)
+
+ adde r12,r14,r15
+ std r12,24(r3)
+ ld r14,24(r5) # s1 L 3
+ ld r15,24(r6) # s2 L 3
+ adde r12,r16,r17
+ stdu r12,32(r3)
+ ldu r16,32(r5) # s1 L 4
+ ldu r17,32(r6) # s2 L 4
+ bdnz .Loop
+
+ ld r14,-64(1)
+ ld r15,-56(1)
+ ld r16,-48(1)
+ ld r17,-40(1)
+ ld r18,-32(1)
+ ld r19,-24(1)
+ blr
+EPILOGUE(mpn_addsub_n)
diff --git a/rts/gmp/mpn/powerpc64/aix.m4 b/rts/gmp/mpn/powerpc64/aix.m4
new file mode 100644
index 0000000000..aee9f1f97a
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/aix.m4
@@ -0,0 +1,40 @@
+divert(-1)
+dnl m4 macros for AIX 64-bit assembly.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+define(`ASM_START',
+ `.machine "ppc64"
+ .toc')
+
+define(`PROLOGUE',
+ `
+ .globl $1
+ .globl .$1
+ .csect $1[DS],3
+$1:
+ .llong .$1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.$1:')
+
+define(`EPILOGUE', `')
+
+divert
diff --git a/rts/gmp/mpn/powerpc64/copyd.asm b/rts/gmp/mpn/powerpc64/copyd.asm
new file mode 100644
index 0000000000..d06e8c25fd
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/copyd.asm
@@ -0,0 +1,45 @@
+# PowerPC-64 mpn_copyd -- Copy a limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr r3
+# sptr r4
+# n r5
+
+include(`../config.m4')
+
+# Unrolling this analogous to sparc64/copyi.s doesn't help for any
+# operand sizes.
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+ cmpdi cr0,r5,0
+ mtctr r5
+ sldi r5,r5,3
+ add r4,r4,r5
+ add r3,r3,r5
+ beq cr0,.Lend
+.Loop: ldu r0,-8(r4)
+ stdu r0,-8(r3)
+ bdnz .Loop
+.Lend: blr
+EPILOGUE(mpn_copyd)
diff --git a/rts/gmp/mpn/powerpc64/copyi.asm b/rts/gmp/mpn/powerpc64/copyi.asm
new file mode 100644
index 0000000000..a1bedc4c5b
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/copyi.asm
@@ -0,0 +1,44 @@
+# PowerPC-64 mpn_copyi -- Copy a limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr r3
+# sptr r4
+# n r5
+
+include(`../config.m4')
+
+# Unrolling this analogous to sparc64/copyi.s doesn't help for any
+# operand sizes.
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+ cmpdi cr0,r5,0
+ mtctr r5
+ addi r4,r4,-8
+ addi r3,r3,-8
+ beq cr0,.Lend
+.Loop: ldu r0,8(r4)
+ stdu r0,8(r3)
+ bdnz .Loop
+.Lend: blr
+EPILOGUE(mpn_copyi)
diff --git a/rts/gmp/mpn/powerpc64/gmp-mparam.h b/rts/gmp/mpn/powerpc64/gmp-mparam.h
new file mode 100644
index 0000000000..6fefb960cd
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/gmp-mparam.h
@@ -0,0 +1,62 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Generated by tuneup.c, 2000-07-16. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 10
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 57
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 16
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 89
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 28
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 216
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 14
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 6
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 163
+#endif
diff --git a/rts/gmp/mpn/powerpc64/lshift.asm b/rts/gmp/mpn/powerpc64/lshift.asm
new file mode 100644
index 0000000000..cef3a81fdd
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/lshift.asm
@@ -0,0 +1,159 @@
+# PowerPC-64 mpn_lshift -- Shift a number left.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ cmpdi cr0,r5,20 # more than 20 limbs?
+ sldi r0,r5,3
+ add r4,r4,r0 # make r4 point at end of s1
+ add r7,r3,r0 # make r7 point at end of res
+ bgt .LBIG # branch if more than 12 limbs
+
+ mtctr r5 # copy size into CTR
+ subfic r8,r6,64
+ ldu r11,-8(r4) # load first s1 limb
+ srd r3,r11,r8 # compute function return value
+ bdz .Lend1
+
+.Loop: ldu r10,-8(r4)
+ sld r9,r11,r6
+ srd r12,r10,r8
+ or r9,r9,r12
+ stdu r9,-8(r7)
+ bdz .Lend2
+ ldu r11,-8(r4)
+ sld r9,r10,r6
+ srd r12,r11,r8
+ or r9,r9,r12
+ stdu r9,-8(r7)
+ bdnz .Loop
+
+.Lend1: sld r0,r11,r6
+ std r0,-8(r7)
+ blr
+.Lend2: sld r0,r10,r6
+ std r0,-8(r7)
+ blr
+
+.LBIG:
+ std r24,-64(1)
+ std r25,-56(1)
+ std r26,-48(1)
+ std r27,-40(1)
+ std r28,-32(1)
+ std r29,-24(1)
+ std r30,-16(1)
+ std r31,-8(1)
+ ldu r9,-8(r4)
+ subfic r8,r6,64
+ srd r3,r9,r8 # compute function return value
+ sld r0,r9,r6
+ addi r5,r5,-1
+
+ andi. r10,r5,3 # count for spill loop
+ beq .Le
+ mtctr r10
+ ldu r28,-8(r4)
+ bdz .Lxe0
+
+.Loop0: sld r12,r28,r6
+ srd r24,r28,r8
+ ldu r28,-8(r4)
+ or r24,r0,r24
+ stdu r24,-8(r7)
+ mr r0,r12
+ bdnz .Loop0 # taken at most once!
+
+.Lxe0: sld r12,r28,r6
+ srd r24,r28,r8
+ or r24,r0,r24
+ stdu r24,-8(r7)
+ mr r0,r12
+
+.Le: srdi r5,r5,2 # count for unrolled loop
+ addi r5,r5,-1
+ mtctr r5
+ ld r28,-8(r4)
+ ld r29,-16(r4)
+ ld r30,-24(r4)
+ ldu r31,-32(r4)
+
+.LoopU: sld r9,r28,r6
+ srd r24,r28,r8
+ ld r28,-8(r4)
+ sld r10,r29,r6
+ srd r25,r29,r8
+ ld r29,-16(r4)
+ sld r11,r30,r6
+ srd r26,r30,r8
+ ld r30,-24(r4)
+ sld r12,r31,r6
+ srd r27,r31,r8
+ ldu r31,-32(r4)
+ or r24,r0,r24
+ std r24,-8(r7)
+ or r25,r9,r25
+ std r25,-16(r7)
+ or r26,r10,r26
+ std r26,-24(r7)
+ or r27,r11,r27
+ stdu r27,-32(r7)
+ mr r0,r12
+ bdnz .LoopU
+
+ sld r9,r28,r6
+ srd r24,r28,r8
+ sld r10,r29,r6
+ srd r25,r29,r8
+ sld r11,r30,r6
+ srd r26,r30,r8
+ sld r12,r31,r6
+ srd r27,r31,r8
+ or r24,r0,r24
+ std r24,-8(r7)
+ or r25,r9,r25
+ std r25,-16(r7)
+ or r26,r10,r26
+ std r26,-24(r7)
+ or r27,r11,r27
+ stdu r27,-32(r7)
+ mr r0,r12
+
+ std r0,-8(r7)
+ ld r24,-64(1)
+ ld r25,-56(1)
+ ld r26,-48(1)
+ ld r27,-40(1)
+ ld r28,-32(1)
+ ld r29,-24(1)
+ ld r30,-16(1)
+ ld r31,-8(1)
+ blr
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/powerpc64/mul_1.asm b/rts/gmp/mpn/powerpc64/mul_1.asm
new file mode 100644
index 0000000000..47597283ff
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/mul_1.asm
@@ -0,0 +1,49 @@
+# PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ mtctr 5
+ li 9,0 # cy_limb = 0
+ addic 0,0,0
+ cal 3,-8(3)
+ cal 4,-8(4)
+.Loop:
+ ldu 0,8(4)
+ mulld 7,0,6
+ adde 7,7,9
+ mulhdu 9,0,6
+ stdu 7,8(3)
+ bdnz .Loop
+
+ addze 3,9
+ blr
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/powerpc64/rshift.asm b/rts/gmp/mpn/powerpc64/rshift.asm
new file mode 100644
index 0000000000..88272c7fa9
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/rshift.asm
@@ -0,0 +1,60 @@
+# PowerPC-64 mpn_rshift -- Shift a number right.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ mtctr r5 # copy size into CTR
+ addi r7,r3,-8 # move adjusted res_ptr to free return reg
+ subfic r8,r6,64
+ ld r11,0(r4) # load first s1 limb
+ sld r3,r11,r8 # compute function return value
+ bdz .Lend1
+
+.Loop: ldu r10,8(r4)
+ srd r9,r11,r6
+ sld r12,r10,r8
+ or r9,r9,r12
+ stdu r9,8(r7)
+ bdz .Lend2
+ ldu r11,8(r4)
+ srd r9,r10,r6
+ sld r12,r11,r8
+ or r9,r9,r12
+ stdu r9,8(r7)
+ bdnz .Loop
+
+.Lend1: srd r0,r11,r6
+ std r0,8(r7)
+ blr
+
+.Lend2: srd r0,r10,r6
+ std r0,8(r7)
+ blr
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/powerpc64/sub_n.asm b/rts/gmp/mpn/powerpc64/sub_n.asm
new file mode 100644
index 0000000000..4de3de69c7
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/sub_n.asm
@@ -0,0 +1,61 @@
+# PowerPC-64 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+# and store difference in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.b
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ mtctr r6 # copy size into CTR
+ addic r0,r6,-1 # set cy
+ ld r8,0(r4) # load least significant s1 limb
+ ld r0,0(r5) # load least significant s2 limb
+ addi r3,r3,-8 # offset res_ptr, it's updated before it's used
+ bdz .Lend # If done, skip loop
+.Loop: ld r9,8(r4) # load s1 limb
+ ld r10,8(r5) # load s2 limb
+ subfe r7,r0,r8 # subtract limbs with cy, set cy
+ std r7,8(r3) # store result limb
+ bdz .Lexit # decrement CTR and exit if done
+ ldu r8,16(r4) # load s1 limb and update s1_ptr
+ ldu r0,16(r5) # load s2 limb and update s2_ptr
+ subfe r7,r10,r9 # subtract limbs with cy, set cy
+ stdu r7,16(r3) # store result limb and update res_ptr
+ bdnz .Loop # decrement CTR and loop back
+
+.Lend: subfe r7,r0,r8
+ std r7,8(r3) # store ultimate result limb
+ subfe r3,r0,r0 # load !cy into ...
+ subfic r3,r3,0 # ... return value register
+ blr
+.Lexit: subfe r7,r10,r9
+ std r7,16(r3)
+ subfe r3,r0,r0 # load !cy into ...
+ subfic r3,r3,0 # ... return value register
+ blr
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/powerpc64/submul_1.asm b/rts/gmp/mpn/powerpc64/submul_1.asm
new file mode 100644
index 0000000000..17f6369a38
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/submul_1.asm
@@ -0,0 +1,54 @@
+# PowerPC-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ mtctr 5
+ li 9,0 # cy_limb = 0
+ addic 0,0,0
+ cal 3,-8(3)
+ cal 4,-8(4)
+.Loop:
+ ldu 0,8(4)
+ ld 10,8(3)
+ mulld 7,0,6
+ adde 7,7,9
+ mulhdu 9,0,6
+ addze 9,9
+ subfc 7,7,10
+ stdu 7,8(3)
+ subfe 11,11,11 # invert ...
+ addic 11,11,1 # ... carry
+ bdnz .Loop
+
+ addze 3,9
+ blr
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/pyr/add_n.s b/rts/gmp/mpn/pyr/add_n.s
new file mode 100644
index 0000000000..e1fc535846
--- /dev/null
+++ b/rts/gmp/mpn/pyr/add_n.s
@@ -0,0 +1,76 @@
+# Pyramid __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 2
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ movw $-1,tr0 # representation for carry clear
+
+ movw pr3,tr2
+ andw $3,tr2
+ beq Lend0
+ subw tr2,pr3
+
+Loop0: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ addwc (pr2),tr1
+ movw tr1,(pr0)
+
+ subwb tr0,tr0
+ addw $4,pr0
+ addw $4,pr1
+ addw $4,pr2
+ addw $-1,tr2
+ bne Loop0
+
+ mtstw pr3,pr3
+ beq Lend
+Lend0:
+Loop: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ addwc (pr2),tr1
+ movw tr1,(pr0)
+
+ movw 4(pr1),tr1
+ addwc 4(pr2),tr1
+ movw tr1,4(pr0)
+
+ movw 8(pr1),tr1
+ addwc 8(pr2),tr1
+ movw tr1,8(pr0)
+
+ movw 12(pr1),tr1
+ addwc 12(pr2),tr1
+ movw tr1,12(pr0)
+
+ subwb tr0,tr0
+ addw $16,pr0
+ addw $16,pr1
+ addw $16,pr2
+ addw $-4,pr3
+ bne Loop
+Lend:
+ mnegw tr0,pr0
+ ret
diff --git a/rts/gmp/mpn/pyr/addmul_1.s b/rts/gmp/mpn/pyr/addmul_1.s
new file mode 100644
index 0000000000..65c3f8f008
--- /dev/null
+++ b/rts/gmp/mpn/pyr/addmul_1.s
@@ -0,0 +1,45 @@
+# Pyramid __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 2
+.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ mova (pr0)[pr2*4],pr0
+ mova (pr1)[pr2*4],pr1
+ mnegw pr2,pr2
+ movw $0,tr3
+
+Loop: movw (pr1)[pr2*4],tr1
+ uemul pr3,tr0
+ addw tr3,tr1
+ movw $0,tr3
+ addwc tr0,tr3
+ movw (pr0)[pr2*0x4],tr0
+ addw tr0,tr1
+ addwc $0,tr3
+ movw tr1,(pr0)[pr2*4]
+ addw $1,pr2
+ bne Loop
+
+ movw tr3,pr0
+ ret
diff --git a/rts/gmp/mpn/pyr/mul_1.s b/rts/gmp/mpn/pyr/mul_1.s
new file mode 100644
index 0000000000..1272297c42
--- /dev/null
+++ b/rts/gmp/mpn/pyr/mul_1.s
@@ -0,0 +1,42 @@
+# Pyramid __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 2
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ mova (pr0)[pr2*4],pr0
+ mova (pr1)[pr2*4],pr1
+ mnegw pr2,pr2
+ movw $0,tr3
+
+Loop: movw (pr1)[pr2*4],tr1
+ uemul pr3,tr0
+ addw tr3,tr1
+ movw $0,tr3
+ addwc tr0,tr3
+ movw tr1,(pr0)[pr2*4]
+ addw $1,pr2
+ bne Loop
+
+ movw tr3,pr0
+ ret
diff --git a/rts/gmp/mpn/pyr/sub_n.s b/rts/gmp/mpn/pyr/sub_n.s
new file mode 100644
index 0000000000..1fd2eb0f17
--- /dev/null
+++ b/rts/gmp/mpn/pyr/sub_n.s
@@ -0,0 +1,76 @@
+# Pyramid __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 2
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ movw $-1,tr0 # representation for carry clear
+
+ movw pr3,tr2
+ andw $3,tr2
+ beq Lend0
+ subw tr2,pr3
+
+Loop0: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ subwb (pr2),tr1
+ movw tr1,(pr0)
+
+ subwb tr0,tr0
+ addw $4,pr0
+ addw $4,pr1
+ addw $4,pr2
+ addw $-1,tr2
+ bne Loop0
+
+ mtstw pr3,pr3
+ beq Lend
+Lend0:
+Loop: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ subwb (pr2),tr1
+ movw tr1,(pr0)
+
+ movw 4(pr1),tr1
+ subwb 4(pr2),tr1
+ movw tr1,4(pr0)
+
+ movw 8(pr1),tr1
+ subwb 8(pr2),tr1
+ movw tr1,8(pr0)
+
+ movw 12(pr1),tr1
+ subwb 12(pr2),tr1
+ movw tr1,12(pr0)
+
+ subwb tr0,tr0
+ addw $16,pr0
+ addw $16,pr1
+ addw $16,pr2
+ addw $-4,pr3
+ bne Loop
+Lend:
+ mnegw tr0,pr0
+ ret
diff --git a/rts/gmp/mpn/sh/add_n.s b/rts/gmp/mpn/sh/add_n.s
new file mode 100644
index 0000000000..df388b31a3
--- /dev/null
+++ b/rts/gmp/mpn/sh/add_n.s
@@ -0,0 +1,47 @@
+! SH __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+! sum in a third limb vector.
+
+! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! s2_ptr r6
+! size r7
+
+ .text
+ .align 2
+ .global ___gmpn_add_n
+___gmpn_add_n:
+ mov #0,r3 ! clear cy save reg
+
+Loop: mov.l @r5+,r1
+ mov.l @r6+,r2
+ shlr r3 ! restore cy
+ addc r2,r1
+ movt r3 ! save cy
+ mov.l r1,@r4
+ dt r7
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r3,r0 ! return carry-out from most sign. limb
diff --git a/rts/gmp/mpn/sh/sh2/addmul_1.s b/rts/gmp/mpn/sh/sh2/addmul_1.s
new file mode 100644
index 0000000000..f34a7f0503
--- /dev/null
+++ b/rts/gmp/mpn/sh/sh2/addmul_1.s
@@ -0,0 +1,53 @@
+! SH2 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+! the result to a second limb vector.
+
+! Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_addmul_1
+___gmpn_addmul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt
+
+Loop: mov.l @r5+,r3
+ dmulu.l r3,r7
+ sts macl,r1
+ addc r2,r1 ! lo_prod += old cy_limb
+ sts mach,r2 ! new cy_limb = hi_prod
+ mov.l @r4,r3
+ addc r0,r2 ! cy_limb += T, T = 0
+ addc r3,r1
+ addc r0,r2 ! cy_limb += T, T = 0
+ dt r6
+ mov.l r1,@r4
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r2,r0
diff --git a/rts/gmp/mpn/sh/sh2/mul_1.s b/rts/gmp/mpn/sh/sh2/mul_1.s
new file mode 100644
index 0000000000..2a117a3175
--- /dev/null
+++ b/rts/gmp/mpn/sh/sh2/mul_1.s
@@ -0,0 +1,50 @@
+! SH2 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+! the result in a second limb vector.
+
+! Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_mul_1
+___gmpn_mul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt
+
+Loop: mov.l @r5+,r3
+ dmulu.l r3,r7
+ sts macl,r1
+ addc r2,r1
+ sts mach,r2
+ addc r0,r2 ! propagate carry to cy_limb (dt clobbers T)
+ dt r6
+ mov.l r1,@r4
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r2,r0
diff --git a/rts/gmp/mpn/sh/sh2/submul_1.s b/rts/gmp/mpn/sh/sh2/submul_1.s
new file mode 100644
index 0000000000..eb9a27dde3
--- /dev/null
+++ b/rts/gmp/mpn/sh/sh2/submul_1.s
@@ -0,0 +1,53 @@
+! SH2 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+! the result from a second limb vector.
+
+! Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_submul_1
+___gmpn_submul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt
+
+Loop: mov.l @r5+,r3
+ dmulu.l r3,r7
+ sts macl,r1
+ addc r2,r1 ! lo_prod += old cy_limb
+ sts mach,r2 ! new cy_limb = hi_prod
+ mov.l @r4,r3
+ addc r0,r2 ! cy_limb += T, T = 0
+ subc r3,r1
+ addc r0,r2 ! cy_limb += T, T = 0
+ dt r6
+ mov.l r1,@r4
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r2,r0
diff --git a/rts/gmp/mpn/sh/sub_n.s b/rts/gmp/mpn/sh/sub_n.s
new file mode 100644
index 0000000000..5f818c95a8
--- /dev/null
+++ b/rts/gmp/mpn/sh/sub_n.s
@@ -0,0 +1,47 @@
+! SH __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+! difference in a third limb vector.
+
+! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! s2_ptr r6
+! size r7
+
+ .text
+ .align 2
+ .global ___gmpn_sub_n
+___gmpn_sub_n:
+ mov #0,r3 ! clear cy save reg
+
+Loop: mov.l @r5+,r1
+ mov.l @r6+,r2
+ shlr r3 ! restore cy
+ subc r2,r1
+ movt r3 ! save cy
+ mov.l r1,@r4
+ dt r7
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r3,r0 ! return carry-out from most sign. limb
diff --git a/rts/gmp/mpn/sparc32/README b/rts/gmp/mpn/sparc32/README
new file mode 100644
index 0000000000..7c19df7bc4
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/README
@@ -0,0 +1,36 @@
+This directory contains mpn functions for various SPARC chips. Code that
+runs only on version 8 SPARC implementations is in the v8 subdirectory.
+
+RELEVANT OPTIMIZATION ISSUES
+
+ Load and Store timing
+
+On most early SPARC implementations, the ST instructions takes multiple
+cycles, while a STD takes just a single cycle more than an ST. For the CPUs
+in SPARCstation I and II, the times are 3 and 4 cycles, respectively.
+Therefore, combining two ST instrucitons into a STD when possible is a
+significant optimiation.
+
+Later SPARC implementations have single cycle ST.
+
+For SuperSPARC, we can perform just one memory instruction per cycle, even
+if up to two integer instructions can be executed in its pipeline. For
+programs that perform so many memory operations that there are not enough
+non-memory operations to issue in parallel with all memory operations, using
+LDD and STD when possible helps.
+
+STATUS
+
+1. On a SuperSPARC, mpn_lshift and mpn_rshift run at 3 cycles/limb, or 2.5
+ cycles/limb asymptotically. We could optimize speed for special counts
+ by using ADDXCC.
+
+2. On a SuperSPARC, mpn_add_n and mpn_sub_n runs at 2.5 cycles/limb, or 2
+ cycles/limb asymptotically.
+
+3. mpn_mul_1 runs at what is believed to be optimal speed.
+
+4. On SuperSPARC, mpn_addmul_1 and mpn_submul_1 could both be improved by a
+   cycle by avoiding one of the add instructions.  See a29k/addmul_1.
+
+The speed of the code for other SPARC implementations is uncertain.
diff --git a/rts/gmp/mpn/sparc32/add_n.asm b/rts/gmp/mpn/sparc32/add_n.asm
new file mode 100644
index 0000000000..5f1d00c0e0
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/add_n.asm
@@ -0,0 +1,236 @@
+dnl SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl sum in a third limb vector.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(res_ptr,%o0)
+define(s1_ptr,%o1)
+define(s2_ptr,%o2)
+define(n,%o3)
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ xor s2_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(1) C branch if alignment differs
+ nop
+C ** V1a **
+L(0): andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0
+ be L(v1) C if no, branch
+ nop
+C Add least significant limb separately to align res_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ addcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+L(v1): addx %g0,%g0,%o4 C save cy in register
+ cmp n,2 C if n < 2 ...
+ bl L(end2) C ... branch to tail code
+ subcc %g0,%o4,%g0 C restore cy
+
+ ld [s1_ptr+0],%g4
+ addcc n,-10,n
+ ld [s1_ptr+4],%g1
+ ldd [s2_ptr+0],%g2
+ blt L(fin1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop1):
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+16],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+20],%g1
+ ldd [s2_ptr+16],%g2
+ std %o4,[res_ptr+8]
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+24],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+28],%g1
+ ldd [s2_ptr+24],%g2
+ std %o4,[res_ptr+16]
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+32],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+36],%g1
+ ldd [s2_ptr+32],%g2
+ std %o4,[res_ptr+24]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop1)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin1):
+ addcc n,8-2,n
+ blt L(end1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 2 limbs until less than 2 limbs remain
+L(loope1):
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope1)
+ subcc %g0,%o4,%g0 C restore cy
+L(end1):
+ addxcc %g4,%g2,%o4
+ addxcc %g1,%g3,%o5
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+
+ andcc n,1,%g0
+ be L(ret1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+ ld [s1_ptr+8],%g4
+ ld [s2_ptr+8],%g2
+ addxcc %g4,%g2,%o4
+ st %o4,[res_ptr+8]
+
+L(ret1):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+
+L(1): xor s1_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(2)
+ nop
+C ** V1b **
+ mov s2_ptr,%g1
+ mov s1_ptr,s2_ptr
+ b L(0)
+ mov %g1,s1_ptr
+
+C ** V2 **
+C If we come here, the alignment of s1_ptr and res_ptr as well as the
+C alignment of s2_ptr and res_ptr differ. Since there are only two ways
+C things can be aligned (that we care about) we now know that the alignment
+C of s1_ptr and s2_ptr are the same.
+
+L(2): cmp n,1
+ be L(jone)
+ nop
+ andcc s1_ptr,4,%g0 C s1_ptr unaligned? Side effect: cy=0
+ be L(v2) C if no, branch
+ nop
+C Add least significant limb separately to align s1_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ addcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+
+L(v2): addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ blt L(fin2)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ ldd [s1_ptr+8],%g2
+ ldd [s2_ptr+8],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+8]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+12]
+ ldd [s1_ptr+16],%g2
+ ldd [s2_ptr+16],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+16]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+20]
+ ldd [s1_ptr+24],%g2
+ ldd [s2_ptr+24],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+24]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+28]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop2)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin2):
+ addcc n,8-2,n
+ blt L(end2)
+ subcc %g0,%o4,%g0 C restore cy
+L(loope2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope2)
+ subcc %g0,%o4,%g0 C restore cy
+L(end2):
+ andcc n,1,%g0
+ be L(ret2)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+L(jone):
+ ld [s1_ptr],%g4
+ ld [s2_ptr],%g2
+ addxcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+
+L(ret2):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+EPILOGUE(mpn_add_n)
diff --git a/rts/gmp/mpn/sparc32/addmul_1.asm b/rts/gmp/mpn/sparc32/addmul_1.asm
new file mode 100644
index 0000000000..80c94e4251
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/addmul_1.asm
@@ -0,0 +1,146 @@
+dnl SPARC mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl result to a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ C Make S1_PTR and RES_PTR point at the end of their blocks
+ C and put (- 4 x SIZE) in index/loop counter.
+ sll %o2,2,%o2
+ add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval
+ add %o1,%o2,%o1
+ sub %g0,%o2,%o2
+
+ cmp %o3,0xfff
+ bgu L(large)
+ nop
+
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ b L(0)
+ add %o4,-4,%o4
+L(loop0):
+ addcc %o5,%g1,%g1
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g1,[%o4+%o2]
+L(0): wr %g0,%o3,%y
+ sra %o5,31,%g2
+ and %o3,%g2,%g2
+ andcc %g1,0,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,0,%g1
+ sra %g1,20,%g4
+ sll %g1,12,%g1
+ rd %y,%g3
+ srl %g3,20,%g3
+ or %g1,%g3,%g1
+
+ addcc %g1,%o0,%g1
+ addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne L(loop0)
+ ld [%o4+%o2],%o5
+
+ addcc %o5,%g1,%g1
+ addx %o0,%g0,%o0
+ retl
+ st %g1,[%o4+%o2]
+
+L(large):
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0
+ b L(1)
+ add %o4,-4,%o4
+L(loop):
+ addcc %o5,%g3,%g3
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g3,[%o4+%o2]
+L(1): wr %g0,%o5,%y
+ and %o5,%g4,%g2
+ andcc %g0,%g0,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%g0,%g1
+ rd %y,%g3
+ addcc %g3,%o0,%g3
+ addx %g2,%g1,%o0
+ addcc %o2,4,%o2
+ bne L(loop)
+ ld [%o4+%o2],%o5
+
+ addcc %o5,%g3,%g3
+ addx %o0,%g0,%o0
+ retl
+ st %g3,[%o4+%o2]
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/sparc32/lshift.asm b/rts/gmp/mpn/sparc32/lshift.asm
new file mode 100644
index 0000000000..529733ac2d
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/lshift.asm
@@ -0,0 +1,97 @@
+dnl SPARC mpn_lshift -- Shift a number left.
+dnl
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr %o0
+C src_ptr %o1
+C size %o2
+C cnt %o3
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ sll %o2,2,%g1
+ add %o1,%g1,%o1 C make %o1 point at end of src
+ ld [%o1-4],%g2 C load first limb
+ sub %g0,%o3,%o5 C negate shift count
+ add %o0,%g1,%o0 C make %o0 point at end of res
+ add %o2,-1,%o2
+ andcc %o2,4-1,%g4 C number of limbs in first loop
+ srl %g2,%o5,%g1 C compute function result
+ be L(0) C if multiple of 4 limbs, skip first loop
+ st %g1,[%sp+80]
+
+ sub %o2,%g4,%o2 C adjust count for main loop
+
+L(loop0):
+ ld [%o1-8],%g3
+ add %o0,-4,%o0
+ add %o1,-4,%o1
+ addcc %g4,-1,%g4
+ sll %g2,%o3,%o4
+ srl %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ bne L(loop0)
+ st %o4,[%o0+0]
+
+L(0): tst %o2
+ be L(end)
+ nop
+
+L(loop):
+ ld [%o1-8],%g3
+ add %o0,-16,%o0
+ addcc %o2,-4,%o2
+ sll %g2,%o3,%o4
+ srl %g3,%o5,%g1
+
+ ld [%o1-12],%g2
+ sll %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0+12]
+ srl %g2,%o5,%g1
+
+ ld [%o1-16],%g3
+ sll %g2,%o3,%o4
+ or %g4,%g1,%g4
+ st %g4,[%o0+8]
+ srl %g3,%o5,%g1
+
+ ld [%o1-20],%g2
+ sll %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0+4]
+ srl %g2,%o5,%g1
+
+ add %o1,-16,%o1
+ or %g4,%g1,%g4
+ bne L(loop)
+ st %g4,[%o0+0]
+
+L(end): sll %g2,%o3,%g2
+ st %g2,[%o0-4]
+ retl
+ ld [%sp+80],%o0
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/sparc32/mul_1.asm b/rts/gmp/mpn/sparc32/mul_1.asm
new file mode 100644
index 0000000000..e5fedeabaa
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/mul_1.asm
@@ -0,0 +1,137 @@
+dnl SPARC mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ C Make S1_PTR and RES_PTR point at the end of their blocks
+ C and put (- 4 x SIZE) in index/loop counter.
+ sll %o2,2,%o2
+ add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval
+ add %o1,%o2,%o1
+ sub %g0,%o2,%o2
+
+ cmp %o3,0xfff
+ bgu L(large)
+ nop
+
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ b L(0)
+ add %o4,-4,%o4
+L(loop0):
+ st %g1,[%o4+%o2]
+L(0): wr %g0,%o3,%y
+ sra %o5,31,%g2
+ and %o3,%g2,%g2
+ andcc %g1,0,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,0,%g1
+ sra %g1,20,%g4
+ sll %g1,12,%g1
+ rd %y,%g3
+ srl %g3,20,%g3
+ or %g1,%g3,%g1
+
+ addcc %g1,%o0,%g1
+ addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne,a L(loop0)
+ ld [%o1+%o2],%o5
+
+ retl
+ st %g1,[%o4+%o2]
+
+
+L(large):
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0
+ b L(1)
+ add %o4,-4,%o4
+L(loop):
+ st %g3,[%o4+%o2]
+L(1): wr %g0,%o5,%y
+ and %o5,%g4,%g2 C g2 = S1_LIMB iff S2_LIMB < 0, else 0
+ andcc %g0,%g0,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%g0,%g1
+ rd %y,%g3
+ addcc %g3,%o0,%g3
+ addx %g2,%g1,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne,a L(loop)
+ ld [%o1+%o2],%o5
+
+ retl
+ st %g3,[%o4+%o2]
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/sparc32/rshift.asm b/rts/gmp/mpn/sparc32/rshift.asm
new file mode 100644
index 0000000000..9187dbaa6f
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/rshift.asm
@@ -0,0 +1,93 @@
+dnl SPARC mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr %o0
+C src_ptr %o1
+C size %o2
+C cnt %o3
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ld [%o1],%g2 C load first limb
+ sub %g0,%o3,%o5 C negate shift count
+ add %o2,-1,%o2
+ andcc %o2,4-1,%g4 C number of limbs in first loop
+ sll %g2,%o5,%g1 C compute function result
+ be L(0) C if multiple of 4 limbs, skip first loop
+ st %g1,[%sp+80]
+
+ sub %o2,%g4,%o2 C adjust count for main loop
+
+L(loop0):
+ ld [%o1+4],%g3
+ add %o0,4,%o0
+ add %o1,4,%o1
+ addcc %g4,-1,%g4
+ srl %g2,%o3,%o4
+ sll %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ bne L(loop0)
+ st %o4,[%o0-4]
+
+L(0): tst %o2
+ be L(end)
+ nop
+
+L(loop):
+ ld [%o1+4],%g3
+ add %o0,16,%o0
+ addcc %o2,-4,%o2
+ srl %g2,%o3,%o4
+ sll %g3,%o5,%g1
+
+ ld [%o1+8],%g2
+ srl %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0-16]
+ sll %g2,%o5,%g1
+
+ ld [%o1+12],%g3
+ srl %g2,%o3,%o4
+ or %g4,%g1,%g4
+ st %g4,[%o0-12]
+ sll %g3,%o5,%g1
+
+ ld [%o1+16],%g2
+ srl %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0-8]
+ sll %g2,%o5,%g1
+
+ add %o1,16,%o1
+ or %g4,%g1,%g4
+ bne L(loop)
+ st %g4,[%o0-4]
+
+L(end): srl %g2,%o3,%g2
+ st %g2,[%o0-0]
+ retl
+ ld [%sp+80],%o0
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/sparc32/sub_n.asm b/rts/gmp/mpn/sparc32/sub_n.asm
new file mode 100644
index 0000000000..071909a1b6
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/sub_n.asm
@@ -0,0 +1,326 @@
+dnl SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(res_ptr,%o0)
+define(s1_ptr,%o1)
+define(s2_ptr,%o2)
+define(n,%o3)
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ xor s2_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(1) C branch if alignment differs
+ nop
+C ** V1a **
+ andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0
+ be L(v1) C if no, branch
+ nop
+C Add least significant limb separately to align res_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ subcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+L(v1): addx %g0,%g0,%o4 C save cy in register
+ cmp n,2 C if n < 2 ...
+ bl L(end2) C ... branch to tail code
+ subcc %g0,%o4,%g0 C restore cy
+
+ ld [s1_ptr+0],%g4
+ addcc n,-10,n
+ ld [s1_ptr+4],%g1
+ ldd [s2_ptr+0],%g2
+ blt L(fin1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop1):
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+16],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+20],%g1
+ ldd [s2_ptr+16],%g2
+ std %o4,[res_ptr+8]
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+24],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+28],%g1
+ ldd [s2_ptr+24],%g2
+ std %o4,[res_ptr+16]
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+32],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+36],%g1
+ ldd [s2_ptr+32],%g2
+ std %o4,[res_ptr+24]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop1)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin1):
+ addcc n,8-2,n
+ blt L(end1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 2 limbs until less than 2 limbs remain
+L(loope1):
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope1)
+ subcc %g0,%o4,%g0 C restore cy
+L(end1):
+ subxcc %g4,%g2,%o4
+ subxcc %g1,%g3,%o5
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+
+ andcc n,1,%g0
+ be L(ret1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+ ld [s1_ptr+8],%g4
+ ld [s2_ptr+8],%g2
+ subxcc %g4,%g2,%o4
+ st %o4,[res_ptr+8]
+
+L(ret1):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+
+L(1): xor s1_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(2)
+ nop
+C ** V1b **
+ andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0
+ be L(v1b) C if no, branch
+ nop
+C Add least significant limb separately to align res_ptr and s1_ptr
+ ld [s2_ptr],%g4
+ add s2_ptr,4,s2_ptr
+ ld [s1_ptr],%g2
+ add s1_ptr,4,s1_ptr
+ add n,-1,n
+ subcc %g2,%g4,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+L(v1b): addx %g0,%g0,%o4 C save cy in register
+ cmp n,2 C if n < 2 ...
+ bl L(end2) C ... branch to tail code
+ subcc %g0,%o4,%g0 C restore cy
+
+ ld [s2_ptr+0],%g4
+ addcc n,-10,n
+ ld [s2_ptr+4],%g1
+ ldd [s1_ptr+0],%g2
+ blt L(fin1b)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop1b):
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+8],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+12],%g1
+ ldd [s1_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+16],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+20],%g1
+ ldd [s1_ptr+16],%g2
+ std %o4,[res_ptr+8]
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+24],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+28],%g1
+ ldd [s1_ptr+24],%g2
+ std %o4,[res_ptr+16]
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+32],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+36],%g1
+ ldd [s1_ptr+32],%g2
+ std %o4,[res_ptr+24]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop1b)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin1b):
+ addcc n,8-2,n
+ blt L(end1b)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 2 limbs until less than 2 limbs remain
+L(loope1b):
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+8],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+12],%g1
+ ldd [s1_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope1b)
+ subcc %g0,%o4,%g0 C restore cy
+L(end1b):
+ subxcc %g2,%g4,%o4
+ subxcc %g3,%g1,%o5
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+
+ andcc n,1,%g0
+ be L(ret1b)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+ ld [s2_ptr+8],%g4
+ ld [s1_ptr+8],%g2
+ subxcc %g2,%g4,%o4
+ st %o4,[res_ptr+8]
+
+L(ret1b):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+
+C ** V2 **
+C If we come here, the alignment of s1_ptr and res_ptr as well as the
+C alignment of s2_ptr and res_ptr differ. Since there are only two ways
+C things can be aligned (that we care about) we now know that the alignment
+C of s1_ptr and s2_ptr are the same.
+
+L(2): cmp n,1
+ be L(jone)
+ nop
+ andcc s1_ptr,4,%g0 C s1_ptr unaligned? Side effect: cy=0
+ be L(v2) C if no, branch
+ nop
+C Add least significant limb separately to align s1_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ subcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+
+L(v2): addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ blt L(fin2)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ ldd [s1_ptr+8],%g2
+ ldd [s2_ptr+8],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+8]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+12]
+ ldd [s1_ptr+16],%g2
+ ldd [s2_ptr+16],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+16]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+20]
+ ldd [s1_ptr+24],%g2
+ ldd [s2_ptr+24],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+24]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+28]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop2)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin2):
+ addcc n,8-2,n
+ blt L(end2)
+ subcc %g0,%o4,%g0 C restore cy
+L(loope2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope2)
+ subcc %g0,%o4,%g0 C restore cy
+L(end2):
+ andcc n,1,%g0
+ be L(ret2)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+L(jone):
+ ld [s1_ptr],%g4
+ ld [s2_ptr],%g2
+ subxcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+
+L(ret2):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/sparc32/submul_1.asm b/rts/gmp/mpn/sparc32/submul_1.asm
new file mode 100644
index 0000000000..12abd844ce
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/submul_1.asm
@@ -0,0 +1,146 @@
+dnl SPARC mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl the result from a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ C Make S1_PTR and RES_PTR point at the end of their blocks
+ C and put (- 4 x SIZE) in index/loop counter.
+ sll %o2,2,%o2
+ add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval
+ add %o1,%o2,%o1
+ sub %g0,%o2,%o2
+
+ cmp %o3,0xfff
+ bgu L(large)
+ nop
+
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ b L(0)
+ add %o4,-4,%o4
+L(loop0):
+ subcc %o5,%g1,%g1
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g1,[%o4+%o2]
+L(0): wr %g0,%o3,%y
+ sra %o5,31,%g2
+ and %o3,%g2,%g2
+ andcc %g1,0,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,0,%g1
+ sra %g1,20,%g4
+ sll %g1,12,%g1
+ rd %y,%g3
+ srl %g3,20,%g3
+ or %g1,%g3,%g1
+
+ addcc %g1,%o0,%g1
+ addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne L(loop0)
+ ld [%o4+%o2],%o5
+
+ subcc %o5,%g1,%g1
+ addx %o0,%g0,%o0
+ retl
+ st %g1,[%o4+%o2]
+
+L(large):
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0
+ b L(1)
+ add %o4,-4,%o4
+L(loop):
+ subcc %o5,%g3,%g3
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g3,[%o4+%o2]
+L(1): wr %g0,%o5,%y
+ and %o5,%g4,%g2
+ andcc %g0,%g0,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%g0,%g1
+ rd %y,%g3
+ addcc %g3,%o0,%g3
+ addx %g2,%g1,%o0
+ addcc %o2,4,%o2
+ bne L(loop)
+ ld [%o4+%o2],%o5
+
+ subcc %o5,%g3,%g3
+ addx %o0,%g0,%o0
+ retl
+ st %g3,[%o4+%o2]
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/sparc32/udiv_fp.asm b/rts/gmp/mpn/sparc32/udiv_fp.asm
new file mode 100644
index 0000000000..e340e147d2
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/udiv_fp.asm
@@ -0,0 +1,158 @@
+dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h.
+dnl This is for v7 CPUs with a floating-point unit.
+
+dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr i0
+C n1 i1
+C n0 i2
+C d i3
+
+ASM_START()
+
+ifdef(`PIC',
+` TEXT
+L(getpc):
+ retl
+ nop')
+
+ TEXT
+ ALIGN(8)
+L(C0): .double 0r4294967296
+L(C1): .double 0r2147483648
+
+PROLOGUE(mpn_udiv_qrnnd)
+ save %sp,-104,%sp
+ st %i1,[%fp-8]
+ ld [%fp-8],%f10
+
+ifdef(`PIC',
+`L(pc): call L(getpc) C put address of this insn in %o7
+ ldd [%o7+L(C0)-L(pc)],%f8',
+` sethi %hi(L(C0)),%o7
+ ldd [%o7+%lo(L(C0))],%f8')
+
+ fitod %f10,%f4
+ cmp %i1,0
+ bge L(248)
+ mov %i0,%i5
+ faddd %f4,%f8,%f4
+L(248):
+ st %i2,[%fp-8]
+ ld [%fp-8],%f10
+ fmuld %f4,%f8,%f6
+ cmp %i2,0
+ bge L(249)
+ fitod %f10,%f2
+ faddd %f2,%f8,%f2
+L(249):
+ st %i3,[%fp-8]
+ faddd %f6,%f2,%f2
+ ld [%fp-8],%f10
+ cmp %i3,0
+ bge L(250)
+ fitod %f10,%f4
+ faddd %f4,%f8,%f4
+L(250):
+ fdivd %f2,%f4,%f2
+
+ifdef(`PIC',
+` ldd [%o7+L(C1)-L(pc)],%f4',
+` sethi %hi(L(C1)),%o7
+ ldd [%o7+%lo(L(C1))],%f4')
+
+ fcmped %f2,%f4
+ nop
+ fbge,a L(251)
+ fsubd %f2,%f4,%f2
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ b L(252)
+ ld [%fp-8],%i4
+L(251):
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ ld [%fp-8],%i4
+ sethi %hi(-2147483648),%g2
+ xor %i4,%g2,%i4
+L(252):
+ wr %g0,%i4,%y
+ sra %i3,31,%g2
+ and %i4,%g2,%g2
+ andcc %g0,0,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,0,%g1
+ add %g1,%g2,%i0
+ rd %y,%g3
+ subcc %i2,%g3,%o7
+ subxcc %i1,%i0,%g0
+ be L(253)
+ cmp %o7,%i3
+
+ add %i4,-1,%i0
+ add %o7,%i3,%o7
+ st %o7,[%i5]
+ ret
+ restore
+L(253):
+ blu L(246)
+ mov %i4,%i0
+ add %i4,1,%i0
+ sub %o7,%i3,%o7
+L(246):
+ st %o7,[%i5]
+ ret
+ restore
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/rts/gmp/mpn/sparc32/udiv_nfp.asm b/rts/gmp/mpn/sparc32/udiv_nfp.asm
new file mode 100644
index 0000000000..ae19f4c6e9
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/udiv_nfp.asm
@@ -0,0 +1,193 @@
+dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h.
+dnl This is for v7 CPUs without a floating-point unit.
+
+dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr o0
+C n1 o1
+C n0 o2
+C d o3
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+ tst %o3
+ bneg L(largedivisor)
+ mov 8,%g1
+
+ b L(p1)
+ addxcc %o2,%o2,%o2
+
+L(plop):
+ bcc L(n1)
+ addxcc %o2,%o2,%o2
+L(p1): addx %o1,%o1,%o1
+ subcc %o1,%o3,%o4
+ bcc L(n2)
+ addxcc %o2,%o2,%o2
+L(p2): addx %o1,%o1,%o1
+ subcc %o1,%o3,%o4
+ bcc L(n3)
+ addxcc %o2,%o2,%o2
+L(p3): addx %o1,%o1,%o1
+ subcc %o1,%o3,%o4
+ bcc L(n4)
+ addxcc %o2,%o2,%o2
+L(p4): addx %o1,%o1,%o1
+ addcc %g1,-1,%g1
+ bne L(plop)
+ subcc %o1,%o3,%o4
+ bcc L(n5)
+ addxcc %o2,%o2,%o2
+L(p5): st %o1,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(nlop):
+ bcc L(p1)
+ addxcc %o2,%o2,%o2
+L(n1): addx %o4,%o4,%o4
+ subcc %o4,%o3,%o1
+ bcc L(p2)
+ addxcc %o2,%o2,%o2
+L(n2): addx %o4,%o4,%o4
+ subcc %o4,%o3,%o1
+ bcc L(p3)
+ addxcc %o2,%o2,%o2
+L(n3): addx %o4,%o4,%o4
+ subcc %o4,%o3,%o1
+ bcc L(p4)
+ addxcc %o2,%o2,%o2
+L(n4): addx %o4,%o4,%o4
+ addcc %g1,-1,%g1
+ bne L(nlop)
+ subcc %o4,%o3,%o1
+ bcc L(p5)
+ addxcc %o2,%o2,%o2
+L(n5): st %o4,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(largedivisor):
+ and %o2,1,%o5 C %o5 = n0 & 1
+
+ srl %o2,1,%o2
+ sll %o1,31,%g2
+ or %g2,%o2,%o2 C %o2 = lo(n1n0 >> 1)
+ srl %o1,1,%o1 C %o1 = hi(n1n0 >> 1)
+
+ and %o3,1,%g2
+ srl %o3,1,%g3 C %g3 = floor(d / 2)
+ add %g3,%g2,%g3 C %g3 = ceil(d / 2)
+
+ b L(Lp1)
+ addxcc %o2,%o2,%o2
+
+L(Lplop):
+ bcc L(Ln1)
+ addxcc %o2,%o2,%o2
+L(Lp1): addx %o1,%o1,%o1
+ subcc %o1,%g3,%o4
+ bcc L(Ln2)
+ addxcc %o2,%o2,%o2
+L(Lp2): addx %o1,%o1,%o1
+ subcc %o1,%g3,%o4
+ bcc L(Ln3)
+ addxcc %o2,%o2,%o2
+L(Lp3): addx %o1,%o1,%o1
+ subcc %o1,%g3,%o4
+ bcc L(Ln4)
+ addxcc %o2,%o2,%o2
+L(Lp4): addx %o1,%o1,%o1
+ addcc %g1,-1,%g1
+ bne L(Lplop)
+ subcc %o1,%g3,%o4
+ bcc L(Ln5)
+ addxcc %o2,%o2,%o2
+L(Lp5): add %o1,%o1,%o1 C << 1
+ tst %g2
+ bne L(oddp)
+ add %o5,%o1,%o1
+ st %o1,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(Lnlop):
+ bcc L(Lp1)
+ addxcc %o2,%o2,%o2
+L(Ln1): addx %o4,%o4,%o4
+ subcc %o4,%g3,%o1
+ bcc L(Lp2)
+ addxcc %o2,%o2,%o2
+L(Ln2): addx %o4,%o4,%o4
+ subcc %o4,%g3,%o1
+ bcc L(Lp3)
+ addxcc %o2,%o2,%o2
+L(Ln3): addx %o4,%o4,%o4
+ subcc %o4,%g3,%o1
+ bcc L(Lp4)
+ addxcc %o2,%o2,%o2
+L(Ln4): addx %o4,%o4,%o4
+ addcc %g1,-1,%g1
+ bne L(Lnlop)
+ subcc %o4,%g3,%o1
+ bcc L(Lp5)
+ addxcc %o2,%o2,%o2
+L(Ln5): add %o4,%o4,%o4 C << 1
+ tst %g2
+ bne L(oddn)
+ add %o5,%o4,%o4
+ st %o4,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(oddp):
+ xnor %g0,%o2,%o2
+ C q' in %o2. r' in %o1
+ addcc %o1,%o2,%o1
+ bcc L(Lp6)
+ addx %o2,0,%o2
+ sub %o1,%o3,%o1
+L(Lp6): subcc %o1,%o3,%g0
+ bcs L(Lp7)
+ subx %o2,-1,%o2
+ sub %o1,%o3,%o1
+L(Lp7): st %o1,[%o0]
+ retl
+ mov %o2,%o0
+
+L(oddn):
+ xnor %g0,%o2,%o2
+ C q' in %o2. r' in %o4
+ addcc %o4,%o2,%o4
+ bcc L(Ln6)
+ addx %o2,0,%o2
+ sub %o4,%o3,%o4
+L(Ln6): subcc %o4,%o3,%g0
+ bcs L(Ln7)
+ subx %o2,-1,%o2
+ sub %o4,%o3,%o4
+L(Ln7): st %o4,[%o0]
+ retl
+ mov %o2,%o0
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/rts/gmp/mpn/sparc32/umul.asm b/rts/gmp/mpn/sparc32/umul.asm
new file mode 100644
index 0000000000..efa56851d6
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/umul.asm
@@ -0,0 +1,68 @@
+dnl SPARC mpn_umul_ppmm -- support for longlong.h for non-gcc.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+ wr %g0,%o1,%y
+ sra %o2,31,%g2 C Don't move this insn
+ and %o1,%g2,%g2 C Don't move this insn
+ andcc %g0,0,%g1 C Don't move this insn
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,0,%g1
+ rd %y,%g3
+ st %g3,[%o0]
+ retl
+ add %g1,%g2,%o0
+EPILOGUE(mpn_umul_ppmm)
diff --git a/rts/gmp/mpn/sparc32/v8/addmul_1.asm b/rts/gmp/mpn/sparc32/v8/addmul_1.asm
new file mode 100644
index 0000000000..da44644b51
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/addmul_1.asm
@@ -0,0 +1,122 @@
+dnl SPARC v8 mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ orcc %g0,%g0,%g2
+ ld [%o1+0],%o4 C 1
+
+ sll %o2,4,%g1
+ and %g1,(4-1)<<4,%g1
+ifdef(`PIC',
+` mov %o7,%g4 C Save return address register
+0: call 1f
+ add %o7,L(1)-0b,%g3
+1: mov %g4,%o7 C Restore return address register
+',
+` sethi %hi(L(1)),%g3
+ or %g3,%lo(L(1)),%g3
+')
+ jmp %g3+%g1
+ nop
+L(1):
+L(L00): add %o0,-4,%o0
+ b L(loop00) C 4, 8, 12, ...
+ add %o1,-4,%o1
+ nop
+L(L01): b L(loop01) C 1, 5, 9, ...
+ nop
+ nop
+ nop
+L(L10): add %o0,-12,%o0 C 2, 6, 10, ...
+ b L(loop10)
+ add %o1,4,%o1
+ nop
+L(L11): add %o0,-8,%o0 C 3, 7, 11, ...
+ b L(loop11)
+ add %o1,-8,%o1
+ nop
+
+L(loop):
+ addcc %g3,%g2,%g3 C 1
+ ld [%o1+4],%o4 C 2
+ rd %y,%g2 C 1
+ addx %g0,%g2,%g2
+ ld [%o0+0],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+0] C 1
+L(loop00):
+ umul %o4,%o3,%g3 C 2
+ ld [%o0+4],%g1 C 2
+ addxcc %g3,%g2,%g3 C 2
+ ld [%o1+8],%o4 C 3
+ rd %y,%g2 C 2
+ addx %g0,%g2,%g2
+ nop
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+4] C 2
+L(loop11):
+ umul %o4,%o3,%g3 C 3
+ addxcc %g3,%g2,%g3 C 3
+ ld [%o1+12],%o4 C 4
+ rd %y,%g2 C 3
+ add %o1,16,%o1
+ addx %g0,%g2,%g2
+ ld [%o0+8],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+8] C 3
+L(loop10):
+ umul %o4,%o3,%g3 C 4
+ addxcc %g3,%g2,%g3 C 4
+ ld [%o1+0],%o4 C 1
+ rd %y,%g2 C 4
+ addx %g0,%g2,%g2
+ ld [%o0+12],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+12] C 4
+ add %o0,16,%o0
+ addx %g0,%g2,%g2
+L(loop01):
+ addcc %o2,-4,%o2
+ bg L(loop)
+ umul %o4,%o3,%g3 C 1
+
+ addcc %g3,%g2,%g3 C 4
+ rd %y,%g2 C 4
+ addx %g0,%g2,%g2
+ ld [%o0+0],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+0] C 4
+ addx %g0,%g2,%o0
+
+ retl
+ nop
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/sparc32/v8/mul_1.asm b/rts/gmp/mpn/sparc32/v8/mul_1.asm
new file mode 100644
index 0000000000..801247553a
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/mul_1.asm
@@ -0,0 +1,103 @@
+dnl SPARC v8 mpn_mul_1 -- Multiply a limb vector with a single limb and
+dnl store the product in a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ sll %o2,4,%g1
+ and %g1,(4-1)<<4,%g1
+ifdef(`PIC',
+` mov %o7,%g4 C Save return address register
+0: call 1f
+ add %o7,L(1)-0b,%g3
+1: mov %g4,%o7 C Restore return address register
+',
+` sethi %hi(L(1)),%g3
+ or %g3,%lo(L(1)),%g3
+')
+ jmp %g3+%g1
+ ld [%o1+0],%o4 C 1
+L(1):
+L(L00): add %o0,-4,%o0
+ add %o1,-4,%o1
+ b L(loop00) C 4, 8, 12, ...
+ orcc %g0,%g0,%g2
+L(L01): b L(loop01) C 1, 5, 9, ...
+ orcc %g0,%g0,%g2
+ nop
+ nop
+L(L10): add %o0,-12,%o0 C 2, 6, 10, ...
+ add %o1,4,%o1
+ b L(loop10)
+ orcc %g0,%g0,%g2
+ nop
+L(L11): add %o0,-8,%o0 C 3, 7, 11, ...
+ add %o1,-8,%o1
+ b L(loop11)
+ orcc %g0,%g0,%g2
+
+L(loop):
+ addcc %g3,%g2,%g3 C 1
+ ld [%o1+4],%o4 C 2
+ st %g3,[%o0+0] C 1
+ rd %y,%g2 C 1
+L(loop00):
+ umul %o4,%o3,%g3 C 2
+ addxcc %g3,%g2,%g3 C 2
+ ld [%o1+8],%o4 C 3
+ st %g3,[%o0+4] C 2
+ rd %y,%g2 C 2
+L(loop11):
+ umul %o4,%o3,%g3 C 3
+ addxcc %g3,%g2,%g3 C 3
+ ld [%o1+12],%o4 C 4
+ add %o1,16,%o1
+ st %g3,[%o0+8] C 3
+ rd %y,%g2 C 3
+L(loop10):
+ umul %o4,%o3,%g3 C 4
+ addxcc %g3,%g2,%g3 C 4
+ ld [%o1+0],%o4 C 1
+ st %g3,[%o0+12] C 4
+ add %o0,16,%o0
+ rd %y,%g2 C 4
+ addx %g0,%g2,%g2
+L(loop01):
+ addcc %o2,-4,%o2
+ bg L(loop)
+ umul %o4,%o3,%g3 C 1
+
+ addcc %g3,%g2,%g3 C 4
+ st %g3,[%o0+0] C 4
+ rd %y,%g2 C 4
+
+ retl
+ addx %g0,%g2,%o0
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/sparc32/v8/submul_1.asm b/rts/gmp/mpn/sparc32/v8/submul_1.asm
new file mode 100644
index 0000000000..9ed132f4c1
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/submul_1.asm
@@ -0,0 +1,58 @@
+dnl SPARC v8 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ sub %g0,%o2,%o2 C negate ...
+ sll %o2,2,%o2 C ... and scale size
+ sub %o1,%o2,%o1 C o1 is offset s1_ptr
+ sub %o0,%o2,%g1 C g1 is offset res_ptr
+
+ mov 0,%o0 C clear cy_limb
+
+L(loop):
+ ld [%o1+%o2],%o4
+ ld [%g1+%o2],%g2
+ umul %o4,%o3,%o5
+ rd %y,%g3
+ addcc %o5,%o0,%o5
+ addx %g3,0,%o0
+ subcc %g2,%o5,%g2
+ addx %o0,0,%o0
+ st %g2,[%g1+%o2]
+
+ addcc %o2,4,%o2
+ bne L(loop)
+ nop
+
+ retl
+ nop
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm b/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm
new file mode 100644
index 0000000000..0d5e8d415d
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm
@@ -0,0 +1,122 @@
+dnl SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h.
+dnl This is for SuperSPARC only, to compensate for its semi-functional
+dnl udiv instruction.
+
+dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr i0
+C n1 i1
+C n0 i2
+C d i3
+
+ASM_START()
+
+ifdef(`PIC',
+` TEXT
+L(getpc):
+ retl
+ nop')
+
+ TEXT
+ ALIGN(8)
+L(C0): .double 0r4294967296
+L(C1): .double 0r2147483648
+
+PROLOGUE(mpn_udiv_qrnnd)
+ save %sp,-104,%sp
+ st %i1,[%fp-8]
+ ld [%fp-8],%f10
+
+ifdef(`PIC',
+`L(pc): call L(getpc) C put address of this insn in %o7
+ ldd [%o7+L(C0)-L(pc)],%f8',
+` sethi %hi(L(C0)),%o7
+ ldd [%o7+%lo(L(C0))],%f8')
+
+ fitod %f10,%f4
+ cmp %i1,0
+ bge L(248)
+ mov %i0,%i5
+ faddd %f4,%f8,%f4
+L(248):
+ st %i2,[%fp-8]
+ ld [%fp-8],%f10
+ fmuld %f4,%f8,%f6
+ cmp %i2,0
+ bge L(249)
+ fitod %f10,%f2
+ faddd %f2,%f8,%f2
+L(249):
+ st %i3,[%fp-8]
+ faddd %f6,%f2,%f2
+ ld [%fp-8],%f10
+ cmp %i3,0
+ bge L(250)
+ fitod %f10,%f4
+ faddd %f4,%f8,%f4
+L(250):
+ fdivd %f2,%f4,%f2
+
+ifdef(`PIC',
+` ldd [%o7+L(C1)-L(pc)],%f4',
+` sethi %hi(L(C1)),%o7
+ ldd [%o7+%lo(L(C1))],%f4')
+
+ fcmped %f2,%f4
+ nop
+ fbge,a L(251)
+ fsubd %f2,%f4,%f2
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ b L(252)
+ ld [%fp-8],%i4
+L(251):
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ ld [%fp-8],%i4
+ sethi %hi(-2147483648),%g2
+ xor %i4,%g2,%i4
+L(252):
+ umul %i3,%i4,%g3
+ rd %y,%i0
+ subcc %i2,%g3,%o7
+ subxcc %i1,%i0,%g0
+ be L(253)
+ cmp %o7,%i3
+
+ add %i4,-1,%i0
+ add %o7,%i3,%o7
+ st %o7,[%i5]
+ ret
+ restore
+L(253):
+ blu L(246)
+ mov %i4,%i0
+ add %i4,1,%i0
+ sub %o7,%i3,%o7
+L(246):
+ st %o7,[%i5]
+ ret
+ restore
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/rts/gmp/mpn/sparc32/v8/umul.asm b/rts/gmp/mpn/sparc32/v8/umul.asm
new file mode 100644
index 0000000000..ae8f692a0a
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/umul.asm
@@ -0,0 +1,31 @@
+dnl SPARC v8 mpn_umul_ppmm -- support for longlong.h for non-gcc.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+	umul	%o1,%o2,%g2	C 32x32->64 unsigned multiply: low word -> %g2, high word -> %y
+	st	%g2,[%o0]	C store low product word through the result pointer
+	retl
+	rd	%y,%o0		C return the high product word (delay slot)
+EPILOGUE(mpn_umul_ppmm)
diff --git a/rts/gmp/mpn/sparc32/v9/README b/rts/gmp/mpn/sparc32/v9/README
new file mode 100644
index 0000000000..9b39713271
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/README
@@ -0,0 +1,4 @@
+Code for SPARC processors implementing version 9 of the SPARC architecture.
+This code is for systems that don't preserve the full 64-bit contents of
+integer registers at context switch.  For other systems (such as Solaris 7 or
+later) use the code in ../../sparc64.
diff --git a/rts/gmp/mpn/sparc32/v9/addmul_1.asm b/rts/gmp/mpn/sparc32/v9/addmul_1.asm
new file mode 100644
index 0000000000..c1762cc41f
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/addmul_1.asm
@@ -0,0 +1,288 @@
+dnl SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+
+	TEXT
+	ALIGN(4)
+L(noll):
+	.word	0	C zero word: keeps the high half of %f10:%f11 clear so fxtod sees a 32-bit value
+
+PROLOGUE(mpn_addmul_1)
+	save	%sp,-256,%sp	C [%fp-16..%fp-40] used as int<->fp staging slots
+
+ifdef(`PIC',
+`L(pc):	rd	%pc,%o7
+	ld	[%o7+L(noll)-L(pc)],%f10',
+`	sethi	%hi(L(noll)),%g1
+	ld	[%g1+%lo(L(noll))],%f10')
+
+	sethi	%hi(0xffff0000),%o0
+	andn	%i3,%o0,%o0	C low 16 bits of s2_limb
+	st	%o0,[%fp-16]
+	ld	[%fp-16],%f11
+	fxtod	%f10,%f6	C f6 = (double)(s2_limb & 0xffff)
+
+	srl	%i3,16,%o0	C high 16 bits of s2_limb
+	st	%o0,[%fp-16]
+	ld	[%fp-16],%f11
+	fxtod	%f10,%f8	C f8 = (double)(s2_limb >> 16)
+
+	mov	0,%g3		C cy = 0
+
+	ld	[%i1],%f11
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end1)
+	add	%i1,4,%i1	C s1_ptr++
+
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	fmuld	%f2,%f8,%f16	C p16 = s1_limb * high16 (exact in double)
+	fmuld	%f2,%f6,%f4	C p0  = s1_limb * low16
+	fdtox	%f16,%f14
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end2)
+	std	%f12,[%fp-16]
+
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	fmuld	%f2,%f8,%f16
+	fmuld	%f2,%f6,%f4
+	fdtox	%f16,%f14
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end3)
+	std	%f12,[%fp-32]
+
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	ld	[%i0],%g5
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	add	%i0,4,%i0	C res_ptr++
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end4)
+	std	%f12,[%fp-16]
+
+	b,a	L(loopm)
+
+	.align	16
+C BEGIN LOOP
+L(loop):
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	add	%g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add	%g3,%g1,%g4	C p += cy
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-16]
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(loope)
+	add	%i0,4,%i0	C res_ptr++
+L(loopm):
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	add	%g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add	%g3,%g1,%g4	C p += cy
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-32],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-32]
+	subcc	%i2,1,%i2
+	bne,pt	%icc,L(loop)
+	add	%i0,4,%i0	C res_ptr++
+C END LOOP
+
+	fxtod	%f10,%f2
+	add	%g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add	%g3,%g1,%g4	C p += cy
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	b,a	L(xxx)
+L(loope):
+L(end4):
+	fxtod	%f10,%f2
+	add	%g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add	%g3,%g1,%g4	C p += cy
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-32],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-32]
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add	%g3,%g1,%g4	C p += cy
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	b,a	L(yyy)
+
+L(end3):
+	fxtod	%f10,%f2
+	ld	[%i0],%g5
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+L(xxx):	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-16]
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add	%g3,%g1,%g4	C p += cy
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	ldx	[%fp-32],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add	%g3,%g1,%g4	C p += cy
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+	b,a	L(ret)
+
+L(end2):
+	fxtod	%f10,%f2
+	fmuld	%f2,%f8,%f16
+	fmuld	%f2,%f6,%f4
+	fdtox	%f16,%f14
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-32]
+	ld	[%i0],%g5
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+L(yyy):	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add	%g3,%g1,%g4	C p += cy
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	ldx	[%fp-32],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+	b,a	L(ret)
+
+L(end1):
+	fxtod	%f10,%f2
+	fmuld	%f2,%f8,%f16
+	fmuld	%f2,%f6,%f4
+	fdtox	%f16,%f14
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-16]
+
+	ld	[%i0],%g5
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+
+L(ret):	add	%g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add	%g3,%g1,%g4	C p += cy
+	srlx	%g4,32,%g3
+	st	%g4,[%i0-4]
+
+	ret
+	restore %g0,%g3,%o0	C sideeffect: put cy in retreg
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/sparc32/v9/gmp-mparam.h b/rts/gmp/mpn/sparc32/v9/gmp-mparam.h
new file mode 100644
index 0000000000..f946b900f0
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/gmp-mparam.h
@@ -0,0 +1,69 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32	/* limb is 32 bits under the sparc32 ABI, even on v9 CPUs */
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+/* These values are for UltraSPARC I, II, and IIi.  It is bogus that
+   this file lives in v9, but that will do for now.  */
+
+/* Variations in addmul_1 speed make the multiply and square thresholds
+   doubtful.  TOOM3_SQR_THRESHOLD had to be estimated here.  */
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD	/* #ifndef guards let configure/tuneup override each value */
+#define KARATSUBA_MUL_THRESHOLD 30
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 200
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 59
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 500
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 107
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 146
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 29
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 3
+#endif
diff --git a/rts/gmp/mpn/sparc32/v9/mul_1.asm b/rts/gmp/mpn/sparc32/v9/mul_1.asm
new file mode 100644
index 0000000000..f8f0fdd8c2
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/mul_1.asm
@@ -0,0 +1,267 @@
+dnl SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and
+dnl store the result in a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+
+	TEXT
+	ALIGN(4)
+L(noll):
+	.word	0	C zero word: keeps the high half of %f10:%f11 clear so fxtod sees a 32-bit value
+
+PROLOGUE(mpn_mul_1)
+	save	%sp,-256,%sp	C [%fp-16..%fp-40] used as int<->fp staging slots
+
+ifdef(`PIC',
+`L(pc):	rd	%pc,%o7
+	ld	[%o7+L(noll)-L(pc)],%f10',
+`	sethi	%hi(L(noll)),%g1
+	ld	[%g1+%lo(L(noll))],%f10')
+
+	sethi	%hi(0xffff0000),%o0
+	andn	%i3,%o0,%o0	C low 16 bits of s2_limb
+	st	%o0,[%fp-16]
+	ld	[%fp-16],%f11
+	fxtod	%f10,%f6	C f6 = (double)(s2_limb & 0xffff)
+
+	srl	%i3,16,%o0	C high 16 bits of s2_limb
+	st	%o0,[%fp-16]
+	ld	[%fp-16],%f11
+	fxtod	%f10,%f8	C f8 = (double)(s2_limb >> 16)
+
+	mov	0,%g3		C cy = 0
+
+	ld	[%i1],%f11
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end1)
+	add	%i1,4,%i1	C s1_ptr++
+
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	fmuld	%f2,%f8,%f16	C p16 = s1_limb * high16 (exact in double)
+	fmuld	%f2,%f6,%f4	C p0  = s1_limb * low16
+	fdtox	%f16,%f14
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end2)
+	std	%f12,[%fp-16]
+
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	fmuld	%f2,%f8,%f16
+	fmuld	%f2,%f6,%f4
+	fdtox	%f16,%f14
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end3)
+	std	%f12,[%fp-32]
+
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	add	%i0,4,%i0	C res_ptr++
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end4)
+	std	%f12,[%fp-16]
+
+	b,a	L(loopm)
+
+	.align	16
+C BEGIN LOOP
+L(loop):
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	add	%g3,%g1,%g4	C p += cy
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-16]
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(loope)
+	add	%i0,4,%i0	C res_ptr++
+L(loopm):
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	add	%g3,%g1,%g4	C p += cy
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-32],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-32]
+	subcc	%i2,1,%i2
+	bne,pt	%icc,L(loop)
+	add	%i0,4,%i0	C res_ptr++
+C END LOOP
+
+	fxtod	%f10,%f2
+	add	%g3,%g1,%g4	C p += cy
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	b,a	L(xxx)
+L(loope):
+L(end4):
+	fxtod	%f10,%f2
+	add	%g3,%g1,%g4	C p += cy
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-32],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-32]
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g3,%g1,%g4	C p += cy
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	b,a	L(yyy)
+
+L(end3):
+	fxtod	%f10,%f2
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+L(xxx):	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-16]
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g3,%g1,%g4	C p += cy
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	ldx	[%fp-32],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g3,%g1,%g4	C p += cy
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+	b,a	L(ret)
+
+L(end2):
+	fxtod	%f10,%f2
+	fmuld	%f2,%f8,%f16
+	fmuld	%f2,%f6,%f4
+	fdtox	%f16,%f14
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-32]
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+L(yyy):	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g3,%g1,%g4	C p += cy
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	ldx	[%fp-32],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%g4,[%i0-4]
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+	b,a	L(ret)
+
+L(end1):
+	fxtod	%f10,%f2
+	fmuld	%f2,%f8,%f16
+	fmuld	%f2,%f6,%f4
+	fdtox	%f16,%f14
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-16]
+
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+
+L(ret):	add	%g3,%g1,%g4	C p += cy
+	srlx	%g4,32,%g3
+	st	%g4,[%i0-4]
+
+	ret
+	restore %g0,%g3,%o0	C sideeffect: put cy in retreg
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/sparc32/v9/submul_1.asm b/rts/gmp/mpn/sparc32/v9/submul_1.asm
new file mode 100644
index 0000000000..6195ea88ea
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/submul_1.asm
@@ -0,0 +1,291 @@
+dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+
+	TEXT
+	ALIGN(4)
+L(noll):
+	.word	0	C zero word: keeps the high half of %f10:%f11 clear so fxtod sees a 32-bit value
+
+PROLOGUE(mpn_submul_1)
+	save	%sp,-256,%sp	C [%fp-16..%fp-40] used as int<->fp staging slots
+
+ifdef(`PIC',
+`L(pc):	rd	%pc,%o7
+	ld	[%o7+L(noll)-L(pc)],%f10',
+`	sethi	%hi(L(noll)),%g1
+	ld	[%g1+%lo(L(noll))],%f10')
+
+	sethi	%hi(0xffff0000),%o0
+	andn	%i3,%o0,%o0	C low 16 bits of s2_limb
+	st	%o0,[%fp-16]
+	ld	[%fp-16],%f11
+	fxtod	%f10,%f6	C f6 = (double)(s2_limb & 0xffff)
+
+	srl	%i3,16,%o0	C high 16 bits of s2_limb
+	st	%o0,[%fp-16]
+	ld	[%fp-16],%f11
+	fxtod	%f10,%f8	C f8 = (double)(s2_limb >> 16)
+
+	mov	0,%g3		C cy = 0
+
+	ld	[%i1],%f11
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end1)
+	add	%i1,4,%i1	C s1_ptr++
+
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	fmuld	%f2,%f8,%f16	C p16 = s1_limb * high16 (exact in double)
+	fmuld	%f2,%f6,%f4	C p0  = s1_limb * low16
+	fdtox	%f16,%f14
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end2)
+	std	%f12,[%fp-16]
+
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	fmuld	%f2,%f8,%f16
+	fmuld	%f2,%f6,%f4
+	fdtox	%f16,%f14
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end3)
+	std	%f12,[%fp-32]
+
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	ld	[%i0],%g5
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	add	%i0,4,%i0	C res_ptr++
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(end4)
+	std	%f12,[%fp-16]
+
+	b,a	L(loopm)
+
+	.align	16
+C BEGIN LOOP
+L(loop):
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	add	%g3,%g1,%g4	C p += cy
+	subcc	%g5,%g4,%l2	C add *res_ptr to p0 (ADD2)
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%l2,[%i0-4]
+	addx	%g3,0,%g3	C fold the borrow from the subcc above into cy
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-16]
+	subcc	%i2,1,%i2
+	be,pn	%icc,L(loope)
+	add	%i0,4,%i0	C res_ptr++
+L(loopm):
+	fxtod	%f10,%f2
+	ld	[%i1],%f11
+	add	%i1,4,%i1	C s1_ptr++
+	add	%g3,%g1,%g4	C p += cy
+	subcc	%g5,%g4,%l2	C add *res_ptr to p0 (ADD2)
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-32],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%l2,[%i0-4]
+	addx	%g3,0,%g3	C fold the borrow from the subcc above into cy
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-32]
+	subcc	%i2,1,%i2
+	bne,pt	%icc,L(loop)
+	add	%i0,4,%i0	C res_ptr++
+C END LOOP
+
+	fxtod	%f10,%f2
+	add	%g3,%g1,%g4	C p += cy
+	subcc	%g5,%g4,%l2	C add *res_ptr to p0 (ADD2)
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%l2,[%i0-4]
+	b,a	L(xxx)
+L(loope):
+L(end4):
+	fxtod	%f10,%f2
+	add	%g3,%g1,%g4	C p += cy
+	subcc	%g5,%g4,%l2	C add *res_ptr to p0 (ADD2)
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-32],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+	st	%l2,[%i0-4]
+	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-32]
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g3,%g1,%g4	C p += cy
+	subxcc	%g5,%g4,%l2	C add *res_ptr to p0 (ADD2)
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%l2,[%i0-4]
+	b,a	L(yyy)
+
+L(end3):
+	fxtod	%f10,%f2
+	ld	[%i0],%g5
+	ldx	[%fp-24],%g2	C p16
+	fmuld	%f2,%f8,%f16
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f6,%f4
+	sllx	%g2,16,%g2	C align p16
+L(xxx):	fdtox	%f16,%f14
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-16]
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g3,%g1,%g4	C p += cy
+	subxcc	%g5,%g4,%l2	C add *res_ptr to p0 (ADD2)
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	ldx	[%fp-32],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%l2,[%i0-4]
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g3,%g1,%g4	C p += cy
+	subxcc	%g5,%g4,%l2	C add *res_ptr to p0 (ADD2)
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%l2,[%i0-4]
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+	b,a	L(ret)
+
+L(end2):
+	fxtod	%f10,%f2
+	fmuld	%f2,%f8,%f16
+	fmuld	%f2,%f6,%f4
+	fdtox	%f16,%f14
+	std	%f14,[%fp-40]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-32]
+	ld	[%i0],%g5
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+L(yyy):	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+
+	add	%g3,%g1,%g4	C p += cy
+	subxcc	%g5,%g4,%l2	C add *res_ptr to p0 (ADD2)
+	ld	[%i0],%g5
+	srlx	%g4,32,%g3
+	ldx	[%fp-40],%g2	C p16
+	ldx	[%fp-32],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	st	%l2,[%i0-4]
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+	b,a	L(ret)
+
+L(end1):
+	fxtod	%f10,%f2
+	fmuld	%f2,%f8,%f16
+	fmuld	%f2,%f6,%f4
+	fdtox	%f16,%f14
+	std	%f14,[%fp-24]
+	fdtox	%f4,%f12
+	std	%f12,[%fp-16]
+
+	ld	[%i0],%g5
+	ldx	[%fp-24],%g2	C p16
+	ldx	[%fp-16],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i0,4,%i0	C res_ptr++
+
+L(ret):	add	%g3,%g1,%g4	C p += cy
+	subxcc	%g5,%g4,%l2	C add *res_ptr to p0 (ADD2)
+	srlx	%g4,32,%g3
+	st	%l2,[%i0-4]
+
+	addx	%g3,%g0,%g3	C fold the final borrow into cy
+	ret
+	restore %g0,%g3,%o0	C sideeffect: put cy in retreg
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/sparc64/README b/rts/gmp/mpn/sparc64/README
new file mode 100644
index 0000000000..6923a133f3
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/README
@@ -0,0 +1,48 @@
+This directory contains mpn functions for 64-bit V9 SPARC
+
+RELEVANT OPTIMIZATION ISSUES
+
+The Ultra I/II pipeline executes up to two simple integer arithmetic operations
+per cycle. The 64-bit integer multiply instruction mulx takes from 5 cycles to
+35 cycles, depending on the position of the most significant bit of the 1st
+source operand. It cannot overlap with other instructions. For our use of
+mulx, it will take from 5 to 20 cycles.
+
+Integer conditional move instructions cannot dual-issue with other integer
+instructions.  No conditional move can issue 1-5 cycles after a load.  (Or
+something similarly bizarre.)
+
+Integer branches can issue with two integer arithmetic instructions. Likewise
+for integer loads. Four instructions may issue (arith, arith, ld/st, branch)
+but only if the branch is last.
+
+(The V9 architecture manual recommends that the 2nd operand of a multiply
+instruction be the smaller one. For UltraSPARC, they got things backwards and
+optimize for the wrong operand! Really helpful in the light of that multiply
+is incredibly slow on these CPUs!)
+
+STATUS
+
+There is new code in ~/prec/gmp-remote/sparc64. Not tested or completed, but
+the pipelines are worked out. Here are the timings:
+
+* lshift, rshift: The code is well-optimized and runs at 2.0 cycles/limb.
+
+* add_n, sub_n: add3.s currently runs at 6 cycles/limb. We use a bizarre
+ scheme of compares and branches (with some nops and fnops to align things)
+ and carefully stay away from the instructions intended for this application
+ (i.e., movcs and movcc).
+
+ Using movcc/movcs, even with deep unrolling, seems to get down to 7
+ cycles/limb.
+
+ The most promising approach is to split operands in 32-bit pieces using
+  srlx, then use two addccc, and finally combine the results with sllx+or.
+ The result could run at 5 cycles/limb, I think. It might be possible to
+ do without unrolling, or with minimal unrolling.
+
+* addmul_1/submul_1: Should optimize for when scalar operand < 2^32.
+* addmul_1/submul_1: Since mulx is horrendously slow on UltraSPARC I/II,
+ Karatsuba's method should save up to 16 cycles (i.e. > 20%).
+* mul_1 (and possibly the other multiply functions): Handle carry in the
+ same tricky way as add_n,sub_n.
diff --git a/rts/gmp/mpn/sparc64/add_n.asm b/rts/gmp/mpn/sparc64/add_n.asm
new file mode 100644
index 0000000000..72b3895a5b
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/add_n.asm
@@ -0,0 +1,172 @@
+! SPARC v9 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+! sum in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! s1_ptr %o1
+! s2_ptr %o2
+! size %o3
+
+include(`../config.m4')
+
+ASM_START()
+	.register	%g2,#scratch
+	.register	%g3,#scratch
+PROLOGUE(mpn_add_n)
+
+! 12 mem ops >= 12 cycles
+! 8 shift insn >= 8 cycles
+! 8 addccc, executing alone, +8 cycles
+! Unrolling not mandatory...perhaps 2-way is best?
+! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
+! All in all, it runs at 5 cycles/limb
+
+	save	%sp,-160,%sp
+
+	addcc	%g0,%g0,%g0	! clear carry
+
+	add	%i3,-4,%i3	! size -= 4
+	brlz,pn	%i3,L(there)	! fewer than 4 limbs: handle them one at a time
+	nop
+
+	ldx	[%i1+0],%l0
+	ldx	[%i2+0],%l4
+	ldx	[%i1+8],%l1
+	ldx	[%i2+8],%l5
+	ldx	[%i1+16],%l2
+	ldx	[%i2+16],%l6
+	ldx	[%i1+24],%l3
+	ldx	[%i2+24],%l7
+	add	%i1,32,%i1
+	add	%i2,32,%i2
+
+	add	%i3,-4,%i3
+	brlz,pn	%i3,L(skip)
+	nop
+	b	L(loop1)	! jump instead of executing many NOPs
+	nop
+	ALIGN(32)
+!--------- Start main loop ---------
+L(loop1):
+	addccc	%l0,%l4,%g1	! full 64-bit sum of limb 0, consuming incoming carry
+!-
+	srlx	%l0,32,%o0
+	ldx	[%i1+0],%l0
+!-
+	srlx	%l4,32,%o4
+	ldx	[%i2+0],%l4
+!-
+	addccc	%o0,%o4,%g0	! regenerate inter-limb carry from the high halves (sum discarded)
+!-
+	addccc	%l1,%l5,%g2
+!-
+	srlx	%l1,32,%o1
+	ldx	[%i1+8],%l1
+!-
+	srlx	%l5,32,%o5
+	ldx	[%i2+8],%l5
+!-
+	addccc	%o1,%o5,%g0
+!-
+	addccc	%l2,%l6,%g3
+!-
+	srlx	%l2,32,%o2
+	ldx	[%i1+16],%l2
+!-
+	srlx	%l6,32,%g5	! asymmetry
+	ldx	[%i2+16],%l6
+!-
+	addccc	%o2,%g5,%g0
+!-
+	addccc	%l3,%l7,%g4
+!-
+	srlx	%l3,32,%o3
+	ldx	[%i1+24],%l3
+	add	%i1,32,%i1
+!-
+	srlx	%l7,32,%o7
+	ldx	[%i2+24],%l7
+	add	%i2,32,%i2
+!-
+	addccc	%o3,%o7,%g0
+!-
+	stx	%g1,[%i0+0]
+!-
+	stx	%g2,[%i0+8]
+!-
+	stx	%g3,[%i0+16]
+	add	%i3,-4,%i3
+!-
+	stx	%g4,[%i0+24]
+	add	%i0,32,%i0
+
+	brgez,pt	%i3,L(loop1)
+	nop
+!--------- End main loop ---------
+L(skip):
+	addccc	%l0,%l4,%g1	! same pattern as the loop body, for the already-loaded last 4 limbs
+	srlx	%l0,32,%o0
+	srlx	%l4,32,%o4
+	addccc	%o0,%o4,%g0
+	addccc	%l1,%l5,%g2
+	srlx	%l1,32,%o1
+	srlx	%l5,32,%o5
+	addccc	%o1,%o5,%g0
+	addccc	%l2,%l6,%g3
+	srlx	%l2,32,%o2
+	srlx	%l6,32,%g5	! asymmetry
+	addccc	%o2,%g5,%g0
+	addccc	%l3,%l7,%g4
+	srlx	%l3,32,%o3
+	srlx	%l7,32,%o7
+	addccc	%o3,%o7,%g0
+	stx	%g1,[%i0+0]
+	stx	%g2,[%i0+8]
+	stx	%g3,[%i0+16]
+	stx	%g4,[%i0+24]
+	add	%i0,32,%i0
+
+L(there):
+	add	%i3,4,%i3	! undo the bias; remaining limb count
+	brz,pt	%i3,L(end)
+	nop
+
+L(loop2):
+	ldx	[%i1+0],%l0
+	add	%i1,8,%i1
+	ldx	[%i2+0],%l4
+	add	%i2,8,%i2
+	srlx	%l0,32,%g2
+	srlx	%l4,32,%g3
+	addccc	%l0,%l4,%g1
+	addccc	%g2,%g3,%g0	! regenerate inter-limb carry from the high halves
+	stx	%g1,[%i0+0]
+	add	%i0,8,%i0
+	add	%i3,-1,%i3
+	brgz,pt	%i3,L(loop2)
+	nop
+
+L(end):	addc	%g0,%g0,%i0	! return the final carry limb
+	ret
+	restore
+EPILOGUE(mpn_add_n)
diff --git a/rts/gmp/mpn/sparc64/addmul1h.asm b/rts/gmp/mpn/sparc64/addmul1h.asm
new file mode 100644
index 0000000000..96cb5f7369
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/addmul1h.asm
@@ -0,0 +1,203 @@
+dnl SPARC 64-bit addmull/addmulu -- Helper for mpn_addmul_1 and mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ifdef(`LOWPART',
+`addmull:',
+`addmulu:')
+	save %sp,-256,%sp
+
+	sethi %hi(0xffff0000),%o0
+	andn %i3,%o0,%o0
+	st %o0,[%fp-17]
+	ld [%fp-17],%f11
+	fxtod %f10,%f6	C f6 = (double) low 16 bits of s2_limb
+
+	srl %i3,16,%o0
+	st %o0,[%fp-17]
+	ld [%fp-17],%f11
+	fxtod %f10,%f8	C f8 = (double) next 16 bits of s2_limb
+
+	mov 0,%g3	C cy = 0
+
+	ld [%i1+4],%f11
+	subcc %i2,1,%i2
+dnl	be,pn %icc,E(end1)
+	add %i1,4,%i1	C s1_ptr++
+
+	fxtod %f10,%f2
+	ld [%i1-4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	fmuld %f2,%f8,%f16
+	fmuld %f2,%f6,%f4
+	fdtox %f16,%f14
+	std %f14,[%fp-25]
+	fdtox %f4,%f12
+	subcc %i2,1,%i2
+	be,pn %icc,E(end2)
+	std %f12,[%fp-17]
+
+	fxtod %f10,%f2
+	ld [%i1+4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	fmuld %f2,%f8,%f16
+	fmuld %f2,%f6,%f4
+	fdtox %f16,%f14
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	subcc %i2,1,%i2
+dnl	be,pn %icc,E(end3)
+	std %f12,[%fp-33]
+
+	fxtod %f10,%f2
+	ld [%i1-4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	ld [%i0+DLO],%g5
+	ldx [%fp-25],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-17],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-25]
+	fdtox %f4,%f12
+	add %i0,4,%i0	C res_ptr++
+	subcc %i2,1,%i2
+	be,pn %icc,E(end4)
+	std %f12,[%fp-17]
+
+	b,a E(loop)
+	nop	C nop is cheap to nullify
+
+	ALIGN(16)
+C BEGIN LOOP
+E(loop):
+	fxtod %f10,%f2
+	ld [%i1+4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	add %g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add %g3,%g1,%g4	C p += cy
+	ld [%i0+DHI],%g5
+	srlx %g4,32,%g3
+	ldx [%fp-41],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-33],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	st %g4,[%i0-4+DLO]
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	std %f12,[%fp-33]
+	sub %i2,2,%i2
+	add %i0,4,%i0	C res_ptr++
+
+	fxtod %f10,%f2
+	ld [%i1-4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	add %g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add %g3,%g1,%g4	C p += cy
+	ld [%i0+DLO],%g5
+	srlx %g4,32,%g3
+	ldx [%fp-25],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-17],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	st %g4,[%i0-4+DHI]
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-25]
+	fdtox %f4,%f12
+	std %f12,[%fp-17]
+	brnz,pt %i2,E(loop)
+	add %i0,4,%i0	C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+	fxtod %f10,%f2
+	add %g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add %g3,%g1,%g4	C p += cy
+	ld [%i0+DHI],%g5
+	srlx %g4,32,%g3
+	ldx [%fp-41],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-33],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	st %g4,[%i0-4+DLO]
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	std %f12,[%fp-33]
+	add %i0,4,%i0	C res_ptr++
+
+	add %g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add %g3,%g1,%g4	C p += cy
+	ld [%i0+DLO],%g5
+	srlx %g4,32,%g3
+	ldx [%fp-25],%g2	C p16
+	ldx [%fp-17],%g1	C p0
+	sllx %g2,16,%g2	C align p16
+	st %g4,[%i0-4+DHI]
+	b,a E(yyy)
+
+E(end2):
+	fxtod %f10,%f2
+	fmuld %f2,%f8,%f16
+	fmuld %f2,%f6,%f4
+	fdtox %f16,%f14
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	std %f12,[%fp-33]
+	ld [%i0+DLO],%g5
+	ldx [%fp-25],%g2	C p16
+	ldx [%fp-17],%g1	C p0
+	sllx %g2,16,%g2	C align p16
+E(yyy):	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add %i0,4,%i0	C res_ptr++
+
+	add %g5,%g1,%g1	C add *res_ptr to p0 (ADD2)
+	add %g3,%g1,%g4	C p += cy
+ifdef(`LOWPART',
+`	ld [%i0+DHI],%g5')
+	srlx %g4,32,%g3
+	ldx [%fp-41],%g2	C p16
+	ldx [%fp-33],%g1	C p0
+	sllx %g2,16,%g2	C align p16
+	st %g4,[%i0-4+DLO]
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add %i0,4,%i0	C res_ptr++
+
+ifdef(`LOWPART',
+`	add %g5,%g1,%g1')	C add *res_ptr to p0 (ADD2)
+	add %g3,%g1,%g4	C p += cy
+ifdef(`LOWPART',
+`	st %g4,[%i0-4+DHI]
+	srlx %g4,32,%g4')
+
+	ret
+	restore %g0,%g4,%o0	C sideeffect: put cy in retreg
+ifdef(`LOWPART',
+`EPILOGUE(addmull)',
+`EPILOGUE(addmulu)')
diff --git a/rts/gmp/mpn/sparc64/addmul_1.asm b/rts/gmp/mpn/sparc64/addmul_1.asm
new file mode 100644
index 0000000000..c3f04cea6a
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/addmul_1.asm
@@ -0,0 +1,114 @@
+dnl SPARC 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_addmul_1)
+	save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc):	rd %pc,%o7
+	ld [%o7+L(noll)-L(pc)],%f10',
+`	sethi %hh(L(noll)),%g2
+	sethi %lm(L(noll)),%g1
+	or %g2,%hm(L(noll)),%g2
+	or %g1,%lo(L(noll)),%g1
+	sllx %g2,32,%g2
+	ld [%g1+%g2],%f10')
+
+	sub %i1,%i0,%g1	C limb distance between src and dst
+	srlx %g1,3,%g1
+	cmp %g1,%i2
+	bcc,pt %xcc,L(nooverlap)	C no copy needed if gap >= size
+	nop
+
+	sllx %i2,3,%g2	C compute stack allocation byte count
+	add %g2,15,%o0
+	and %o0,-16,%o0
+	sub %sp,%o0,%sp
+	add %sp,2223,%o0
+
+	mov %i1,%o1	C copy s1_ptr to mpn_copyi's srcp
+	call mpn_copyi
+	mov %i2,%o2	C copy n to mpn_copyi's count parameter
+
+	add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+	mov %i0,%o0
+	mov %i1,%o1
+	add %i2,%i2,%o2
+	call addmull
+	srl %i3,0,%o3
+
+	mov %o0,%l0	C keep carry-out from addmull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+	srlx %i3,32,%o3
+	brz,a,pn %o3,L(small)
+	mov %o0,%i0
+	mov %i1,%o1
+	add %i2,%i2,%o2
+	call addmulu
+	add %i0,4,%o0
+
+	add %l0,%o0,%i0
+L(small):
+	ret
+	restore %g0,%g0,%g0
+EPILOGUE(mpn_addmul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+	TEXT
+	ALIGN(4)
+L(noll):
+	.word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`LOWPART')	C first expansion: addmull (low half of s2_limb)
+define(`E',`L(l.$1)')	C prefix local labels to avoid clashes
+include_mpn(`sparc64/addmul1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')	C second expansion: addmulu (high half of s2_limb)
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/addmul1h.asm')
diff --git a/rts/gmp/mpn/sparc64/copyi.asm b/rts/gmp/mpn/sparc64/copyi.asm
new file mode 100644
index 0000000000..d9957e3c90
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/copyi.asm
@@ -0,0 +1,79 @@
+! SPARC v9 __gmpn_copy -- Copy a limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! rptr %o0
+! sptr %o1
+! n %o2
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_copyi)
+	add %o2,-8,%o2
+	brlz,pn %o2,L(skip)	! fewer than 8 limbs: tail loop only
+	nop
+	b,a L(loop1)	! annulled branch; delay-slot nop never executes
+	nop
+
+	ALIGN(16)
+L(loop1):	! unrolled main loop: 8 limbs per iteration
+	ldx [%o1+0],%g1
+	ldx [%o1+8],%g2
+	ldx [%o1+16],%g3
+	ldx [%o1+24],%g4
+	ldx [%o1+32],%g5
+	ldx [%o1+40],%o3
+	ldx [%o1+48],%o4
+	ldx [%o1+56],%o5
+	add %o1,64,%o1
+	stx %g1,[%o0+0]
+	stx %g2,[%o0+8]
+	stx %g3,[%o0+16]
+	stx %g4,[%o0+24]
+	stx %g5,[%o0+32]
+	stx %o3,[%o0+40]
+	stx %o4,[%o0+48]
+	stx %o5,[%o0+56]
+	add %o2,-8,%o2
+	brgez,pt %o2,L(loop1)
+	add %o0,64,%o0
+
+L(skip):
+	add %o2,8,%o2	! restore count of remaining (0..7) limbs
+	brz,pt %o2,L(end)
+	nop
+
+L(loop2):	! tail loop: one limb per iteration
+	ldx [%o1],%g1
+	add %o1,8,%o1
+	add %o2,-1,%o2
+	stx %g1,[%o0]
+	add %o0,8,%o0
+	brgz,pt %o2,L(loop2)
+	nop
+
+L(end):	retl
+	nop
+EPILOGUE(mpn_copyi)
diff --git a/rts/gmp/mpn/sparc64/gmp-mparam.h b/rts/gmp/mpn/sparc64/gmp-mparam.h
new file mode 100644
index 0000000000..74f61661c1
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/gmp-mparam.h
@@ -0,0 +1,88 @@
+/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Tell the toom3 multiply implementation to call low-level mpn
+   functions instead of open-coding operations in C. */
+#define USE_MORE_MPN 1
+
+
+/* Run on sun workshop cc. */
+/* Generated by tuneup.c, 2000-07-30. */
+
+/* NOTE(review): threshold values below are presumably operand sizes in
+   limbs, per the tuneup.c convention -- confirm against tuneup.c. */
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 12
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 95
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 33
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 125
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 27
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 107
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 12
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 199
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 304, 608, 1344, 2304, 7168, 20480, 49152, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 320
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 1664
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 304, 608, 1344, 2816, 7168, 20480, 49152, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 320
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 1664
+#endif
diff --git a/rts/gmp/mpn/sparc64/lshift.asm b/rts/gmp/mpn/sparc64/lshift.asm
new file mode 100644
index 0000000000..2d2edc50a7
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/lshift.asm
@@ -0,0 +1,97 @@
+! SPARC v9 __gmpn_lshift --
+
+! Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! src_ptr %o1
+! size %o2
+! cnt %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_lshift)
+	sllx %o2,3,%g1
+	add %o1,%g1,%o1	! make %o1 point at end of src
+	ldx [%o1-8],%g2	! load first limb
+	sub %g0,%o3,%o5	! negate shift count
+	add %o0,%g1,%o0	! make %o0 point at end of res
+	add %o2,-1,%o2
+	and %o2,4-1,%g4	! number of limbs in first loop
+	srlx %g2,%o5,%g1	! compute function result
+	brz,pn %g4,L(0)	! if multiple of 4 limbs, skip first loop
+	mov %g1,%g5
+
+	sub %o2,%g4,%o2	! adjust count for main loop
+
+L(loop0):	! peel loop: one limb at a time, top down
+	ldx [%o1-16],%g3
+	add %o0,-8,%o0
+	add %o1,-8,%o1
+	add %g4,-1,%g4
+	sllx %g2,%o3,%o4
+	srlx %g3,%o5,%g1
+	mov %g3,%g2
+	or %o4,%g1,%o4
+	brnz,pt %g4,L(loop0)
+	stx %o4,[%o0+0]
+
+L(0):	brz,pn %o2,L(end)
+	nop
+
+L(loop1):	! unrolled main loop: 4 limbs per iteration
+	ldx [%o1-16],%g3
+	add %o0,-32,%o0
+	add %o2,-4,%o2
+	sllx %g2,%o3,%o4
+	srlx %g3,%o5,%g1
+
+	ldx [%o1-24],%g2
+	sllx %g3,%o3,%g4
+	or %o4,%g1,%o4
+	stx %o4,[%o0+24]
+	srlx %g2,%o5,%g1
+
+	ldx [%o1-32],%g3
+	sllx %g2,%o3,%o4
+	or %g4,%g1,%g4
+	stx %g4,[%o0+16]
+	srlx %g3,%o5,%g1
+
+	ldx [%o1-40],%g2
+	sllx %g3,%o3,%g4
+	or %o4,%g1,%o4
+	stx %o4,[%o0+8]
+	srlx %g2,%o5,%g1
+
+	add %o1,-32,%o1
+	or %g4,%g1,%g4
+	brnz,pt %o2,L(loop1)
+	stx %g4,[%o0+0]
+
+L(end):	sllx %g2,%o3,%g2
+	stx %g2,[%o0-8]
+	retl
+	mov %g5,%o0	! return the bits shifted out of the top limb
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/sparc64/mul_1.asm b/rts/gmp/mpn/sparc64/mul_1.asm
new file mode 100644
index 0000000000..f2f2821d51
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/mul_1.asm
@@ -0,0 +1,113 @@
+dnl SPARC 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and
+dnl store the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_mul_1)
+	save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc):	rd %pc,%o7
+	ld [%o7+L(noll)-L(pc)],%f10',
+`	sethi %hh(L(noll)),%g2
+	sethi %lm(L(noll)),%g1
+	or %g2,%hm(L(noll)),%g2
+	or %g1,%lo(L(noll)),%g1
+	sllx %g2,32,%g2
+	ld [%g1+%g2],%f10')
+
+	sub %i1,%i0,%g1	C limb distance between src and dst
+	srlx %g1,3,%g1
+	cmp %g1,%i2
+	bcc,pt %xcc,L(nooverlap)	C no copy needed if gap >= size
+	nop
+
+	sllx %i2,3,%g2	C compute stack allocation byte count
+	add %g2,15,%o0
+	and %o0,-16,%o0
+	sub %sp,%o0,%sp
+	add %sp,2223,%o0
+
+	mov %i1,%o1	C copy s1_ptr to mpn_copyi's srcp
+	call mpn_copyi
+	mov %i2,%o2	C copy n to mpn_copyi's count parameter
+
+	add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply with low 32 bits of s2_limb
+	mov %i0,%o0
+	mov %i1,%o1
+	add %i2,%i2,%o2
+	call mull
+	srl %i3,0,%o3
+
+	mov %o0,%l0	C keep carry-out from mull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+	srlx %i3,32,%o3
+	brz,a,pn %o3,L(small)
+	mov %o0,%i0
+	mov %i1,%o1
+	add %i2,%i2,%o2
+	call addmulu
+	add %i0,4,%o0
+
+	add %l0,%o0,%i0
+L(small):
+	ret
+	restore %g0,%g0,%g0
+EPILOGUE(mpn_mul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+	TEXT
+	ALIGN(4)
+L(noll):
+	.word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`E',`L($1)')	C first expansion: mull (low half of s2_limb)
+include_mpn(`sparc64/mul_1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')	C second expansion: addmulu (high half of s2_limb)
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/addmul1h.asm')
diff --git a/rts/gmp/mpn/sparc64/mul_1h.asm b/rts/gmp/mpn/sparc64/mul_1h.asm
new file mode 100644
index 0000000000..5078c01c3f
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/mul_1h.asm
@@ -0,0 +1,183 @@
+dnl SPARC 64-bit mull -- Helper for mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+mull:
+	save %sp,-256,%sp
+
+	sethi %hi(0xffff0000),%o0
+	andn %i3,%o0,%o0
+	st %o0,[%fp-17]
+	ld [%fp-17],%f11
+	fxtod %f10,%f6	C f6 = (double) low 16 bits of s2_limb
+
+	srl %i3,16,%o0
+	st %o0,[%fp-17]
+	ld [%fp-17],%f11
+	fxtod %f10,%f8	C f8 = (double) next 16 bits of s2_limb
+
+	mov 0,%g3	C cy = 0
+
+	ld [%i1+4],%f11
+	subcc %i2,1,%i2
+dnl	be,pn %icc,E(end1)
+	add %i1,4,%i1	C s1_ptr++
+
+	fxtod %f10,%f2
+	ld [%i1-4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	fmuld %f2,%f8,%f16
+	fmuld %f2,%f6,%f4
+	fdtox %f16,%f14
+	std %f14,[%fp-25]
+	fdtox %f4,%f12
+	subcc %i2,1,%i2
+	be,pn %icc,E(end2)
+	std %f12,[%fp-17]
+
+	fxtod %f10,%f2
+	ld [%i1+4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	fmuld %f2,%f8,%f16
+	fmuld %f2,%f6,%f4
+	fdtox %f16,%f14
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	subcc %i2,1,%i2
+dnl	be,pn %icc,E(end3)
+	std %f12,[%fp-33]
+
+	fxtod %f10,%f2
+	ld [%i1-4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	ldx [%fp-25],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-17],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-25]
+	fdtox %f4,%f12
+	add %i0,4,%i0	C res_ptr++
+	subcc %i2,1,%i2
+	be,pn %icc,E(end4)
+	std %f12,[%fp-17]
+
+	b,a E(loop)
+	nop	C nop is cheap to nullify
+
+	ALIGN(16)
+C BEGIN LOOP
+E(loop):
+	fxtod %f10,%f2
+	ld [%i1+4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	add %g3,%g1,%g4	C p += cy
+	srlx %g4,32,%g3
+	ldx [%fp-41],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-33],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	st %g4,[%i0-4+DLO]
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	std %f12,[%fp-33]
+	sub %i2,2,%i2
+	add %i0,4,%i0	C res_ptr++
+
+	fxtod %f10,%f2
+	ld [%i1-4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	add %g3,%g1,%g4	C p += cy
+	srlx %g4,32,%g3
+	ldx [%fp-25],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-17],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	st %g4,[%i0-4+DHI]
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-25]
+	fdtox %f4,%f12
+	std %f12,[%fp-17]
+	brnz,pt %i2,E(loop)
+	add %i0,4,%i0	C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+	fxtod %f10,%f2
+	add %g3,%g1,%g4	C p += cy
+	srlx %g4,32,%g3
+	ldx [%fp-41],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-33],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	st %g4,[%i0-4+DLO]
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	std %f12,[%fp-33]
+	add %i0,4,%i0	C res_ptr++
+
+	add %g3,%g1,%g4	C p += cy
+	srlx %g4,32,%g3
+	ldx [%fp-25],%g2	C p16
+	ldx [%fp-17],%g1	C p0
+	sllx %g2,16,%g2	C align p16
+	st %g4,[%i0-4+DHI]
+	b,a E(yyy)
+
+E(end2):
+	fxtod %f10,%f2
+	fmuld %f2,%f8,%f16
+	fmuld %f2,%f6,%f4
+	fdtox %f16,%f14
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	std %f12,[%fp-33]
+	ldx [%fp-25],%g2	C p16
+	ldx [%fp-17],%g1	C p0
+	sllx %g2,16,%g2	C align p16
+E(yyy):	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add %i0,4,%i0	C res_ptr++
+
+	add %g3,%g1,%g4	C p += cy
+	srlx %g4,32,%g3
+	ldx [%fp-41],%g2	C p16
+	ldx [%fp-33],%g1	C p0
+	sllx %g2,16,%g2	C align p16
+	st %g4,[%i0-4+DLO]
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add %i0,4,%i0	C res_ptr++
+
+	add %g3,%g1,%g4	C p += cy
+	st %g4,[%i0-4+DHI]
+	srlx %g4,32,%g4
+
+	ret
+	restore %g0,%g4,%o0	C sideeffect: put cy in retreg
+EPILOGUE(mull)
diff --git a/rts/gmp/mpn/sparc64/rshift.asm b/rts/gmp/mpn/sparc64/rshift.asm
new file mode 100644
index 0000000000..baf7920efb
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/rshift.asm
@@ -0,0 +1,94 @@
+! SPARC v9 __gmpn_rshift --
+
+! Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! src_ptr %o1
+! size %o2
+! cnt %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_rshift)
+	ldx [%o1],%g2	! load first limb
+	sub %g0,%o3,%o5	! negate shift count
+	add %o2,-1,%o2
+	and %o2,4-1,%g4	! number of limbs in first loop
+	sllx %g2,%o5,%g1	! compute function result
+	brz,pn %g4,L(0)	! if multiple of 4 limbs, skip first loop
+	mov %g1,%g5
+
+	sub %o2,%g4,%o2	! adjust count for main loop
+
+L(loop0):	! peel loop: one limb at a time, bottom up
+	ldx [%o1+8],%g3
+	add %o0,8,%o0
+	add %o1,8,%o1
+	add %g4,-1,%g4
+	srlx %g2,%o3,%o4
+	sllx %g3,%o5,%g1
+	mov %g3,%g2
+	or %o4,%g1,%o4
+	brnz,pt %g4,L(loop0)
+	stx %o4,[%o0-8]
+
+L(0):	brz,pn %o2,L(end)
+	nop
+
+L(loop1):	! unrolled main loop: 4 limbs per iteration
+	ldx [%o1+8],%g3
+	add %o0,32,%o0
+	add %o2,-4,%o2
+	srlx %g2,%o3,%o4
+	sllx %g3,%o5,%g1
+
+	ldx [%o1+16],%g2
+	srlx %g3,%o3,%g4
+	or %o4,%g1,%o4
+	stx %o4,[%o0-32]
+	sllx %g2,%o5,%g1
+
+	ldx [%o1+24],%g3
+	srlx %g2,%o3,%o4
+	or %g4,%g1,%g4
+	stx %g4,[%o0-24]
+	sllx %g3,%o5,%g1
+
+	ldx [%o1+32],%g2
+	srlx %g3,%o3,%g4
+	or %o4,%g1,%o4
+	stx %o4,[%o0-16]
+	sllx %g2,%o5,%g1
+
+	add %o1,32,%o1
+	or %g4,%g1,%g4
+	brnz %o2,L(loop1)
+	stx %g4,[%o0-8]
+
+L(end):	srlx %g2,%o3,%g2
+	stx %g2,[%o0-0]
+	retl
+	mov %g5,%o0	! return the bits shifted out of the low limb
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/sparc64/sub_n.asm b/rts/gmp/mpn/sparc64/sub_n.asm
new file mode 100644
index 0000000000..61547138e0
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/sub_n.asm
@@ -0,0 +1,172 @@
+! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! s1_ptr %o1
+! s2_ptr %o2
+! size %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_sub_n)
+
+! 12 mem ops >= 12 cycles
+! 8 shift insn >= 8 cycles
+! 8 addccc, executing alone, +8 cycles
+! Unrolling not mandatory...perhaps 2-way is best?
+! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
+! All in all, it runs at 5 cycles/limb
+
+	save %sp,-160,%sp
+
+	addcc %g0,%g0,%g0	! clear carry (no borrow in)
+
+	add %i3,-4,%i3
+	brlz,pn %i3,L(there)	! fewer than 4 limbs: tail loop only
+	nop
+
+	ldx [%i1+0],%l0
+	ldx [%i2+0],%l4
+	ldx [%i1+8],%l1
+	ldx [%i2+8],%l5
+	ldx [%i1+16],%l2
+	ldx [%i2+16],%l6
+	ldx [%i1+24],%l3
+	ldx [%i2+24],%l7
+	add %i1,32,%i1
+	add %i2,32,%i2
+
+	add %i3,-4,%i3
+	brlz,pn %i3,L(skip)
+	nop
+	b L(loop1)	! jump instead of executing many NOPs
+	nop
+	ALIGN(32)
+!--------- Start main loop ---------
+L(loop1):
+	subccc %l0,%l4,%g1
+!-
+	srlx %l0,32,%o0
+	ldx [%i1+0],%l0
+!-
+	srlx %l4,32,%o4
+	ldx [%i2+0],%l4
+!-
+	subccc %o0,%o4,%g0
+!-
+	subccc %l1,%l5,%g2
+!-
+	srlx %l1,32,%o1
+	ldx [%i1+8],%l1
+!-
+	srlx %l5,32,%o5
+	ldx [%i2+8],%l5
+!-
+	subccc %o1,%o5,%g0
+!-
+	subccc %l2,%l6,%g3
+!-
+	srlx %l2,32,%o2
+	ldx [%i1+16],%l2
+!-
+	srlx %l6,32,%g5	! asymmetry
+	ldx [%i2+16],%l6
+!-
+	subccc %o2,%g5,%g0
+!-
+	subccc %l3,%l7,%g4
+!-
+	srlx %l3,32,%o3
+	ldx [%i1+24],%l3
+	add %i1,32,%i1
+!-
+	srlx %l7,32,%o7
+	ldx [%i2+24],%l7
+	add %i2,32,%i2
+!-
+	subccc %o3,%o7,%g0
+!-
+	stx %g1,[%i0+0]
+!-
+	stx %g2,[%i0+8]
+!-
+	stx %g3,[%i0+16]
+	add %i3,-4,%i3
+!-
+	stx %g4,[%i0+24]
+	add %i0,32,%i0
+
+	brgez,pt %i3,L(loop1)
+	nop
+!--------- End main loop ---------
+L(skip):
+	subccc %l0,%l4,%g1
+	srlx %l0,32,%o0
+	srlx %l4,32,%o4
+	subccc %o0,%o4,%g0
+	subccc %l1,%l5,%g2
+	srlx %l1,32,%o1
+	srlx %l5,32,%o5
+	subccc %o1,%o5,%g0
+	subccc %l2,%l6,%g3
+	srlx %l2,32,%o2
+	srlx %l6,32,%g5	! asymmetry
+	subccc %o2,%g5,%g0
+	subccc %l3,%l7,%g4
+	srlx %l3,32,%o3
+	srlx %l7,32,%o7
+	subccc %o3,%o7,%g0
+	stx %g1,[%i0+0]
+	stx %g2,[%i0+8]
+	stx %g3,[%i0+16]
+	stx %g4,[%i0+24]
+	add %i0,32,%i0
+
+L(there):
+	add %i3,4,%i3
+	brz,pt %i3,L(end)
+	nop
+
+L(loop2):	! tail loop: one limb per iteration
+	ldx [%i1+0],%l0
+	add %i1,8,%i1
+	ldx [%i2+0],%l4
+	add %i2,8,%i2
+	srlx %l0,32,%g2
+	srlx %l4,32,%g3
+	subccc %l0,%l4,%g1
+	subccc %g2,%g3,%g0
+	stx %g1,[%i0+0]
+	add %i0,8,%i0
+	add %i3,-1,%i3
+	brgz,pt %i3,L(loop2)
+	nop
+
+L(end):	addc %g0,%g0,%i0	! return the carry (borrow) flag
+	ret
+	restore
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/sparc64/submul1h.asm b/rts/gmp/mpn/sparc64/submul1h.asm
new file mode 100644
index 0000000000..7f51ba59c6
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/submul1h.asm
@@ -0,0 +1,204 @@
+dnl SPARC 64-bit submull/submulu -- Helper for mpn_submul_1 and mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ifdef(`LOWPART',
+`submull:',
+`submulu:')
+	save %sp,-256,%sp
+
+	sethi %hi(0xffff0000),%o0
+	andn %i3,%o0,%o0
+	st %o0,[%fp-17]
+	ld [%fp-17],%f11
+	fxtod %f10,%f6	C f6 = (double) low 16 bits of s2_limb
+
+	srl %i3,16,%o0
+	st %o0,[%fp-17]
+	ld [%fp-17],%f11
+	fxtod %f10,%f8	C f8 = (double) next 16 bits of s2_limb
+
+	mov 0,%g3	C cy = 0
+
+	ld [%i1+4],%f11
+	subcc %i2,1,%i2
+dnl	be,pn %icc,E(end1)
+	add %i1,4,%i1	C s1_ptr++
+
+	fxtod %f10,%f2
+	ld [%i1-4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	fmuld %f2,%f8,%f16
+	fmuld %f2,%f6,%f4
+	fdtox %f16,%f14
+	std %f14,[%fp-25]
+	fdtox %f4,%f12
+	subcc %i2,1,%i2
+	be,pn %icc,E(end2)
+	std %f12,[%fp-17]
+
+	fxtod %f10,%f2
+	ld [%i1+4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	fmuld %f2,%f8,%f16
+	fmuld %f2,%f6,%f4
+	fdtox %f16,%f14
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	subcc %i2,1,%i2
+dnl	be,pn %icc,E(end3)
+	std %f12,[%fp-33]
+
+	fxtod %f10,%f2
+	ld [%i1-4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	ld [%i0+DLO],%g5
+	ldx [%fp-25],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-17],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-25]
+	fdtox %f4,%f12
+	add %i0,4,%i0	C res_ptr++
+	subcc %i2,1,%i2
+	be,pn %icc,E(end4)
+	std %f12,[%fp-17]
+
+	b,a E(loop)
+	nop	C nop is cheap to nullify
+
+	ALIGN(16)
+C BEGIN LOOP
+E(loop):
+	fxtod %f10,%f2
+	ld [%i1+4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	add %g3,%g1,%g4	C p += cy
+	subxcc %g5,%g4,%l2	C subtract p from *res_ptr (ADD2)
+	ld [%i0+DHI],%g5
+	srlx %g4,32,%g3
+	ldx [%fp-41],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-33],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	st %l2,[%i0-4+DLO]
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	std %f12,[%fp-33]
+	sub %i2,2,%i2
+	add %i0,4,%i0	C res_ptr++
+
+	fxtod %f10,%f2
+	ld [%i1-4],%f11
+	add %i1,4,%i1	C s1_ptr++
+	add %g3,%g1,%g4	C p += cy
+	subxcc %g5,%g4,%l2	C subtract p from *res_ptr (ADD2)
+	ld [%i0+DLO],%g5
+	srlx %g4,32,%g3
+	ldx [%fp-25],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-17],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	st %l2,[%i0-4+DHI]
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-25]
+	fdtox %f4,%f12
+	std %f12,[%fp-17]
+	brnz,pt %i2,E(loop)
+	add %i0,4,%i0	C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+	fxtod %f10,%f2
+	add %g3,%g1,%g4	C p += cy
+	subxcc %g5,%g4,%l2	C subtract p from *res_ptr (ADD2)
+	ld [%i0+DHI],%g5
+	srlx %g4,32,%g3
+	ldx [%fp-41],%g2	C p16
+	fmuld %f2,%f8,%f16
+	ldx [%fp-33],%g1	C p0
+	fmuld %f2,%f6,%f4
+	sllx %g2,16,%g2	C align p16
+	st %l2,[%i0-4+DLO]
+	fdtox %f16,%f14
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	std %f12,[%fp-33]
+	add %i0,4,%i0	C res_ptr++
+
+	add %g3,%g1,%g4	C p += cy
+	subxcc %g5,%g4,%l2	C subtract p from *res_ptr (ADD2)
+	ld [%i0+DLO],%g5
+	srlx %g4,32,%g3
+	ldx [%fp-25],%g2	C p16
+	ldx [%fp-17],%g1	C p0
+	sllx %g2,16,%g2	C align p16
+	st %l2,[%i0-4+DHI]
+	b,a E(yyy)
+
+E(end2):
+	fxtod %f10,%f2
+	fmuld %f2,%f8,%f16
+	fmuld %f2,%f6,%f4
+	fdtox %f16,%f14
+	std %f14,[%fp-41]
+	fdtox %f4,%f12
+	std %f12,[%fp-33]
+	ld [%i0+DLO],%g5
+	ldx [%fp-25],%g2	C p16
+	ldx [%fp-17],%g1	C p0
+	sllx %g2,16,%g2	C align p16
+E(yyy):	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add %i0,4,%i0	C res_ptr++
+
+	add %g3,%g1,%g4	C p += cy
+	subxcc %g5,%g4,%l2	C subtract p from *res_ptr (ADD2)
+ifdef(`LOWPART',
+`	ld [%i0+DHI],%g5')
+	srlx %g4,32,%g3
+	ldx [%fp-41],%g2	C p16
+	ldx [%fp-33],%g1	C p0
+	sllx %g2,16,%g2	C align p16
+	st %l2,[%i0-4+DLO]
+	add %g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add %i0,4,%i0	C res_ptr++
+
+	add %g3,%g1,%g4	C p += cy
+ifdef(`LOWPART',
+`	subxcc %g5,%g4,%l2')	C subtract p from *res_ptr (ADD2)
+ifdef(`LOWPART',
+`	st %l2,[%i0-4+DHI]
+	srlx %g4,32,%g4')
+
+	addx %g4,0,%g4	C fold borrow into returned carry
+	ret
+	restore %g0,%g4,%o0	C sideeffect: put cy in retreg
+ifdef(`LOWPART',
+`EPILOGUE(submull)',
+`EPILOGUE(submulu)')
diff --git a/rts/gmp/mpn/sparc64/submul_1.asm b/rts/gmp/mpn/sparc64/submul_1.asm
new file mode 100644
index 0000000000..7c6af0a98b
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/submul_1.asm
@@ -0,0 +1,114 @@
+dnl SPARC 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_submul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call submull
+ srl %i3,0,%o3
+
+ mov %o0,%l0 C keep carry-out from accmull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call submulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_submul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`LOWPART')
+define(`E',`L(l.$1)')
+include_mpn(`sparc64/submul1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/submul1h.asm')
diff --git a/rts/gmp/mpn/thumb/add_n.s b/rts/gmp/mpn/thumb/add_n.s
new file mode 100644
index 0000000000..c1eeb6ca87
--- /dev/null
+++ b/rts/gmp/mpn/thumb/add_n.s
@@ -0,0 +1,50 @@
+@ ARM/Thumb __gmpn_add -- Add two limb vectors of the same length > 0 and store
+@ sum in a third limb vector.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+
+@ INPUT PARAMETERS
+@ RES_ptr r0
+@ S1_ptr r1
+@ S2_ptr r2
+@ SIZE r3
+
+@ NOT TESTED CODE
+
+ .text
+ .thumb
+ .align 0
+ .global ___gmpn_add_n
+___gmpn_add_n:
+ push {r4, r5, r6, lr}
+ mov r6, #1 @ init carry save register
+
+Loop: sub r6, #1 @ restore carry (set iff r6 was 0)
+ ldmia r1!, {r4} @ load next limb from S1
+ ldmia r2!, {r5} @ load next limb from S2
+ adc r4, r5
+ stmia r0!, {r4} @ store result limb to RES
+ sbc r6, r6 @ save negated carry
+ sub r3, #1
+	bge	Loop			@ loop back while remaining count >= 0
+
+ mov r0, r6
+ pop {r4, r5, r6, pc}
diff --git a/rts/gmp/mpn/thumb/sub_n.s b/rts/gmp/mpn/thumb/sub_n.s
new file mode 100644
index 0000000000..53c292375f
--- /dev/null
+++ b/rts/gmp/mpn/thumb/sub_n.s
@@ -0,0 +1,50 @@
+@ ARM/Thumb __gmpn_sub -- Subtract two limb vectors of the same length > 0 and
+@ store difference in a third limb vector.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+
+@ INPUT PARAMETERS
+@ RES_ptr r0
+@ S1_ptr r1
+@ S2_ptr r2
+@ SIZE r3
+
+@ NOT TESTED CODE
+
+ .text
+ .thumb
+ .align 0
+ .global ___gmpn_sub_n
+___gmpn_sub_n:
+ push {r4, r5, r6, lr}
+ mov r6, #1 @ init carry save register
+
+Loop: sub r6, #1 @ restore carry (set iff r6 was 0)
+ ldmia r1!, {r4} @ load next limb from S1
+ ldmia r2!, {r5} @ load next limb from S2
+ sbc r4, r5
+ stmia r0!, {r4} @ store result limb to RES
+ sbc r6, r6 @ save negated carry
+ sub r3, #1
+	bge	Loop			@ loop back while remaining count >= 0
+
+ mov r0, r6
+ pop {r4, r5, r6, pc}
diff --git a/rts/gmp/mpn/underscore.h b/rts/gmp/mpn/underscore.h
new file mode 100644
index 0000000000..240dae0f63
--- /dev/null
+++ b/rts/gmp/mpn/underscore.h
@@ -0,0 +1,26 @@
+/*
+Copyright (C) 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#if __STDC__
+#define C_SYMBOL_NAME(name) _##name
+#else
+#define C_SYMBOL_NAME(name) _/**/name
+#endif
diff --git a/rts/gmp/mpn/vax/add_n.s b/rts/gmp/mpn/vax/add_n.s
new file mode 100644
index 0000000000..cf4060f521
--- /dev/null
+++ b/rts/gmp/mpn/vax/add_n.s
@@ -0,0 +1,61 @@
+# VAX __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# s2_ptr (sp + 12)
+# size (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ .word 0x0
+ movl 16(ap),r0
+ movl 12(ap),r1
+ movl 8(ap),r2
+ movl 4(ap),r3
+ mnegl r0,r5
+ addl2 $3,r0
+ ashl $-2,r0,r0 # unroll loop count
+ bicl2 $-4,r5 # mask out low 2 bits
+ movaq (r5)[r5],r5 # 9x
+ jmp Loop(r5)
+
+Loop: movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ sobgtr r0,Loop
+
+ adwc r0,r0
+ ret
diff --git a/rts/gmp/mpn/vax/addmul_1.s b/rts/gmp/mpn/vax/addmul_1.s
new file mode 100644
index 0000000000..379061dcb7
--- /dev/null
+++ b/rts/gmp/mpn/vax/addmul_1.s
@@ -0,0 +1,126 @@
+# VAX __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# size (sp + 12)
+# s2_limb (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ .word 0xfc0
+ movl 12(ap),r4
+ movl 8(ap),r8
+ movl 4(ap),r9
+ movl 16(ap),r6
+ jlss s2_big
+
+ clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L1
+ clrl r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl (r8)+,r1
+ jlss L1n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc $0,r3
+ addl2 r2,(r9)+
+ adwc $0,r3
+L1: movl (r8)+,r1
+ jlss L1n1
+L1p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc $0,r11
+ addl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+L1n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ addl2 r2,(r9)+
+ adwc $0,r3
+ movl (r8)+,r1
+ jgeq L1p1
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ addl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+
+s2_big: clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L2
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl (r8)+,r1
+ jlss L2n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r1,r3
+ addl2 r2,(r9)+
+ adwc $0,r3
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11
+ addl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ addl2 r2,(r9)+
+ adwc r1,r3
+ movl (r8)+,r1
+ jgeq L2p1
+L2n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ addl2 r10,(r9)+
+ adwc r1,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
diff --git a/rts/gmp/mpn/vax/lshift.s b/rts/gmp/mpn/vax/lshift.s
new file mode 100644
index 0000000000..fd311a9782
--- /dev/null
+++ b/rts/gmp/mpn/vax/lshift.s
@@ -0,0 +1,58 @@
+# VAX __gmpn_lshift -- left shift.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr (sp + 4)
+# sptr (sp + 8)
+# size (sp + 12)
+# cnt (sp + 16)
+# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers
+# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers
+
+.text
+ .align 1
+.globl ___gmpn_lshift
+___gmpn_lshift:
+ .word 0x1c0
+ movl 4(ap),r7
+ movl 8(ap),r6
+ movl 12(ap),r1
+ movl 16(ap),r8
+
+ moval (r6)[r1],r6
+ moval (r7)[r1],r7
+ clrl r3
+ movl -(r6),r2
+ ashq r8,r2,r4
+ movl r5,r0
+ movl r2,r3
+ decl r1
+ jeql Lend
+
+Loop: movl -(r6),r2
+ ashq r8,r2,r4
+ movl r5,-(r7)
+ movl r2,r3
+ jsobgtr r1,Loop
+
+Lend: movl r4,-4(r7)
+ ret
diff --git a/rts/gmp/mpn/vax/mul_1.s b/rts/gmp/mpn/vax/mul_1.s
new file mode 100644
index 0000000000..708e8ca6ca
--- /dev/null
+++ b/rts/gmp/mpn/vax/mul_1.s
@@ -0,0 +1,123 @@
+# VAX __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# size (sp + 12)
+# s2_limb (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ .word 0xfc0
+ movl 12(ap),r4
+ movl 8(ap),r8
+ movl 4(ap),r9
+ movl 16(ap),r6
+ jlss s2_big
+
+# One might want to combine the addl2 and the store below, but that
+# is actually just slower according to my timing tests. (VAX 3600)
+
+ clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L1
+ clrl r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl (r8)+,r1
+ jlss L1n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc $0,r3
+ movl r2,(r9)+
+L1: movl (r8)+,r1
+ jlss L1n1
+L1p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc $0,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+L1n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ movl r2,(r9)+
+ movl (r8)+,r1
+ jgeq L1p1
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+
+s2_big: clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L2
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl (r8)+,r1
+ jlss L2n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r1,r3
+ movl r2,(r9)+
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r1,r3
+ addl2 r11,r2
+ adwc r6,r3
+ movl r2,(r9)+
+ movl (r8)+,r1
+ jgeq L2p1
+L2n1: emul r1,r6,$0,r10
+ addl2 r1,r11
+ addl2 r3,r10
+ adwc r6,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
diff --git a/rts/gmp/mpn/vax/rshift.s b/rts/gmp/mpn/vax/rshift.s
new file mode 100644
index 0000000000..515813208d
--- /dev/null
+++ b/rts/gmp/mpn/vax/rshift.s
@@ -0,0 +1,56 @@
+# VAX __gmpn_rshift -- right shift.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr (sp + 4)
+# sptr (sp + 8)
+# size (sp + 12)
+# cnt (sp + 16)
+# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers
+# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers
+
+.text
+ .align 1
+.globl ___gmpn_rshift
+___gmpn_rshift:
+ .word 0x1c0
+ movl 4(ap),r7
+ movl 8(ap),r6
+ movl 12(ap),r1
+ movl 16(ap),r8
+
+ movl (r6)+,r2
+ subl3 r8,$32,r8
+ ashl r8,r2,r0
+ decl r1
+ jeql Lend
+
+Loop: movl (r6)+,r3
+ ashq r8,r2,r4
+ movl r5,(r7)+
+ movl r3,r2
+ jsobgtr r1,Loop
+
+Lend: clrl r3
+ ashq r8,r2,r4
+ movl r5,(r7)
+ ret
diff --git a/rts/gmp/mpn/vax/sub_n.s b/rts/gmp/mpn/vax/sub_n.s
new file mode 100644
index 0000000000..eff4b1c044
--- /dev/null
+++ b/rts/gmp/mpn/vax/sub_n.s
@@ -0,0 +1,61 @@
+# VAX __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+# difference in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# s2_ptr (sp + 12)
+# size (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ .word 0x0
+ movl 16(ap),r0
+ movl 12(ap),r1
+ movl 8(ap),r2
+ movl 4(ap),r3
+ mnegl r0,r5
+ addl2 $3,r0
+ ashl $-2,r0,r0 # unroll loop count
+ bicl2 $-4,r5 # mask out low 2 bits
+ movaq (r5)[r5],r5 # 9x
+ jmp Loop(r5)
+
+Loop: movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ sobgtr r0,Loop
+
+ adwc r0,r0
+ ret
diff --git a/rts/gmp/mpn/vax/submul_1.s b/rts/gmp/mpn/vax/submul_1.s
new file mode 100644
index 0000000000..be42286935
--- /dev/null
+++ b/rts/gmp/mpn/vax/submul_1.s
@@ -0,0 +1,126 @@
+# VAX __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# size (sp + 12)
+# s2_limb (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_submul_1
+___gmpn_submul_1:
+ .word 0xfc0
+ movl 12(ap),r4
+ movl 8(ap),r8
+ movl 4(ap),r9
+ movl 16(ap),r6
+ jlss s2_big
+
+ clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L1
+ clrl r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl (r8)+,r1
+ jlss L1n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc $0,r3
+ subl2 r2,(r9)+
+ adwc $0,r3
+L1: movl (r8)+,r1
+ jlss L1n1
+L1p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc $0,r11
+ subl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+L1n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ subl2 r2,(r9)+
+ adwc $0,r3
+ movl (r8)+,r1
+ jgeq L1p1
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ subl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+
+s2_big: clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L2
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl (r8)+,r1
+ jlss L2n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r1,r3
+ subl2 r2,(r9)+
+ adwc $0,r3
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11
+ subl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ subl2 r2,(r9)+
+ adwc r1,r3
+ movl (r8)+,r1
+ jgeq L2p1
+L2n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ subl2 r10,(r9)+
+ adwc r1,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
diff --git a/rts/gmp/mpn/x86/README b/rts/gmp/mpn/x86/README
new file mode 100644
index 0000000000..3507548b8c
--- /dev/null
+++ b/rts/gmp/mpn/x86/README
@@ -0,0 +1,40 @@
+
+ X86 MPN SUBROUTINES
+
+
+This directory contains mpn functions for various 80x86 chips.
+
+
+CODE ORGANIZATION
+
+ x86 i386, i486, generic
+ x86/pentium Intel Pentium (P5, P54)
+ x86/pentium/mmx Intel Pentium with MMX (P55)
+ x86/p6 Intel Pentium Pro
+ x86/p6/mmx Intel Pentium II, III
+ x86/p6/p3mmx Intel Pentium III
+ x86/k6 AMD K6, K6-2, K6-3
+ x86/k6/mmx
+ x86/k6/k62mmx AMD K6-2
+ x86/k7 AMD Athlon
+ x86/k7/mmx
+
+
+The x86 directory is also the main support for P6 at the moment, and
+is something of a blended style, meant to be reasonable on all x86s.
+
+
+
+STATUS
+
+The code is well-optimized for AMD and Intel chips, but not so well
+optimized for Cyrix chips.
+
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+For implementations with slow double shift instructions (SHLD and
+SHRD), it might be better to mimic their operation with SHL+SHR+OR.
+(M2 is likely to benefit from that, but not Pentium due to its slow
+plain SHL and SHR.)
diff --git a/rts/gmp/mpn/x86/README.family b/rts/gmp/mpn/x86/README.family
new file mode 100644
index 0000000000..3bc73f58b0
--- /dev/null
+++ b/rts/gmp/mpn/x86/README.family
@@ -0,0 +1,333 @@
+
+ X86 CPU FAMILY MPN SUBROUTINES
+
+
+This file has some notes on things common to all the x86 family code.
+
+
+
+ASM FILES
+
+The x86 .asm files are BSD style x86 assembler code, first put through m4
+for macro processing. The generic mpn/asm-defs.m4 is used, together with
+mpn/x86/x86-defs.m4. Detailed notes are in those files.
+
+The code is meant for use with GNU "gas" or a system "as". There's no
+support for assemblers that demand Intel style, and with gas freely
+available and easy to use that shouldn't be a problem.
+
+
+
+STACK FRAME
+
+m4 macros are used to define the parameters passed on the stack, and these
+act like comments on what the stack frame looks like too. For example,
+mpn_mul_1() has the following.
+
+ defframe(PARAM_MULTIPLIER, 16)
+ defframe(PARAM_SIZE, 12)
+ defframe(PARAM_SRC, 8)
+ defframe(PARAM_DST, 4)
+
+Here PARAM_MULTIPLIER gets defined as `FRAME+16(%esp)', and the others
+similarly. The return address is at offset 0, but there's not normally any
+need to access that.
+
+FRAME is redefined as necessary through the code so it's the number of bytes
+pushed on the stack, and hence the offsets in the parameter macros stay
+correct. At the start of a routine FRAME should be zero.
+
+ deflit(`FRAME',0)
+ ...
+ deflit(`FRAME',4)
+ ...
+ deflit(`FRAME',8)
+ ...
+
+Helper macros FRAME_pushl(), FRAME_popl(), FRAME_addl_esp() and
+FRAME_subl_esp() exist to adjust FRAME for the effect of those instructions,
+and can be used instead of explicit definitions if preferred.
+defframe_pushl() is a combination FRAME_pushl() and defframe().
+
+There's generally some slackness in redefining FRAME. If new values aren't
+going to get used, then the redefinitions are omitted to keep from
+cluttering up the code. This happens for instance at the end of a routine,
+where there might be just four register pops and then a ret, so FRAME isn't
+getting used.
+
+Local variables and saved registers can be similarly defined, with negative
+offsets representing stack space below the initial stack pointer. For
+example,
+
+ defframe(SAVE_ESI, -4)
+ defframe(SAVE_EDI, -8)
+ defframe(VAR_COUNTER,-12)
+
+ deflit(STACK_SPACE, 12)
+
+Here STACK_SPACE gets used in a "subl $STACK_SPACE, %esp" to allocate the
+space, and that instruction must be followed by a redefinition of FRAME
+(setting it equal to STACK_SPACE) to reflect the change in %esp.
+
+Definitions for pushed registers are only put in when they're going to be
+used. If registers are just saved and restored with pushes and pops then
+definitions aren't made.
+
+
+
+ASSEMBLER EXPRESSIONS
+
+Only addition and subtraction seem to be universally available, certainly
+that's all the Solaris 8 "as" seems to accept. If expressions are wanted
+then m4 eval() should be used.
+
+In particular note that a "/" anywhere in a line starts a comment in Solaris
+"as", and in some configurations of gas too.
+
+ addl $32/2, %eax <-- wrong
+
+ addl $eval(32/2), %eax <-- right
+
+Binutils gas/config/tc-i386.c has a choice between "/" being a comment
+anywhere in a line, or only at the start. FreeBSD patches 2.9.1 to select
+the latter, and as of 2.9.5 it's the default for GNU/Linux too.
+
+
+
+ASSEMBLER COMMENTS
+
+Solaris "as" doesn't support "#" commenting, using /* */ instead,
+unfortunately. For that reason "C" commenting is used (see asm-defs.m4) and
+the intermediate ".s" files have no comments.
+
+
+
+ZERO DISPLACEMENTS
+
+In a couple of places addressing modes like 0(%ebx) with a byte-sized zero
+displacement are wanted, rather than (%ebx) with no displacement. These are
+either for computed jumps or to get desirable code alignment. Explicit
+.byte sequences are used to ensure the assembler doesn't turn 0(%ebx) into
+(%ebx). The Zdisp() macro in x86-defs.m4 is used for this.
+
+Current gas 2.9.5 or recent 2.9.1 leave 0(%ebx) as written, but old gas
+1.92.3 changes it. In general changing would be the sort of "optimization"
+an assembler might perform, hence explicit ".byte"s are used where
+necessary.
+
+
+
+SHLD/SHRD INSTRUCTIONS
+
+The %cl count forms of double shift instructions like "shldl %cl,%eax,%ebx"
+must be written "shldl %eax,%ebx" for some assemblers. gas takes either,
+Solaris "as" doesn't allow %cl, gcc generates %cl for gas and NeXT (which is
+gas), and omits %cl elsewhere.
+
+For GMP an autoconf test is used to determine whether %cl should be used and
+the macros shldl, shrdl, shldw and shrdw in mpn/x86/x86-defs.m4 then pass
+through or omit %cl as necessary. See comments with those macros for usage.
+
+
+
+DIRECTION FLAG
+
+The x86 calling conventions say that the direction flag should be clear at
+function entry and exit. (See iBCS2 and SVR4 ABI books, references below.)
+
+Although this has been so since the year dot, it's not absolutely clear
+whether it's universally respected. Since it's better to be safe than
+sorry, gmp follows glibc and does a "cld" if it depends on the direction
+flag being clear. This happens only in a few places.
+
+
+
+POSITION INDEPENDENT CODE
+
+Defining the symbol PIC in m4 processing selects position independent code.
+This mainly affects computed jumps, and these are implemented in a
+self-contained fashion (without using the global offset table). The few
+calls from assembly code to global functions use the normal procedure
+linkage table.
+
+PIC is necessary for ELF shared libraries because they can be mapped into
+different processes at different virtual addresses. Text relocations in
+shared libraries are allowed, but that presumably means a page with such a
+relocation isn't shared. The use of the PLT for PIC adds a fixed cost to
+every function call, which is small but might be noticeable when working with
+small operands.
+
+Calls from one library function to another don't need to go through the PLT,
+since of course the call instruction uses a displacement, not an absolute
+address, and the relative locations of object files are known when libgmp.so
+is created. "ld -Bsymbolic" (or "gcc -Wl,-Bsymbolic") will resolve calls
+this way, so that there's no jump through the PLT, but of course leaving
+setups of the GOT address in %ebx that may be unnecessary.
+
+The %ebx setup could be avoided in assembly if a separate option controlled
+PIC for calls as opposed to computed jumps etc. But there's only ever
+likely to be a handful of calls out of assembler, and getting the same
+optimization for C intra-library calls would be more important. There seems
+no easy way to tell gcc that certain functions can be called non-PIC, and
+unfortunately many gmp functions use the global memory allocation variables,
+so they need the GOT anyway. Object files with no global data references
+and only intra-library calls could go into the library as non-PIC under
+-Bsymbolic. Integrating this into libtool and automake is left as an
+exercise for the reader.
+
+
+
+SIMPLE LOOPS
+
+The overheads in setting up for an unrolled loop can mean that at small
+sizes a simple loop is faster. Making small sizes go fast is important,
+even if it adds a cycle or two to bigger sizes. To this end various
+routines choose between a simple loop and an unrolled loop according to
+operand size. The path to the simple loop, or to special case code for
+small sizes, is always as fast as possible.
+
+Adding a simple loop requires a conditional jump to choose between the
+simple and unrolled code. The size of a branch misprediction penalty
+affects whether a simple loop is worthwhile.
+
+The convention is for an m4 definition UNROLL_THRESHOLD to set the crossover
+point, with sizes < UNROLL_THRESHOLD using the simple loop, sizes >=
+UNROLL_THRESHOLD using the unrolled loop. If position independent code adds
+a couple of cycles to an unrolled loop setup, the threshold will vary with
+PIC or non-PIC. Something like the following is typical.
+
+ ifdef(`PIC',`
+ deflit(UNROLL_THRESHOLD, 10)
+ ',`
+ deflit(UNROLL_THRESHOLD, 8)
+ ')
+
+There's no automated way to determine the threshold. Setting it to a small
+value and then to a big value makes it possible to measure the simple and
+unrolled loops each over a range of sizes, from which the crossover point
+can be determined.  Alternately, just adjust the threshold up or down until
+there are no more speedups.
+
+
+
+UNROLLED LOOP CODING
+
+The x86 addressing modes allow a byte displacement of -128 to +127, making
+it possible to access 256 bytes, which is 64 limbs, without adjusting
+pointer registers within the loop. Dword sized displacements can be used
+too, but they increase code size, and unrolling to 64 ought to be enough.
+
+When unrolling to the full 64 limbs/loop, the limb at the top of the loop
+will have a displacement of -128, so pointers have to have a corresponding
++128 added before entering the loop. When unrolling to 32 limbs/loop
+displacements 0 to 127 can be used with 0 at the top of the loop and no
+adjustment needed to the pointers.
+
+Where 64 limbs/loop is supported, the +128 adjustment is done only when 64
+limbs/loop is selected. Usually the gain in speed using 64 instead of 32 or
+16 is small, so support for 64 limbs/loop is generally only for comparison.
+
+
+
+COMPUTED JUMPS
+
+When working from least significant limb to most significant limb (most
+routines) the computed jump and pointer calculations in preparation for an
+unrolled loop are as follows.
+
+ S = operand size in limbs
+ N = number of limbs per loop (UNROLL_COUNT)
+ L = log2 of unrolling (UNROLL_LOG2)
+ M = mask for unrolling (UNROLL_MASK)
+ C = code bytes per limb in the loop
+ B = bytes per limb (4 for x86)
+
+ computed jump (-S & M) * C + entrypoint
+ subtract from pointers (-S & M) * B
+ initial loop counter (S-1) >> L
+ displacements 0 to B*(N-1)
+
+The loop counter is decremented at the end of each loop, and the looping
+stops when the decrement takes the counter to -1. The displacements are for
+the addressing used to access each limb, eg. a load with "movl disp(%ebx), %eax".
+
+Usually the multiply by "C" can be handled without an imul, using instead an
+leal, or a shift and subtract.
+
+When working from most significant to least significant limb (eg. mpn_lshift
+and mpn_copyd), the calculations change as follows.
+
+ add to pointers (-S & M) * B
+ displacements 0 to -B*(N-1)
+
+
+
+OLD GAS 1.92.3
+
+This version comes with FreeBSD 2.2.8 and has a couple of gremlins that
+affect gmp code.
+
+Firstly, an expression involving two forward references to labels comes out
+as zero. For example,
+
+ addl $bar-foo, %eax
+ foo:
+ nop
+ bar:
+
+This should lead to "addl $1, %eax", but it comes out as "addl $0, %eax".
+When only one forward reference is involved, it works correctly, as for
+example,
+
+ foo:
+ addl $bar-foo, %eax
+ nop
+ bar:
+
+Secondly, an expression involving two labels can't be used as the
+displacement for an leal. For example,
+
+ foo:
+ nop
+ bar:
+ leal bar-foo(%eax,%ebx,8), %ecx
+
+A slightly cryptic error is given, "Unimplemented segment type 0 in
+parse_operand". When only one label is used it's ok, and the label can be a
+forward reference too, as for example,
+
+ leal foo(%eax,%ebx,8), %ecx
+ nop
+ foo:
+
+These problems only affect PIC computed jump calculations. The workarounds
+are just to do an leal without a displacement and then an addl, and to make
+sure the code is placed so that there's at most one forward reference in the
+addl.
+
+
+
+REFERENCES
+
+"Intel Architecture Software Developer's Manual", volumes 1 to 3, 1999,
+order numbers 243190, 243191 and 243192. Available on-line,
+
+ ftp://download.intel.com/design/PentiumII/manuals/243190.htm
+ ftp://download.intel.com/design/PentiumII/manuals/243191.htm
+ ftp://download.intel.com/design/PentiumII/manuals/243192.htm
+
+"Intel386 Family Binary Compatibility Specification 2", Intel Corporation,
+published by McGraw-Hill, 1991, ISBN 0-07-031219-2.
+
+"System V Application Binary Interface", Unix System Laboratories Inc, 1992,
+published by Prentice Hall, ISBN 0-13-880410-9. And the "Intel386 Processor
+Supplement", AT&T, 1991, ISBN 0-13-877689-X. (These have details of ELF
+shared library PIC coding.)
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/addsub_n.S b/rts/gmp/mpn/x86/addsub_n.S
new file mode 100644
index 0000000000..fe6f648f53
--- /dev/null
+++ b/rts/gmp/mpn/x86/addsub_n.S
@@ -0,0 +1,174 @@
+/* Currently not working and not used. */
+
+/*
+Copyright (C) 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+
+#define SAVE_BORROW_RESTORE_CARRY(r)	adcl r,r; shll $31,r	/* NOTE(review): only referenced from commented-out lines in the loops below */
+#define SAVE_CARRY_RESTORE_BORROW(r)	adcl r,r		/* NOTE(review): likewise unused in the active code paths */
+
+ .globl mpn_addsub_n_0
+ .globl mpn_addsub_n_1
+
+/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s2,r2==s1.
+ We let subtraction and addition alternate in being two limbs
+ ahead of the other, thereby avoiding some SAVE_RESTORE. */
+// r1 = r2 + r1 edi = esi + edi
+// r2 = r2 - r1 esi = esi - edi
+// s1 s2
+// r2 r1
+// eax,ebx,ecx,edx,esi,edi,ebp
+mpn_addsub_n_0:	// r1 = r2 + r1, r2 = r2 - r1 (see header); file marked not working, not used
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	20(%esp),%edi		/* res_ptr */
+	movl	24(%esp),%esi		/* s1_ptr */
+	movl	36(%esp),%ebp		/* size */	// NOTE(review): offset 36 implies a 5th stack arg - confirm intended signature
+
+	shrl	$2,%ebp			// size/4: loop handles 4 limbs per pass
+	xorl	%edx,%edx		// clear %edx and the carry flag
+	.align	4
+Loop0:	// L=load E=execute S=store
+	movl	(%esi),%ebx		// sub 0 L
+	movl	4(%esi),%ecx		// sub 1 L
+	sbbl	(%edi),%ebx		// sub 0 LE
+	sbbl	4(%edi),%ecx		// sub 1 LE
+//	SAVE_BORROW_RESTORE_CARRY(%edx)
+	movl	(%esi),%eax		// add 0 L
+	adcl	%eax,(%edi)		// add 0 LES
+	movl	4(%esi),%eax		// add 1 L
+	adcl	%eax,4(%edi)		// add 1 LES
+	movl	%ebx,(%esi)		// sub 0 S
+	movl	%ecx,4(%esi)		// sub 1 S
+	movl	8(%esi),%ebx		// add 2 L
+	adcl	8(%edi),%ebx		// add 2 LE
+	movl	12(%esi),%ecx		// add 3 L
+	adcl	12(%edi),%ecx		// add 3 LE
+//	SAVE_CARRY_RESTORE_BORROW(%edx)
+	movl	8(%edi),%eax		// sub 2 L
+	sbbl	%eax,8(%esi)		// sub 2 LES
+	movl	12(%edi),%eax		// sub 3 L
+	sbbl	%eax,12(%esi)		// sub 3 LES
+	movl	%ebx,8(%edi)		// add 2 S
+	movl	%ecx,12(%edi)		// add 3 S
+	leal	16(%esi),%esi		// advance pointers without touching the flags
+	leal	16(%edi),%edi
+	decl	%ebp
+	jnz	Loop0
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s1,r2==s2.
+ We let subtraction and addition alternate in being two limbs
+ ahead of the other, thereby avoiding some SAVE_RESTORE. */
+// r1 = r1 + r2 edi = edi + esi
+// r2 = r1 - r2 esi = edi - esi
+// s2 s1
+// r2 r1
+// eax,ebx,ecx,edx,esi,edi,ebp
+mpn_addsub_n_1:	// r1 = r1 + r2, r2 = r1 - r2 (see header); file marked not working, not used
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	20(%esp),%edi		/* res_ptr */
+	movl	24(%esp),%esi		/* s1_ptr */
+	movl	36(%esp),%ebp		/* size */	// NOTE(review): offset 36 implies a 5th stack arg - confirm intended signature
+
+	shrl	$2,%ebp			// size/4: loop handles 4 limbs per pass
+	xorl	%edx,%edx		// clear %edx and the carry flag
+	.align	4
+Loop1:	// L=load E=execute S=store
+	movl	(%edi),%ebx		// sub 0 L
+	sbbl	(%esi),%ebx		// sub 0 LE
+	movl	4(%edi),%ecx		// sub 1 L
+	sbbl	4(%esi),%ecx		// sub 1 LE
+//	SAVE_BORROW_RESTORE_CARRY(%edx)
+	movl	(%esi),%eax		// add 0 L
+	adcl	%eax,(%edi)		// add 0 LES
+	movl	4(%esi),%eax		// add 1 L
+	adcl	%eax,4(%edi)		// add 1 LES
+	movl	%ebx,(%esi)		// sub 0 S
+	movl	%ecx,4(%esi)		// sub 1 S
+	movl	8(%esi),%ebx		// add 2 L
+	adcl	8(%edi),%ebx		// add 2 LE
+	movl	12(%esi),%ecx		// add 3 L
+	adcl	12(%edi),%ecx		// add 3 LE
+//	SAVE_CARRY_RESTORE_BORROW(%edx)
+	movl	8(%edi),%eax		// sub 2 L
+	sbbl	8(%esi),%eax		// sub 2 LES
+	movl	%eax,8(%esi)		// sub 2 S
+	movl	12(%edi),%eax		// sub 3 L
+	sbbl	12(%esi),%eax		// sub 3 LE
+	movl	%eax,12(%esi)		// sub 3 S
+	movl	%ebx,8(%edi)		// add 2 S
+	movl	%ecx,12(%edi)		// add 3 S
+	leal	16(%esi),%esi		// advance pointers without touching the flags
+	leal	16(%edi),%edi
+	decl	%ebp
+	jnz	Loop1
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+	.globl	mpn_copy
+mpn_copy:	// copy size limbs from s1_ptr to res_ptr, 4 limbs per pass
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	20(%esp),%edi		/* res_ptr */
+	movl	24(%esp),%esi		/* s1_ptr */
+	movl	28(%esp),%ebp		/* size */
+
+	shrl	$2,%ebp			// NOTE(review): size%4 tail limbs are never copied, and size<4 makes the decl/jnz counter wrap - confirm callers (file marked unused)
+	.align	4
+Loop2:
+	movl	(%esi),%eax
+	movl	4(%esi),%ebx
+	movl	%eax,(%edi)
+	movl	%ebx,4(%edi)
+	movl	8(%esi),%eax
+	movl	12(%esi),%ebx
+	movl	%eax,8(%edi)
+	movl	%ebx,12(%edi)
+	leal	16(%esi),%esi
+	leal	16(%edi),%edi
+	decl	%ebp
+	jnz	Loop2
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
diff --git a/rts/gmp/mpn/x86/aors_n.asm b/rts/gmp/mpn/x86/aors_n.asm
new file mode 100644
index 0000000000..18ef816b4d
--- /dev/null
+++ b/rts/gmp/mpn/x86/aors_n.asm
@@ -0,0 +1,187 @@
+dnl x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_add_n',`
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(M4_function_nc)	C carry-in variant, jumps into the unrolled loop of M4_function_n
+deflit(`FRAME',0)
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%edx
+	movl	PARAM_SIZE,%ecx
+
+	movl	%ecx,%eax	C keep size for the modulo-8 loop entry calculation
+	shrl	$3,%ecx			C compute count for unrolled loop
+	negl	%eax
+	andl	$7,%eax			C get index where to start loop
+	jz	LF(M4_function_n,oopgo)	C necessary special case for 0
+	incl	%ecx			C adjust loop count
+	shll	$2,%eax			C adjustment for pointers...
+	subl	%eax,%edi		C ... since they are offset ...
+	subl	%eax,%esi		C ... by a constant when we ...
+	subl	%eax,%edx		C ... enter the loop
+	shrl	$2,%eax			C restore previous value
+
+ifdef(`PIC',`
+	C Calculate start address in loop for PIC.  Due to limitations in
+	C old gas, LF(M4_function_n,oop)-L(0a)-3 cannot be put into the leal
+	call	L(0a)
+L(0a):	leal	(%eax,%eax,8),%eax	C index*9, presumably 9 code bytes per limb in the loop
+	addl	(%esp),%eax
+	addl	$LF(M4_function_n,oop)-L(0a)-3,%eax
+	addl	$4,%esp
+',`
+	C Calculate start address in loop for non-PIC.
+ 	leal	LF(M4_function_n,oop)-3(%eax,%eax,8),%eax
+')
+
+	C These lines initialize carry from the 5th parameter.  Should be
+	C possible to simplify.
+	pushl	%ebp		FRAME_pushl()
+	movl	PARAM_CARRY,%ebp
+	shrl	$1,%ebp			C shift bit 0 into carry
+	popl	%ebp		FRAME_popl()
+
+	jmp	*%eax			C jump into loop
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(M4_function_n)	C dst,size = src1,size +/- src2,size; returns carry/borrow out
+deflit(`FRAME',0)
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%edx
+	movl	PARAM_SIZE,%ecx
+
+	movl	%ecx,%eax	C keep size for the modulo-8 loop entry calculation
+	shrl	$3,%ecx			C compute count for unrolled loop
+	negl	%eax
+	andl	$7,%eax			C get index where to start loop
+	jz	L(oop)			C necessary special case for 0
+	incl	%ecx			C adjust loop count
+	shll	$2,%eax			C adjustment for pointers...
+	subl	%eax,%edi		C ... since they are offset ...
+	subl	%eax,%esi		C ... by a constant when we ...
+	subl	%eax,%edx		C ... enter the loop
+	shrl	$2,%eax			C restore previous value
+
+ifdef(`PIC',`
+	C Calculate start address in loop for PIC.  Due to limitations in
+	C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal
+	call	L(0b)
+L(0b):	leal	(%eax,%eax,8),%eax	C index*9, presumably 9 code bytes per limb in the loop
+	addl	(%esp),%eax
+	addl	$L(oop)-L(0b)-3,%eax
+	addl	$4,%esp
+',`
+	C Calculate start address in loop for non-PIC.
+	leal	L(oop)-3(%eax,%eax,8),%eax
+')
+	jmp	*%eax			C jump into loop
+
+L(oopgo):	C entry from M4_function_nc when size is a multiple of 8
+	pushl	%ebp		FRAME_pushl()
+	movl	PARAM_CARRY,%ebp
+	shrl	$1,%ebp			C shift bit 0 into carry
+	popl	%ebp		FRAME_popl()
+
+	ALIGN(8)
+L(oop):	movl	(%esi),%eax	C unrolled 8 limbs; M4_inst is adcl or sbbl
+	M4_inst	(%edx),%eax
+	movl	%eax,(%edi)
+	movl	4(%esi),%eax
+	M4_inst	4(%edx),%eax
+	movl	%eax,4(%edi)
+	movl	8(%esi),%eax
+	M4_inst	8(%edx),%eax
+	movl	%eax,8(%edi)
+	movl	12(%esi),%eax
+	M4_inst	12(%edx),%eax
+	movl	%eax,12(%edi)
+	movl	16(%esi),%eax
+	M4_inst	16(%edx),%eax
+	movl	%eax,16(%edi)
+	movl	20(%esi),%eax
+	M4_inst	20(%edx),%eax
+	movl	%eax,20(%edi)
+	movl	24(%esi),%eax
+	M4_inst	24(%edx),%eax
+	movl	%eax,24(%edi)
+	movl	28(%esi),%eax
+	M4_inst	28(%edx),%eax
+	movl	%eax,28(%edi)
+	leal	32(%edi),%edi	C leal advances pointers without touching the carry flag
+	leal	32(%esi),%esi
+	leal	32(%edx),%edx
+	decl	%ecx
+	jnz	L(oop)
+
+	sbbl	%eax,%eax	C eax = -1 if final carry/borrow set, else 0
+	negl	%eax		C return value 0 or 1
+
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/aorsmul_1.asm b/rts/gmp/mpn/x86/aorsmul_1.asm
new file mode 100644
index 0000000000..f32ad83989
--- /dev/null
+++ b/rts/gmp/mpn/x86/aorsmul_1.asm
@@ -0,0 +1,134 @@
+dnl x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a
+dnl limb and add the result to a second limb vector.
+
+
+dnl Copyright (C) 1992, 1994, 1997, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_addmul_1',`
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+
+',`ifdef(`OPERATION_submul_1',`
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+
+define(PARAM_MULTIPLIER, `FRAME+16(%esp)')
+define(PARAM_SIZE, `FRAME+12(%esp)')
+define(PARAM_SRC, `FRAME+8(%esp)')
+define(PARAM_DST, `FRAME+4(%esp)')
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(M4_function_1)	C dst,size +/-= src,size * mult; returns the carry limb
+deflit(`FRAME',0)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ecx
+
+	xorl	%ebx,%ebx	C ebx is the carry limb, initially 0
+	andl	$3,%ecx		C leading size%4 limbs are done one at a time
+	jz	L(end0)
+
+L(oop0):
+	movl	(%esi),%eax
+	mull	PARAM_MULTIPLIER	C edx:eax = src limb * mult
+	leal	4(%esi),%esi
+	addl	%ebx,%eax
+	movl	$0,%ebx
+	adcl	%ebx,%edx
+	M4_inst	%eax,(%edi)	C addl or subl into dst
+	adcl	%edx,%ebx	C propagate carry into cylimb
+
+	leal	4(%edi),%edi
+	decl	%ecx
+	jnz	L(oop0)
+
+L(end0):
+	movl	PARAM_SIZE,%ecx
+	shrl	$2,%ecx		C remaining size/4 groups of four limbs
+	jz	L(end)
+
+	ALIGN(8)
+L(oop):	movl	(%esi),%eax	C carry alternates between ebx and ebp below
+	mull	PARAM_MULTIPLIER
+	addl	%eax,%ebx
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	4(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	M4_inst	%ebx,(%edi)
+	adcl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	movl	8(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	M4_inst	%ebp,4(%edi)
+	adcl	%eax,%ebx	C new lo + cylimb
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	12(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	M4_inst	%ebx,8(%edi)
+	adcl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	M4_inst	%ebp,12(%edi)
+	adcl	$0,%ebx		C propagate carry into cylimb
+
+	leal	16(%esi),%esi
+	leal	16(%edi),%edi
+	decl	%ecx
+	jnz	L(oop)
+
+L(end):	movl	%ebx,%eax	C return the final carry limb
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/copyd.asm b/rts/gmp/mpn/x86/copyd.asm
new file mode 100644
index 0000000000..439640e836
--- /dev/null
+++ b/rts/gmp/mpn/x86/copyd.asm
@@ -0,0 +1,80 @@
+dnl x86 mpn_copyd -- copy limb vector, decrementing.
+dnl
+dnl Future: On P6 an MMX loop should be able to go faster than this code.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from high to low addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+C
+C P5 - 1.0 cycles/limb.
+C
+C P6 - 2.4 cycles/limb, approx 40 cycles startup.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_copyd)	C copy src,size to dst,size from high to low addresses
+	C eax	saved esi
+	C ebx
+	C ecx	counter
+	C edx	saved edi
+	C esi	src
+	C edi	dst
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	movl	%esi, %eax	C esi/edi are callee-saved, keep them in eax/edx
+
+	movl	PARAM_SRC, %esi
+	movl	%edi, %edx
+
+	movl	PARAM_DST, %edi
+	leal	-4(%esi,%ecx,4), %esi	C point at the last limb of src
+
+	leal	-4(%edi,%ecx,4), %edi	C point at the last limb of dst
+
+	std	C direction flag set: movsl decrements esi/edi
+
+	rep	C size=0 gives zero iterations of movsl
+	movsl
+
+	cld	C restore the forward direction the ABI expects
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/copyi.asm b/rts/gmp/mpn/x86/copyi.asm
new file mode 100644
index 0000000000..5bc4e36689
--- /dev/null
+++ b/rts/gmp/mpn/x86/copyi.asm
@@ -0,0 +1,79 @@
+dnl x86 mpn_copyi -- copy limb vector, incrementing.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from low to high addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+C
+C P5 - 1.0 cycles/limb.
+C
+C P6 - 0.75 cycles/limb. An MMX based copy was tried, but was found to be
+C slower than a rep movs in all cases. The fastest MMX found was 0.8
+C cycles/limb (when fully aligned). A rep movs seems to have a startup
+C time of about 15 cycles, but doing something special for small sizes
+C could lead to a branch misprediction that would destroy any saving.
+C For now a plain rep movs seems ok for P6.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+ C eax saved esi
+ C ebx
+ C ecx counter
+ C edx saved edi
+ C esi src
+ C edi dst
+ C ebp
+
+PROLOGUE(mpn_copyi)	C copy src,size to dst,size from low to high addresses
+
+	movl	PARAM_SIZE, %ecx
+	movl	%esi, %eax	C esi/edi are callee-saved, keep them in eax/edx
+
+	movl	PARAM_SRC, %esi
+	movl	%edi, %edx
+
+	movl	PARAM_DST, %edi
+
+	cld	C better safe than sorry, see mpn/x86/README.family
+
+	rep	C size=0 gives zero iterations of movsl
+	movsl
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/diveby3.asm b/rts/gmp/mpn/x86/diveby3.asm
new file mode 100644
index 0000000000..df879da9e1
--- /dev/null
+++ b/rts/gmp/mpn/x86/diveby3.asm
@@ -0,0 +1,115 @@
+dnl x86 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl The following all have their own optimized versions of this routine,
+dnl but for reference the code here runs as follows.
+dnl
+dnl cycles/limb
+dnl P54 18.0
+dnl P55 17.0
+dnl P6 14.5
+dnl K6 14.0
+dnl K7 10.0
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+dnl ceil(b/3) and ceil(b*2/3) where b=2^32
+deflit(ONE_THIRD_CEIL, 0x55555556)
+deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_divexact_by3c)	C dst,size = src,size / 3 via multiply by inverse; returns carry
+deflit(`FRAME',0)
+
+	movl	PARAM_SRC, %ecx
+	pushl	%ebp	FRAME_pushl()
+
+	movl	PARAM_SIZE, %ebp
+	pushl	%edi	FRAME_pushl()
+
+	movl	PARAM_DST, %edi
+	pushl	%esi	FRAME_pushl()
+
+	movl	$INVERSE_3, %esi	C 1/3 mod 2^32
+	pushl	%ebx	FRAME_pushl()
+
+	leal	(%ecx,%ebp,4), %ecx	C src end pointer
+	movl	PARAM_CARRY, %ebx
+
+	leal	(%edi,%ebp,4), %edi	C dst end pointer
+	negl	%ebp			C count up from -size to 0
+
+
+	ALIGN(8)
+L(top):
+	C eax	scratch, low product
+	C ebx	carry limb (0 to 3)
+	C ecx	&src[size]
+	C edx	scratch, high product
+	C esi	multiplier
+	C edi	&dst[size]
+	C ebp	counter, limbs, negative
+
+	movl	(%ecx,%ebp,4), %eax
+
+	subl	%ebx, %eax	C subtract the carry from the previous limb
+
+	setc	%bl		C borrow becomes the new base carry
+
+	imull	%esi		C low half eax is the quotient limb
+
+	cmpl	$ONE_THIRD_CEIL, %eax
+	movl	%eax, (%edi,%ebp,4)
+
+	sbbl	$-1, %ebx	C +1 if eax>=ceil(b/3)
+	cmpl	$TWO_THIRDS_CEIL, %eax
+
+	sbbl	$-1, %ebx	C +1 if eax>=ceil(b*2/3)
+	incl	%ebp
+
+	jnz	L(top)
+
+
+	movl	%ebx, %eax	C return the final carry
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/divrem_1.asm b/rts/gmp/mpn/x86/divrem_1.asm
new file mode 100644
index 0000000000..12f14676d6
--- /dev/null
+++ b/rts/gmp/mpn/x86/divrem_1.asm
@@ -0,0 +1,232 @@
+dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl cycles/limb
+dnl K6 20
+dnl P5 44
+dnl P6 39
+dnl 486 approx 43 maybe
+dnl
+dnl
+dnl The following have their own optimized divrem_1 implementations, but
+dnl for reference the code here runs as follows.
+dnl
+dnl cycles/limb
+dnl P6MMX 39
+dnl K7 42
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C
+C Divide src,size by divisor and store the quotient in dst+xsize,size.
+C Extend the division to fractional quotient limbs in dst,xsize. Return the
+C remainder. Either or both xsize and size can be 0.
+C
+C mpn_divrem_1c takes a carry parameter which is an initial high limb,
+C effectively one extra limb at the top of src,size. Must have
+C carry<divisor.
+C
+C
+C Essentially the code is the same as the division based part of
+C mpn/generic/divrem_1.c, but has the following advantages.
+C
+C - If gcc isn't being used then divrem_1.c will get the generic C
+C udiv_qrnnd() and be rather slow.
+C
+C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't
+C generate that instruction (as of gcc 2.95.2 at least).
+C
+C A test is done to see if the high limb is less than the divisor, and if so
+C one less div is done. A div is between 20 and 40 cycles on the various
+C x86s, so assuming high<divisor about half the time, then this test saves
+C half that amount. The branch misprediction penalty on each chip is less
+C than half a div.
+C
+C
+C K6: Back-to-back div instructions run at 20 cycles, the same as the loop
+C here, so it seems there's nothing to gain by rearranging the loop.
+C Pairing the mov and loop instructions was found to gain nothing. (The
+C same is true of the mpn/x86/mod_1.asm loop.)
+C
+C With a "decl/jnz" rather than a "loop" this code runs at 22 cycles.
+C The loop_or_decljnz macro is an easy way to get a 10% speedup.
+C
+C The fast K6 multiply might be thought to suit a multiply-by-inverse,
+C     but that algorithm has been found to suffer from the relatively poor
+C carry handling on K6 and too many auxiliary instructions. The
+C fractional part however could be done at about 13 c/l.
+C
+C P5: Moving the load down to pair with the store might save 1 cycle, but
+C that doesn't seem worth bothering with, since it'd be only a 2.2%
+C saving.
+C
+C Again here the auxiliary instructions hinder a multiply-by-inverse,
+C though there might be a 10-15% speedup available
+
+
+defframe(PARAM_CARRY, 24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)	C carry-in entry: initial remainder in edx, joins mpn_divrem_1
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %edi
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %esi
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_DST, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_XSIZE, %ebp
+	orl	%ecx, %ecx	C test size==0
+
+	movl	PARAM_CARRY, %edx	C carry must be < divisor (see header comment)
+	jz	LF(mpn_divrem_1,fraction)
+
+	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
+	jmp	LF(mpn_divrem_1,integer_top)
+
+EPILOGUE()
+
+
+PROLOGUE(mpn_divrem_1)	C quotient to dst+xsize,size and dst,xsize; returns remainder
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %edi
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %esi
+	orl	%ecx,%ecx	C test size==0
+
+	jz	L(size_zero)
+	pushl	%ebx		FRAME_pushl()
+
+	movl	-4(%edi,%ecx,4), %eax	C src high limb
+	xorl	%edx, %edx	C initial remainder is 0
+
+	movl	PARAM_DST, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_XSIZE, %ebp
+	cmpl	%esi, %eax
+
+	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
+	jae	L(integer_entry)
+
+
+	C high<divisor, so high of dst is zero, and avoid one div
+
+	movl	%edx, (%ebx,%ecx,4)
+	decl	%ecx
+
+	movl	%eax, %edx	C high src limb becomes the remainder so far
+	jz	L(fraction)
+
+
+L(integer_top):
+	C eax	scratch (quotient)
+	C ebx	dst+4*xsize-4
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	divisor
+	C edi	src
+	C ebp	xsize
+
+	movl	-4(%edi,%ecx,4), %eax
+L(integer_entry):
+
+	divl	%esi		C edx:eax / divisor -> quotient eax, remainder edx
+
+	movl	%eax, (%ebx,%ecx,4)
+	loop_or_decljnz	L(integer_top)
+
+
+L(fraction):
+	orl	%ebp, %ecx	C counter = xsize, also testing xsize==0
+	jz	L(done)
+
+	movl	PARAM_DST, %ebx
+
+
+L(fraction_top):
+	C eax	scratch (quotient)
+	C ebx	dst
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	divisor
+	C edi
+	C ebp
+
+	xorl	%eax, %eax	C fractional step divides remainder*b by divisor
+
+	divl	%esi
+
+	movl	%eax, -4(%ebx,%ecx,4)
+	loop_or_decljnz	L(fraction_top)
+
+
+L(done):
+	popl	%ebp
+	movl	%edx, %eax	C return the remainder
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+L(size_zero):	C size==0: zero fill the xsize fraction limbs, remainder 0
+deflit(`FRAME',8)
+	movl	PARAM_XSIZE, %ecx
+	xorl	%eax, %eax
+
+	movl	PARAM_DST, %edi
+
+	cld	C better safe than sorry, see mpn/x86/README.family
+
+	rep
+	stosl
+
+	popl	%esi
+	popl	%edi
+	ret
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/README b/rts/gmp/mpn/x86/k6/README
new file mode 100644
index 0000000000..3ad96c8b89
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/README
@@ -0,0 +1,237 @@
+
+ AMD K6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and
+K6-3.
+
+The mmx and k62mmx subdirectories have routines using MMX instructions. All
+K6s have MMX, the separate directories are just so that ./configure can omit
+them if the assembler doesn't support MMX.
+
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+
+ cycles/limb
+
+ mpn_add_n/sub_n 3.25 normal, 2.75 in-place
+
+ mpn_mul_1 6.25
+ mpn_add/submul_1 7.65-8.4 (varying with data values)
+
+ mpn_mul_basecase 9.25 cycles/crossproduct (approx)
+ mpn_sqr_basecase 4.7 cycles/crossproduct (approx)
+ or 9.2 cycles/triangleproduct (approx)
+
+ mpn_divrem_1 20.0
+ mpn_mod_1 20.0
+ mpn_divexact_by3 11.0
+
+ mpn_l/rshift 3.0
+
+ mpn_copyi/copyd 1.0
+
+ mpn_com_n 1.5-1.85 \
+ mpn_and/andn/ior/xor_n 1.5-1.75 | varying with
+ mpn_iorn/xnor_n 2.0-2.25 | data alignment
+ mpn_nand/nior_n 2.0-2.25 /
+
+ mpn_popcount 12.5
+ mpn_hamdist 13.0
+
+
+K6-2 and K6-3 have dual-issue MMX and get the following improvements.
+
+ mpn_l/rshift 1.75
+
+ mpn_copyi/copyd 0.56 or 1.0 \
+ |
+ mpn_com_n 1.0-1.2 | varying with
+ mpn_and/andn/ior/xor_n 1.2-1.5 | data alignment
+ mpn_iorn/xnor_n 1.5-2.0 |
+ mpn_nand/nior_n 1.75-2.0 /
+
+ mpn_popcount 9.0
+ mpn_hamdist 11.5
+
+
+Prefetching of sources hasn't yet given any joy. With the 3DNow "prefetch"
+instruction, code seems to run slower, and with just "mov" loads it doesn't
+seem faster. Results so far are inconsistent. The K6 does a hardware
+prefetch of the second cache line in a sector, so the penalty for not
+prefetching in software is reduced.
+
+
+
+
+NOTES
+
+All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow.
+
+Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can
+execute them in both X and Y (and together).
+
+Branch misprediction penalty is 1 to 4 cycles (Optimization Manual
+chapter 6 table 12).
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+Store queue is 7 entries of 64 bits each.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead. The unrolling is
+configurable up to 32 limbs/loop for most routines, up to 64 for some.
+
+Sometimes computed jumps into the unrolling are used to handle sizes not a
+multiple of the unrolling. An attractive feature of this is that times
+smoothly increase with operand size, but an indirect jump is about 6 cycles
+and the setups about another 6, so it depends on how much the unrolled code
+is faster than a simple loop as to whether a computed jump ought to be used.
+
+Position independent code is implemented using a call to get eip for
+computed jumps and a ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory. Such a call however still costs 4 to 7
+cycles.
+
+Branch prediction, in absence of any history, will guess forward jumps are
+not taken and backward jumps are taken. Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+MMX
+
+Putting emms or femms as late as possible in a routine seems to be fastest.
+Perhaps an emms or femms stalls until all outstanding MMX instructions have
+completed, so putting it later gives them a chance to complete on their own,
+in parallel with other operations (like register popping).
+
+The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3
+at the start of a routine, in case it's been preceded by x87 floating point
+operations. This isn't done because in gmp programs it's expected that x87
+floating point won't be much used and that chances are an mpn routine won't
+have been preceded by any x87 code.
+
+
+
+CODING
+
+Instructions in general code are shown paired if they can decode and execute
+together, meaning two short decode instructions with the second not
+depending on the first, only the first using the shifter, no more than one
+load, and no more than one store.
+
+K6 does some out of order execution so the pairings aren't essential, they
+just show what slots might be available. When decoding is the limiting
+factor things can be scheduled that might not execute until later.
+
+
+
+NOTES
+
+Code alignment
+
+- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary,
+ short decode is inhibited. The cross.pl script detects this.
+
+- loops and branch targets should be aligned to 16 bytes, or ensure at least
+ 2 instructions before a 32 byte boundary. This makes use of the 16 byte
+ cache in the BTB.
+
+Addressing modes
+
+- (%esi) degrades decoding from short to vector. 0(%esi) doesn't have this
+ problem, and can be used as an equivalent, or easier is just to use a
+ different register, like %ebx.
+
+- K6 and pre-CXT core K6-2 have the following problem. (K6-2 CXT and K6-3
+ have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F).
+
+ If more than 3 bytes are needed to determine instruction length then
+ decoding degrades from direct to long, or from long to vector. This
+ happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since
+ with mod=00 the sib determines whether there's a displacement.
+
+ This affects all MMX and 3DNow instructions, and others with an 0F prefix
+ like movzbl. The modes affected are anything with an index and no
+ displacement, or an index but no base, and this includes (%esp) which is
+ really (,%esp,1).
+
+ The cross.pl script detects problem cases. The workaround is to always
+ use a displacement, and to do this with Zdisp if it's zero so the
+ assembler doesn't discard it.
+
+ See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages
+ 13-14 and 36-37.
+
+Calls
+
+- indirect jumps and calls are not branch predicted, they measure about 6
+ cycles.
+
+Various
+
+- adcl 2 cycles of decode, maybe 2 cycles executing in the X pipe
+- bsf 12-27 cycles
+- emms 5 cycles
+- femms 3 cycles
+- jecxz 2 cycles taken, 13 not taken (optimization manual says 7 not taken)
+- divl 20 cycles back-to-back
+- imull 2 decode, 2 execute
+- mull 2 decode, 3 execute (optimization manual decoding sample)
+- prefetch 2 cycles
+- rcll/rcrl implicit by one bit: 2 cycles
+ immediate or %cl count: 11 + 2 per bit for dword
+ 13 + 4 per bit for byte
+- setCC 2 cycles
+- xchgl %eax,reg 1.5 cycles, back-to-back (strange)
+ reg,reg 2 cycles, back-to-back
+
+
+
+
+REFERENCES
+
+"AMD-K6 Processor Code Optimization Application Note", AMD publication
+number 21924, revision D amendment 0, January 2000. This describes K6-2 and
+K6-3. Available on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21924.pdf
+
+"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD
+publication number 21828, revision A amendment 0, August 1997. This is an
+older edition of the above document, describing plain K6. Available
+on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21828.pdf
+
+"3DNow Technology Manual", AMD publication number 21928F/0-August 1999.
+This describes the femms and prefetch instructions, but nothing else from
+3DNow has been used. Available on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21928.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999. This has some notes on general K6 optimizations as well as
+3DNow. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/k6/aors_n.asm b/rts/gmp/mpn/x86/k6/aors_n.asm
new file mode 100644
index 0000000000..31b05ada51
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/aors_n.asm
@@ -0,0 +1,329 @@
+dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
+dnl
+dnl K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result
+C (1 or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
+C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
+C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
+
+define(PARAM_CARRY, `FRAME+20(%esp)')
+define(PARAM_SIZE, `FRAME+16(%esp)')
+define(PARAM_SRC2, `FRAME+12(%esp)')
+define(PARAM_SRC1, `FRAME+8(%esp)')
+define(PARAM_DST, `FRAME+4(%esp)')
+deflit(`FRAME',0)
+
+dnl minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function_nc)
+ movl PARAM_CARRY, %eax
+ jmp LF(M4_function_n,start)
+EPILOGUE()
+
+
+PROLOGUE(M4_function_n)
+ xorl %eax, %eax
+L(start):
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+FRAME_pushl()
+
+ movl PARAM_SRC1, %ebx
+ pushl %edi
+FRAME_pushl()
+
+ movl PARAM_SRC2, %edx
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_DST, %edi
+ jae L(unroll)
+
+
+ shrl %eax C initial carry flag
+
+ C offset 0x21 here, close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+ C
+ C The store to (%edi) could be done with a stosl; it'd be smaller
+ C code, but there's no speed gain and a cld would have to be added
+ C (per mpn/x86/README.family).
+
+ movl (%ebx), %eax
+ leal 4(%ebx), %ebx
+
+ M4_inst (%edx), %eax
+
+ movl %eax, (%edi)
+ leal 4(%edi), %edi
+
+ leal 4(%edx), %edx
+ loop L(simple)
+
+
+ movl $0, %eax
+ popl %edi
+
+ setc %al
+
+ popl %ebx
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(unroll):
+ C eax carry
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ cmpl %edi, %ebx
+ pushl %esi
+
+ je L(inplace)
+
+ifdef(`OPERATION_add_n',`
+ cmpl %edi, %edx
+
+ je L(inplace_reverse)
+')
+
+ movl %ecx, %esi
+
+ andl $-4, %ecx
+ andl $3, %esi
+
+ leal (%ebx,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %edx
+ leal (%edi,%ecx,4), %edi
+
+ negl %ecx
+ shrl %eax
+
+ ALIGN(32)
+L(normal_top):
+ C eax counter, qwords, negative
+ C ebx src1
+ C ecx scratch
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ leal 5(%ecx), %ecx
+ M4_inst -20(%edx,%ecx,4), %eax
+ movl %eax, -20(%edi,%ecx,4)
+
+ movl 4-20(%ebx,%ecx,4), %eax
+ M4_inst 4-20(%edx,%ecx,4), %eax
+ movl %eax, 4-20(%edi,%ecx,4)
+
+ movl 8-20(%ebx,%ecx,4), %eax
+ M4_inst 8-20(%edx,%ecx,4), %eax
+ movl %eax, 8-20(%edi,%ecx,4)
+
+ movl 12-20(%ebx,%ecx,4), %eax
+ M4_inst 12-20(%edx,%ecx,4), %eax
+ movl %eax, 12-20(%edi,%ecx,4)
+
+ loop L(normal_top)
+
+
+ decl %esi
+ jz L(normal_finish_one)
+ js L(normal_done)
+
+ C two or three more limbs
+
+ movl (%ebx), %eax
+ M4_inst (%edx), %eax
+ movl %eax, (%edi)
+
+ movl 4(%ebx), %eax
+ M4_inst 4(%edx), %eax
+ decl %esi
+ movl %eax, 4(%edi)
+
+ jz L(normal_done)
+ movl $2, %ecx
+
+L(normal_finish_one):
+ movl (%ebx,%ecx,4), %eax
+ M4_inst (%edx,%ecx,4), %eax
+ movl %eax, (%edi,%ecx,4)
+
+L(normal_done):
+ popl %esi
+ popl %edi
+
+ movl $0, %eax
+ popl %ebx
+
+ setc %al
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+ifdef(`OPERATION_add_n',`
+L(inplace_reverse):
+ C dst==src2
+
+ movl %ebx, %edx
+')
+
+L(inplace):
+ C eax initial carry
+ C ebx
+ C ecx size
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ leal -1(%ecx), %esi
+ decl %ecx
+
+ andl $-4, %ecx
+ andl $3, %esi
+
+ movl (%edx), %ebx C src low limb
+ leal (%edx,%ecx,4), %edx
+
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+
+ shrl %eax
+
+
+ ALIGN(32)
+L(inplace_top):
+ C eax
+ C ebx next src limb
+ C ecx size
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ M4_inst %ebx, (%edi,%ecx,4)
+
+ movl 4(%edx,%ecx,4), %eax
+ leal 5(%ecx), %ecx
+
+ M4_inst %eax, 4-20(%edi,%ecx,4)
+
+ movl 8-20(%edx,%ecx,4), %eax
+ movl 12-20(%edx,%ecx,4), %ebx
+
+ M4_inst %eax, 8-20(%edi,%ecx,4)
+ M4_inst %ebx, 12-20(%edi,%ecx,4)
+
+ movl 16-20(%edx,%ecx,4), %ebx
+ loop L(inplace_top)
+
+
+ C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
+
+ M4_inst %ebx, (%edi)
+
+ decl %esi
+ jz L(inplace_finish_one)
+ js L(inplace_done)
+
+ C two or three more limbs
+
+ movl 4(%edx), %eax
+ movl 8(%edx), %ebx
+ M4_inst %eax, 4(%edi)
+ M4_inst %ebx, 8(%edi)
+
+ decl %esi
+ movl $2, %ecx
+
+ jz L(normal_done)
+
+L(inplace_finish_one):
+ movl 4(%edx,%ecx,4), %eax
+ M4_inst %eax, 4(%edi,%ecx,4)
+
+L(inplace_done):
+ popl %esi
+ popl %edi
+
+ movl $0, %eax
+ popl %ebx
+
+ setc %al
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/aorsmul_1.asm b/rts/gmp/mpn/x86/k6/aorsmul_1.asm
new file mode 100644
index 0000000000..da4120fe2f
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/aorsmul_1.asm
@@ -0,0 +1,372 @@
+dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+dnl
+dnl K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data),
+dnl PIC adds about 6 cycles at the start.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6: large multipliers small multipliers
+dnl UNROLL_COUNT cycles/limb cycles/limb
+dnl 4 9.5 7.78
+dnl 8 9.0 7.78
+dnl 16 8.4 7.65
+dnl 32 8.4 8.2
+dnl
+dnl Maximum possible unrolling with the current code is 32.
+dnl
+dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
+dnl byte block, which might explain the good speed at that unrolling.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+C
+C The jadcl0()s in the unrolled loop make the speed data dependent.  Small
+C multipliers (most significant few bits clear) result in few carry bits and
+C speeds up to 7.65 cycles/limb are attained. Large multipliers (most
+C significant few bits set) make the carry bits 50/50 and lead to something
+C more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.)
+C
+C It's important that the gains for jadcl0 on small multipliers don't come
+C at the cost of slowing down other data. Tests on uniformly distributed
+C random data, designed to confound branch prediction, show about a 7%
+C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
+C overheads included).
+C
+C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
+C 11.0 cycles/limb), and hence isn't used.
+C
+C In the simple loop, note that running ecx from negative to zero and using
+C it as an index in the two movs wouldn't help. It would save one
+C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
+C that would be collapsed by this.
+C
+C
+C jadcl0
+C ------
+C
+C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
+C firstly the instruction decoding and secondly the fact that there's a
+C carry bit for the jadcl0 only on average about 1/4 of the time.
+C
+C The code in the unrolled loop decodes something like the following.
+C
+C decode cycles
+C mull %ebp 2
+C M4_inst %esi, disp(%edi) 1
+C adcl %eax, %ecx 2
+C movl %edx, %esi \ 1
+C jnc 1f /
+C incl %esi \ 1
+C 1: movl disp(%ebx), %eax /
+C ---
+C 7
+C
+C In a back-to-back style test this measures 7 with the jnc not taken, or 8
+C with it taken (both when correctly predicted). This is opposite to the
+C measurements showing small multipliers running faster than large ones.
+C Watch this space for more info ...
+C
+C It's not clear how much branch misprediction might be costing. The K6
+C doco says it will be 1 to 4 cycles, but presumably it's near the low end
+C of that range to get the measured results.
+C
+C
+C In the code the two carries are more or less the preceding mul product and
+C the calculation is roughly
+C
+C x*y + u*b+v
+C
+C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
+C v are the two limbs it's added to (being the low of the next mul, and a
+C limb from the destination).
+C
+C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
+C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
+C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
+C and b-1, then the total probability can be summed over x and y,
+C
+C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1)
+C --- * sum sum --- = --- * ------- * ------- = 1/4
+C b^2 x=0 y=1 b^2 b^4 2 2
+C
+C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
+C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
+
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+ pushl %esi
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %esi
+ jmp LF(M4_function_1,start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+ push %esi
+deflit(`FRAME',4)
+ xorl %esi, %esi C initial carry
+
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+deflit(`FRAME',8)
+
+ movl PARAM_SRC, %ebx
+ pushl %edi
+deflit(`FRAME',12)
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_DST, %edi
+
+ pushl %ebp
+deflit(`FRAME',16)
+ jae L(unroll)
+
+
+ C simple loop
+
+ movl PARAM_MULTIPLIER, %ebp
+
+L(simple):
+ C eax scratch
+ C ebx src
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst
+ C ebp multiplier
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl $4, %edi
+ addl %esi, %eax
+
+ adcl $0, %edx
+
+ M4_inst %eax, -4(%edi)
+
+ adcl $0, %edx
+
+ movl %edx, %esi
+ loop L(simple)
+
+
+ popl %ebp
+ popl %edi
+
+ popl %ebx
+ movl %esi, %eax
+
+ popl %esi
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
+C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
+C For the computed jump an odd size means they start one way around, an even
+C size the other.
+C
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers at the point of doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %esi is necessary only for the
+C mpn_addmul/submul_1c entry points. Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl overlapping with parameters already fetched
+define(VAR_COUNTER, `PARAM_SIZE')
+define(VAR_JUMP, `PARAM_DST')
+
+L(unroll):
+ C eax
+ C ebx src
+ C ecx size
+ C edx
+ C esi initial carry
+ C edi dst
+ C ebp
+
+ movl %ecx, %edx
+ decl %ecx
+
+ subl $2, %edx
+ negl %ecx
+
+ shrl $UNROLL_LOG2, %edx
+ andl $UNROLL_MASK, %ecx
+
+ movl %edx, VAR_COUNTER
+ movl %ecx, %edx
+
+ shll $4, %edx
+ negl %ecx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edx,%ecx,1), %edx
+')
+ movl (%ebx), %eax C src low limb
+
+ movl PARAM_MULTIPLIER, %ebp
+ movl %edx, VAR_JUMP
+
+ mull %ebp
+
+ addl %esi, %eax C initial carry (from _1c)
+ jadcl0( %edx)
+
+
+ leal 4(%ebx,%ecx,4), %ebx
+ movl %edx, %esi C high carry
+
+ movl VAR_JUMP, %edx
+ leal (%edi,%ecx,4), %edi
+
+ testl $1, %ecx
+ movl %eax, %ecx C low carry
+
+ jz L(noswap)
+ movl %esi, %ecx C high,low carry other way around
+
+ movl %eax, %esi
+L(noswap):
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%edx,%ecx,1), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret
+')
+
+
+C -----------------------------------------------------------
+ ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+ C eax scratch
+ C ebx src
+ C ecx carry lo
+ C edx scratch
+ C esi carry hi
+ C edi dst
+ C ebp multiplier
+ C
+ C 15 code bytes per limb
+
+ leal UNROLL_BYTES(%edi), %edi
+
+L(entry):
+forloop(`i', 0, UNROLL_COUNT/2-1, `
+ deflit(`disp0', eval(2*i*4))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%ebx), %eax)
+ mull %ebp
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+
+ movl disp1(%ebx), %eax
+ mull %ebp
+ M4_inst %esi, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%ebx), %ebx
+
+ jns L(top)
+
+
+ popl %ebp
+ M4_inst %ecx, UNROLL_BYTES(%edi)
+
+ popl %edi
+ movl %esi, %eax
+
+ popl %ebx
+ jadcl0( %eax)
+
+ popl %esi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/cross.pl b/rts/gmp/mpn/x86/k6/cross.pl
new file mode 100644
index 0000000000..21734f3e52
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/cross.pl
@@ -0,0 +1,141 @@
+#! /usr/bin/perl
+
+# Copyright (C) 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# Usage: cross.pl [filename.o]...
+#
+# Produce an annotated disassembly of the given object files, indicating
+# certain code alignment and addressing mode problems afflicting K6 chips.
+# "ZZ" is used on all annotations, so this can be searched for.
+#
+# With no arguments, all .o files corresponding to .asm files are processed.
+# This is good in the mpn object directory of a k6*-*-* build.
+#
+# As far as fixing problems goes, any cache line crossing problems in loops
+# get attention, but as a rule it's too tedious to rearrange code or slip in
+# nops to fix every problem in setup or finishup code.
+#
+# Bugs:
+#
+# Instructions without mod/rm bytes or which are already vector decoded are
+# unaffected by cache line boundary crossing, but not all of these have yet
+# been put in as exceptions. All that occur in practice in GMP are present
+# though.
+#
+# There's no messages for using the vector decoded addressing mode (%esi),
+# but that mode is easy to avoid when coding.
+
+use strict;
+
+sub disassemble {
+ my ($file) = @_;
+ my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm);
+
+ open (IN, "objdump -Srfh $file |")
+ || die "Cannot open pipe from objdump\n";
+ while (<IN>) {
+ print;
+
+ if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) {
+ if ($1 < 5) {
+ print "ZZ need at least 2**5 for predictable cache line crossing\n";
+ }
+ }
+
+ if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4);
+
+ } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,$3,'');
+
+ } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,'','');
+
+ } else {
+ next;
+ }
+
+ if ($b1 =~ /0f/) {
+ $prefix = $b1;
+ $opcode = $b2;
+ $modrm = $b3;
+ } else {
+ $prefix = '';
+ $opcode = $b1;
+ $modrm = $b2;
+ }
+
+ # modrm of the form 00-xxx-100 with an 0F prefix is the problem case
+ # for K6 and pre-CXT K6-2
+ if ($prefix =~ /0f/
+ && $opcode !~ /^8/ # jcond disp32
+ && $modrm =~ /^[0-3][4c]/) {
+ print "ZZ ($file) >3 bytes to determine instruction length\n";
+ }
+
+ # with just an opcode, starting 1f mod 20h
+ if ($addr =~ /[13579bdf]f$/
+ && $prefix !~ /0f/
+ && $opcode !~ /1[012345]/ # adc
+ && $opcode !~ /1[89abcd]/ # sbb
+ && $opcode !~ /68/ # push $imm32
+ && $opcode !~ /^7/ # jcond disp8
+ && $opcode !~ /a[89]/ # test+imm
+ && $opcode !~ /a[a-f]/ # stos/lods/scas
+ && $opcode !~ /b8/ # movl $imm32,%eax
+ && $opcode !~ /e[0123]/ # loop/loopz/loopnz/jcxz
+ && $opcode !~ /e[b9]/ # jmp disp8/disp32
+ && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std
+ && !($opcode =~ /f[67]/ # grp 1
+ && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv
+ && $modrm !~ /^$/) {
+ print "ZZ ($file) opcode/modrm cross 32-byte boundary\n";
+ }
+
+ # with an 0F prefix, anything starting at 1f mod 20h
+ if ($addr =~ /[13579bdf][f]$/
+ && $prefix =~ /0f/) {
+ print "ZZ ($file) prefix/opcode cross 32-byte boundary\n";
+ }
+
+ # with an 0F prefix, anything with mod/rm starting at 1e mod 20h
+ if ($addr =~ /[13579bdf][e]$/
+ && $prefix =~ /0f/
+ && $opcode !~ /^8/ # jcond disp32
+ && $modrm !~ /^$/) {
+ print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n";
+ }
+ }
+ close IN || die "Error from objdump (or objdump not available)\n";
+}
+
+
+my @files;
+if ($#ARGV >= 0) {
+ @files = @ARGV;
+} else {
+ @files = glob "*.asm";
+ map {s/.asm/.o/} @files;
+}
+
+foreach (@files) {
+ disassemble($_);
+}
diff --git a/rts/gmp/mpn/x86/k6/diveby3.asm b/rts/gmp/mpn/x86/k6/diveby3.asm
new file mode 100644
index 0000000000..ffb97bc380
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/diveby3.asm
@@ -0,0 +1,110 @@
+dnl AMD K6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl K6: 11.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+C
+C Using %esi in the (%esi,%ecx,4) or 0(%esi,%ecx,4) addressing modes doesn't
+C lead to vector decoding, whereas plain (%esi) does.
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl PARAM_SRC, %esi
+ pushl %edi defframe_pushl(SAVE_EDI)
+
+ movl PARAM_DST, %edi
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ movl PARAM_CARRY, %ebx
+ leal (%esi,%ecx,4), %esi
+
+ pushl $3 defframe_pushl(VAR_THREE)
+ leal (%edi,%ecx,4), %edi
+
+ negl %ecx
+
+
+ C Need 32 alignment for claimed speed, to avoid the movl store
+ C opcode/modrm crossing a cache line boundary
+
+ ALIGN(32)
+L(top):
+ C eax scratch, low product
+ C ebx carry limb (0 to 3)
+ C ecx counter, limbs, negative
+ C edx scratch, high product
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+ C
+ C The 0(%esi,%ecx,4) form pads so the finishup "movl %ebx, %eax"
+ C doesn't cross a 32 byte boundary, saving a couple of cycles
+ C (that's a fixed couple, not per loop).
+
+Zdisp( movl, 0,(%esi,%ecx,4), %eax)
+ subl %ebx, %eax
+
+ setc %bl
+
+ imull $INVERSE_3, %eax
+
+ movl %eax, (%edi,%ecx,4)
+ addl $2, %ecx
+
+ mull VAR_THREE
+
+ addl %edx, %ebx
+ loop L(top)
+
+
+ movl SAVE_ESI, %esi
+ movl %ebx, %eax
+
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_EDI, %edi
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/gmp-mparam.h b/rts/gmp/mpn/x86/k6/gmp-mparam.h
new file mode 100644
index 0000000000..77f3948d77
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 3 /* cycles */
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME 20 /* cycles */
+#endif
+
+/* bsfl takes 12-27 cycles, put an average for uniform random numbers */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 14 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-04. */
+/* NOTE(review): thresholds below are presumably operand sizes in limbs at
+   which the larger-operand algorithm starts to win, per tuneup.c
+   conventions -- confirm against the tuneup documentation.  Each is
+   guarded by #ifndef so a build can override it. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 18
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 130
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 34
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 116
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 68
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 67
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 472
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 4352
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 544
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 4352
+#endif
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm
new file mode 100644
index 0000000000..20a33e6ccf
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm
@@ -0,0 +1,179 @@
+dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
+dnl
+dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
+dnl alignment.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6-2 aligned:
+dnl UNROLL_COUNT cycles/limb
+dnl 8 0.75
+dnl 16 0.625
+dnl 32 0.5625
+dnl 64 0.53
+dnl Maximum possible with the current code is 64, the minimum is 2.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, processing limbs from high to low addresses.
+C
+C The comments in copyi.asm apply here too.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+	.text
+	ALIGN(32)
+
+PROLOGUE(mpn_copyd)
+	movl	PARAM_SIZE, %ecx
+	movl	%esi, %eax
+
+	movl	PARAM_SRC, %esi
+	movl	%edi, %edx
+
+	C %eax and %edx hold the caller's %esi and %edi; both return paths
+	C restore them, avoiding push/pop overhead.
+
+	std
+
+	C direction flag is set so rep movsl copies downwards; it's cleared
+	C again (cld) before both returns.
+
+	movl	PARAM_DST, %edi
+	cmpl	$UNROLL_COUNT, %ecx
+
+	leal	-4(%esi,%ecx,4), %esi
+
+	leal	-4(%edi,%ecx,4), %edi
+	ja	L(unroll)
+
+L(simple):
+	rep
+	movsl
+
+	cld
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	ret
+
+
+L(unroll):
+	C if src and dst are different alignments mod8, then use rep movs
+	C if src and dst are both 4mod8 then process one limb to get 0mod8
+
+	pushl	%ebx
+	leal	(%esi,%edi), %ebx
+
+	testb	$4, %bl
+	popl	%ebx
+
+	jnz	L(simple)
+	testl	$4, %esi
+
+	leal	-UNROLL_COUNT(%ecx), %ecx
+	jnz	L(already_aligned)
+
+	movsl
+
+	decl	%ecx
+L(already_aligned):
+
+
+ifelse(UNROLL_BYTES,256,`
+	subl	$128, %esi
+	subl	$128, %edi
+')
+
+	C offset 0x3D here, but gets full speed without further alignment
+L(top):
+	C eax	saved esi
+	C ebx
+	C ecx	counter, limbs
+	C edx	saved edi
+	C esi	src, incrementing
+	C edi	dst, incrementing
+	C ebp
+	C
+	C `disp' is never 0, so don't need to force 0(%esi).
+
+deflit(CHUNK_COUNT, 2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
+	movq	disp(%esi), %mm0
+	movq	%mm0, disp(%edi)
+')
+
+	leal	-UNROLL_BYTES(%esi), %esi
+	subl	$UNROLL_COUNT, %ecx
+
+	leal	-UNROLL_BYTES(%edi), %edi
+	jns	L(top)
+
+
+	C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
+	C UNROLL_COUNT-1 limbs remaining
+
+	testb	$eval(UNROLL_COUNT/2), %cl
+
+	leal	UNROLL_COUNT(%ecx), %ecx
+	jz	L(not_half)
+
+
+	C at an unroll count of 32 this block of code is 16 cycles faster than
+	C the rep movs, less 3 or 4 to test whether to do it
+
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
+	deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
+	movq	disp(%esi), %mm0
+	movq	%mm0, disp(%edi)
+')
+
+	subl	$eval(UNROLL_BYTES/2), %esi
+	subl	$eval(UNROLL_BYTES/2), %edi
+
+	subl	$eval(UNROLL_COUNT/2), %ecx
+L(not_half):
+
+
+ifelse(UNROLL_BYTES,256,`
+	addl	$128, %esi
+	addl	$128, %edi
+')
+
+	rep
+	movsl
+
+	cld
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	femms
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm
new file mode 100644
index 0000000000..215d805f2e
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm
@@ -0,0 +1,196 @@
+dnl AMD K6-2 mpn_copyi -- copy limb vector, incrementing.
+dnl
+dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
+dnl alignment.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6-2 aligned:
+dnl UNROLL_COUNT cycles/limb
+dnl 8 0.75
+dnl 16 0.625
+dnl 32 0.5625
+dnl 64 0.53
+dnl Maximum possible with the current code is 64, the minimum is 2.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The MMX loop is faster than a rep movs when src and dst are both 0mod8.
+C With one 0mod8 and one 4mod8 it's 1.056 c/l and the rep movs at 1.0 c/l is
+C used instead.
+C
+C mod8
+C src dst
+C 0 0 both aligned, use mmx
+C 0 4 unaligned, use rep movs
+C 4 0 unaligned, use rep movs
+C 4 4 do one movs, then both aligned, use mmx
+C
+C The MMX code on aligned data is 0.5 c/l, plus loop overhead of 2
+C cycles/loop, which is 0.0625 c/l at 32 limbs/loop.
+C
+C A pattern of two movq loads and two movq stores (or four and four) was
+C tried, but found to be the same speed as just one of each.
+C
+C Note that this code only suits K6-2 and K6-3. Plain K6 does only one mmx
+C instruction per cycle, so "movq"s are no faster than the simple 1 c/l rep
+C movs.
+C
+C Enhancement:
+C
+C Addressing modes like disp(%esi,%ecx,4) aren't currently used. They'd
+C make it possible to avoid incrementing %esi and %edi in the loop and hence
+C get loop overhead down to 1 cycle. Care would be needed to avoid bad
+C cache line crossings since the "movq"s would then be 5 code bytes rather
+C than 4.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+	.text
+	ALIGN(32)
+
+PROLOGUE(mpn_copyi)
+	movl	PARAM_SIZE, %ecx
+	movl	%esi, %eax
+
+	movl	PARAM_SRC, %esi
+	movl	%edi, %edx
+
+	C %eax and %edx hold the caller's %esi and %edi; both return paths
+	C restore them, avoiding push/pop overhead.
+
+	cld
+
+	movl	PARAM_DST, %edi
+	cmpl	$UNROLL_COUNT, %ecx
+
+	ja	L(unroll)
+
+L(simple):
+	rep
+	movsl
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	ret
+
+
+L(unroll):
+	C if src and dst are different alignments mod8, then use rep movs
+	C if src and dst are both 4mod8 then process one limb to get 0mod8
+
+	pushl	%ebx
+	leal	(%esi,%edi), %ebx
+
+	testb	$4, %bl
+	popl	%ebx
+
+	jnz	L(simple)
+	testl	$4, %esi
+
+	leal	-UNROLL_COUNT(%ecx), %ecx
+	jz	L(already_aligned)
+
+	decl	%ecx
+
+	movsl
+L(already_aligned):
+
+
+ifelse(UNROLL_BYTES,256,`
+	addl	$128, %esi
+	addl	$128, %edi
+')
+
+	C this is offset 0x34, no alignment needed
+L(top):
+	C eax	saved esi
+	C ebx
+	C ecx	counter, limbs
+	C edx	saved edi
+	C esi	src, incrementing
+	C edi	dst, incrementing
+	C ebp
+	C
+	C Zdisp gets 0(%esi) left that way to avoid vector decode, and with
+	C 0(%edi) keeps code aligned to 16 byte boundaries.
+
+deflit(CHUNK_COUNT, 2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+Zdisp(	movq,	disp,(%esi), %mm0)
+Zdisp(	movq,	%mm0, disp,(%edi))
+')
+
+	addl	$UNROLL_BYTES, %esi
+	subl	$UNROLL_COUNT, %ecx
+
+	leal	UNROLL_BYTES(%edi), %edi
+	jns	L(top)
+
+
+	C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
+	C UNROLL_COUNT-1 limbs remaining
+
+	testb	$eval(UNROLL_COUNT/2), %cl
+
+	leal	UNROLL_COUNT(%ecx), %ecx
+	jz	L(not_half)
+
+	C at an unroll count of 32 this block of code is 16 cycles faster than
+	C the rep movs, less 3 or 4 to test whether to do it
+
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
+	deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	movq	disp(%esi), %mm0
+	movq	%mm0, disp(%edi)
+')
+	addl	$eval(UNROLL_BYTES/2), %esi
+	addl	$eval(UNROLL_BYTES/2), %edi
+
+	subl	$eval(UNROLL_COUNT/2), %ecx
+L(not_half):
+
+
+ifelse(UNROLL_BYTES,256,`
+	subl	$128, %esi
+	subl	$128, %edi
+')
+
+	rep
+	movsl
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	femms
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm
new file mode 100644
index 0000000000..f6d54f97a8
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm
@@ -0,0 +1,286 @@
+dnl AMD K6-2 mpn_lshift -- mpn left shift.
+dnl
+dnl K6-2: 1.75 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl  used after src has been fetched
+define(VAR_RETVAL,`PARAM_SRC')
+
+dnl  minimum 9, because unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 9)
+
+	.text
+	ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+C The return value is the bits shifted out the top, ie. src[size-1]
+C shifted right by 32-shift.
+
+	C The 1 limb case can be done without the push %ebx, but it's then
+	C still the same speed. The push is left as a free helping hand for
+	C the two_or_more code.
+
+	movl	PARAM_SIZE, %eax
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	decl	%eax
+
+	movl	PARAM_SHIFT, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%ebx), %edx		C src limb
+	movl	PARAM_DST, %ebx
+
+	shldl(	%cl, %edx, %eax)	C return value
+
+	shll	%cl, %edx
+
+	movl	%edx, (%ebx)		C dst limb
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)	C avoid offset 0x1f
+L(two_or_more):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx
+
+	movl	(%ebx,%eax,4), %edx	C src high limb
+	negl	%ecx
+
+	movd	PARAM_SHIFT, %mm6
+	addl	$32, %ecx		C 32-shift
+
+	shrl	%cl, %edx
+	cmpl	$UNROLL_THRESHOLD-1, %eax
+
+	movl	%edx, VAR_RETVAL
+	jae	L(unroll)
+
+
+	movd	%ecx, %mm7
+	movl	%eax, %ecx
+
+	movl	PARAM_DST, %eax
+
+L(simple):
+	C eax	dst
+	C ebx	src
+	C ecx	counter, size-1 to 1
+	C edx	retval
+	C
+	C mm0	scratch
+	C mm6	shift
+	C mm7	32-shift
+
+	movq	-4(%ebx,%ecx,4), %mm0
+
+	psrlq	%mm7, %mm0
+
+Zdisp(	movd,	%mm0, 0,(%eax,%ecx,4))
+	loop	L(simple)
+
+
+	movd	(%ebx), %mm0
+	popl	%ebx
+
+	psllq	%mm6, %mm0
+
+	movd	%mm0, (%eax)
+	movl	%edx, %eax
+
+	femms
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll):
+	C eax	size-1
+	C ebx	src
+	C ecx	32-shift
+	C edx	retval (but instead VAR_RETVAL is used)
+	C
+	C mm6	shift
+
+	C step %ecx from 32-shift up to 64-shift, for the qword psrlq in
+	C the unrolled loop below
+	addl	$32, %ecx
+	movl	PARAM_DST, %edx
+
+	movd	%ecx, %mm7
+	subl	$7, %eax		C size-8
+
+	leal	(%edx,%eax,4), %ecx	C alignment of dst
+
+	movq	32-8(%ebx,%eax,4), %mm2	C src high qword
+	testb	$4, %cl
+
+	jz	L(dst_aligned)
+	psllq	%mm6, %mm2
+
+	psrlq	$32, %mm2
+	decl	%eax
+
+	movd	%mm2, 32(%edx,%eax,4)	C dst high limb
+	movq	32-8(%ebx,%eax,4), %mm2	C new src high qword
+L(dst_aligned):
+
+	movq	32-16(%ebx,%eax,4), %mm0	C src second highest qword
+
+
+	C This loop is the important bit, the rest is just support for it.
+	C Four src limbs are held at the start, and four more will be read.
+	C Four dst limbs will be written. This schedule seems necessary for
+	C full speed.
+	C
+	C The use of size-8 lets the loop stop when %eax goes negative and
+	C leaves -4 to -1 which can be tested with test $1 and $2.
+
+L(top):
+	C eax	counter, size-8 step by -4 until <0
+	C ebx	src
+	C ecx
+	C edx	dst
+	C
+	C mm0	src next qword
+	C mm1	scratch
+	C mm2	src prev qword
+	C mm6	shift
+	C mm7	64-shift
+
+	psllq	%mm6, %mm2
+	subl	$4, %eax
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	movq	24(%ebx,%eax,4), %mm0
+
+	psllq	%mm6, %mm1
+	movq	%mm2, 40(%edx,%eax,4)
+
+	movq	%mm0, %mm2
+	psrlq	%mm7, %mm0
+
+	por	%mm0, %mm1
+	movq	16(%ebx,%eax,4), %mm0
+
+	movq	%mm1, 32(%edx,%eax,4)
+	jnc	L(top)
+
+
+	C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
+	C
+	C 8(%ebx) is the next source, and 24(%edx) is the next destination.
+	C %eax is between -4 and -1, representing respectively 0 to 3 extra
+	C limbs that must be read.
+
+
+	testl	$2, %eax	C testl to avoid bad cache line crossing
+	jz	L(finish_nottwo)
+
+	C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
+	C new mm2 and a new mm0 is loaded.
+
+	psllq	%mm6, %mm2
+	movq	%mm0, %mm1
+
+	psrlq	%mm7, %mm0
+	subl	$2, %eax
+
+	por	%mm0, %mm2
+	movq	16(%ebx,%eax,4), %mm0
+
+	movq	%mm2, 32(%edx,%eax,4)
+	movq	%mm1, %mm2
+L(finish_nottwo):
+
+
+	C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
+
+	testb	$1, %al
+	psllq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	psllq	%mm6, %mm1
+
+	movq	%mm2, 24(%edx,%eax,4)
+	jz	L(finish_even)
+
+
+	C Size is odd, so mm1 and one extra limb to process.
+
+	movd	(%ebx), %mm0		C src[0]
+	popl	%ebx
+deflit(`FRAME',0)
+
+	movq	%mm0, %mm2
+	psllq	$32, %mm0
+
+	psrlq	%mm7, %mm0
+
+	psllq	%mm6, %mm2
+	por	%mm0, %mm1
+
+	movq	%mm1, 4(%edx)		C dst[1,2]
+	movd	%mm2, (%edx)		C dst[0]
+
+	movl	VAR_RETVAL, %eax
+
+	femms
+	ret
+
+
+	nop	C avoid bad cache line crossing
+L(finish_even):
+deflit(`FRAME',4)
+	C Size is even, so only mm1 left to process.
+
+	movq	%mm1, (%edx)		C dst[0,1]
+	movl	VAR_RETVAL, %eax
+
+	popl	%ebx
+	femms
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm
new file mode 100644
index 0000000000..8a8c144241
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm
@@ -0,0 +1,285 @@
+dnl AMD K6-2 mpn_rshift -- mpn right shift.
+dnl
+dnl K6-2: 1.75 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl  Minimum 9, because the unrolled loop can't handle less.
+dnl
+deflit(UNROLL_THRESHOLD, 9)
+
+	.text
+	ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+C The return value is the bits shifted out the bottom, ie. src[0]
+C shifted left by 32-shift.
+
+	C The 1 limb case can be done without the push %ebx, but it's then
+	C still the same speed. The push is left as a free helping hand for
+	C the two_or_more code.
+
+	movl	PARAM_SIZE, %eax
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	decl	%eax
+
+	movl	PARAM_SHIFT, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%ebx), %edx		C src limb
+	movl	PARAM_DST, %ebx
+
+	shrdl(	%cl, %edx, %eax)	C return value
+
+	shrl	%cl, %edx
+
+	movl	%edx, (%ebx)		C dst limb
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)	C avoid offset 0x1f
+L(two_or_more):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx
+
+	movl	(%ebx), %edx	C src low limb
+	negl	%ecx
+
+	addl	$32, %ecx
+	movd	PARAM_SHIFT, %mm6
+
+	shll	%cl, %edx
+	cmpl	$UNROLL_THRESHOLD-1, %eax
+
+	jae	L(unroll)
+
+
+	C eax	size-1
+	C ebx	src
+	C ecx	32-shift
+	C edx	retval
+	C
+	C mm6	shift
+
+	movl	PARAM_DST, %ecx
+	leal	(%ebx,%eax,4), %ebx
+
+	leal	-4(%ecx,%eax,4), %ecx
+	negl	%eax
+
+	C This loop runs at about 3 cycles/limb, which is the amount of
+	C decoding, and this is despite every second access being unaligned.
+
+L(simple):
+	C eax	counter, -(size-1) to -1
+	C ebx	&src[size-1]
+	C ecx	&dst[size-1]
+	C edx	retval
+	C
+	C mm0	scratch
+	C mm6	shift
+
+Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
+	incl	%eax
+
+	psrlq	%mm6, %mm0
+
+Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
+	jnz	L(simple)
+
+
+	movq	%mm0, (%ecx)
+	movl	%edx, %eax
+
+	popl	%ebx
+
+	femms
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll):
+	C eax	size-1
+	C ebx	src
+	C ecx	32-shift
+	C edx	retval
+	C
+	C mm6	shift
+
+	C step %ecx from 32-shift up to 64-shift, for the qword psllq in
+	C the unrolled loop below
+	addl	$32, %ecx
+	subl	$7, %eax		C size-8
+
+	movd	%ecx, %mm7
+	movl	PARAM_DST, %ecx
+
+	movq	(%ebx), %mm2		C src low qword
+	leal	(%ebx,%eax,4), %ebx	C src end - 32
+
+	testb	$4, %cl
+	leal	(%ecx,%eax,4), %ecx	C dst end - 32
+
+	notl	%eax			C -(size-7)
+	jz	L(dst_aligned)
+
+	psrlq	%mm6, %mm2
+	incl	%eax
+
+Zdisp(	movd,	%mm2, 0,(%ecx,%eax,4))	C dst low limb
+	movq	4(%ebx,%eax,4), %mm2	C new src low qword
+L(dst_aligned):
+
+	movq	12(%ebx,%eax,4), %mm0	C src second lowest qword
+	nop	C avoid bad cache line crossing
+
+
+	C This loop is the important bit, the rest is just support for it.
+	C Four src limbs are held at the start, and four more will be read.
+	C Four dst limbs will be written. This schedule seems necessary for
+	C full speed.
+	C
+	C The use of -(size-7) lets the loop stop when %eax becomes >= 0
+	C and leaves 0 to 3 which can be tested with test $1 and $2.
+
+L(top):
+	C eax	counter, -(size-7) step by +4 until >=0
+	C ebx	src end - 32
+	C ecx	dst end - 32
+	C edx	retval
+	C
+	C mm0	src next qword
+	C mm1	scratch
+	C mm2	src prev qword
+	C mm6	shift
+	C mm7	64-shift
+
+	psrlq	%mm6, %mm2
+	addl	$4, %eax
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	movq	4(%ebx,%eax,4), %mm0
+
+	psrlq	%mm6, %mm1
+	movq	%mm2, -12(%ecx,%eax,4)
+
+	movq	%mm0, %mm2
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm1
+	movq	12(%ebx,%eax,4), %mm0
+
+	movq	%mm1, -4(%ecx,%eax,4)
+	ja	L(top)		C jump if no carry and not zero
+
+
+
+	C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
+	C to 3 representing respectively 3 to 0 further limbs.
+
+	testl	$2, %eax	C testl to avoid bad cache line crossings
+	jnz	L(finish_nottwo)
+
+	C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
+	C becomes new mm2 and a new mm0 is loaded.
+
+	psrlq	%mm6, %mm2
+	movq	%mm0, %mm1
+
+	psllq	%mm7, %mm0
+	addl	$2, %eax
+
+	por	%mm0, %mm2
+	movq	12(%ebx,%eax,4), %mm0
+
+	movq	%mm2, -4(%ecx,%eax,4)
+	movq	%mm1, %mm2
+L(finish_nottwo):
+
+
+	testb	$1, %al
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	psrlq	%mm6, %mm1
+
+	movq	%mm2, 4(%ecx,%eax,4)
+	jnz	L(finish_even)
+
+
+	C one further extra limb to process
+
+	movd	32-4(%ebx), %mm0	C src[size-1], most significant limb
+	popl	%ebx
+
+	movq	%mm0, %mm2
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm1
+	psrlq	%mm6, %mm2
+
+	movq	%mm1, 32-12(%ecx)	C dst[size-3,size-2]
+	movd	%mm2, 32-4(%ecx)	C dst[size-1]
+
+	movl	%edx, %eax		C retval
+
+	femms
+	ret
+
+
+	nop	C avoid bad cache line crossing
+L(finish_even):
+	C no further extra limbs
+
+	movq	%mm1, 32-8(%ecx)	C dst[size-2,size-1]
+	movl	%edx, %eax		C retval
+
+	popl	%ebx
+
+	femms
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/com_n.asm b/rts/gmp/mpn/x86/k6/mmx/com_n.asm
new file mode 100644
index 0000000000..8915080f0f
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/com_n.asm
@@ -0,0 +1,91 @@
+dnl AMD K6-2 mpn_com_n -- mpn bitwise one's complement.
+dnl
+dnl alignment dst/src, A=0mod8 N=4mod8
+dnl A/A A/N N/A N/N
+dnl K6-2 1.0 1.18 1.18 1.18 cycles/limb
+dnl K6 1.5 1.85 1.75 1.85
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Take the bitwise ones-complement of src,size and write it to dst,size.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	.text
+	ALIGN(32)
+PROLOGUE(mpn_com_n)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+	C halve the count to process qwords; the carry flag records whether
+	C there's an odd leftover limb
+	shrl	%ecx
+	jnz	L(two_or_more)
+
+	movl	(%eax), %eax
+	notl	%eax
+	movl	%eax, (%edx)
+	ret
+
+
+L(two_or_more):
+	pushl	%ebx
+FRAME_pushl()
+	movl	%ecx, %ebx	C ebx = floor(size/2), kept for the fixup
+
+	pcmpeqd	%mm7, %mm7	C all ones
+
+
+	ALIGN(16)
+L(top):
+	C eax	src
+	C ebx	floor(size/2)
+	C ecx	counter
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+
+	movq	-8(%eax,%ecx,8), %mm0
+	pxor	%mm7, %mm0
+	movq	%mm0, -8(%edx,%ecx,8)
+	loop	L(top)
+
+
+	C carry is still set from the shrl above: loop, movq and pxor
+	C don't write the flags
+	jnc	L(no_extra)
+	movl	(%eax,%ebx,8), %eax
+	notl	%eax
+	movl	%eax, (%edx,%ebx,8)
+L(no_extra):
+
+	popl	%ebx
+	emms_or_femms
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/logops_n.asm b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm
new file mode 100644
index 0000000000..46cb3b7ea5
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm
@@ -0,0 +1,212 @@
+dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+dnl
+dnl alignment dst/src1/src2, A=0mod8, N=4mod8
+dnl A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+dnl
+dnl K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor
+dnl K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor
+dnl K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior
+dnl
+dnl K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor
+dnl K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor
+dnl K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl  M4_p and M4_i are the MMX and integer instructions
+dnl  M4_*_neg_dst means whether to negate the final result before writing
+dnl  M4_*_neg_src2 means whether to negate the src2 values before using them
+dnl
+dnl  Argument order is
+dnl  M4_choose_op(operation, mmx-insn, mmx-neg-dst, mmx-neg-src2,
+dnl               int-insn,  int-neg-dst, int-neg-src2)
+dnl  and the macro only takes effect when OPERATION_<operation> is defined
+dnl  by the build.
+
+define(M4_choose_op,
+m4_assert_numargs(7)
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_operation', `$1')
+define(`M4_p',         `$2')
+define(`M4_p_neg_dst', `$3')
+define(`M4_p_neg_src2',`$4')
+define(`M4_i',         `$5')
+define(`M4_i_neg_dst', `$6')
+define(`M4_i_neg_src2',`$7')
+')')
+
+dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
+dnl  style (the two are equivalent for xor).
+
+M4_choose_op( and_n,  pand,0,0,  andl,0,0)
+M4_choose_op( andn_n, pandn,0,0, andl,0,1)
+M4_choose_op( nand_n, pand,1,0,  andl,1,0)
+M4_choose_op( ior_n,  por,0,0,   orl,0,0)
+M4_choose_op( iorn_n, por,0,1,   orl,0,1)
+M4_choose_op( nior_n, por,1,0,   orl,1,0)
+M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
+M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+
+C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C
+C Do src1,size M4_operation src2,size, storing the result in dst,size.
+C
+C Unaligned movq loads and stores are a bit slower than aligned ones. The
+C test at the start of the routine checks the alignment of src1 and if
+C necessary processes one limb separately at the low end to make it aligned.
+C
+C The raw speeds without this alignment switch are as follows.
+C
+C alignment dst/src1/src2, A=0mod8, N=4mod8
+C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+C
+C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
+C K6 1.75 2.2 2.0 2.28 iorn,xnor
+C K6 2.0 2.25 2.35 2.28 nand,nior
+C
+C
+C Future:
+C
+C K6 can do one 64-bit load per cycle so each of these routines should be
+C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be
+C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
+C The others are 4 instructions per 2 limbs, and so can only approach 1.0
+C because there's nowhere to hide some loop control.
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_SRC2,12)
+defframe(PARAM_SRC1,8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+	.text
+	ALIGN(32)
+PROLOGUE(M4_function)
+	movl	PARAM_SIZE, %ecx
+	pushl	%ebx
+	FRAME_pushl()
+	movl	PARAM_SRC1, %eax
+	movl	PARAM_SRC2, %ebx
+	cmpl	$1, %ecx
+	movl	PARAM_DST, %edx
+	ja	L(two_or_more)
+
+
+	movl	(%ebx), %ecx
+	popl	%ebx
+ifelse(M4_i_neg_src2,1,`notl	%ecx')
+	M4_i	(%eax), %ecx
+ifelse(M4_i_neg_dst,1,`	notl	%ecx')
+	movl	%ecx, (%edx)
+
+	ret
+
+
+L(two_or_more):
+	C eax	src1
+	C ebx	src2
+	C ecx	size
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+	C
+	C carry bit is low of size
+
+	pushl	%esi
+	FRAME_pushl()
+	testl	$4, %eax
+	jz	L(alignment_ok)
+
+	movl	(%ebx), %esi
+	addl	$4, %ebx
+ifelse(M4_i_neg_src2,1,`notl	%esi')
+	M4_i	(%eax), %esi
+	addl	$4, %eax
+ifelse(M4_i_neg_dst,1,`	notl	%esi')
+	movl	%esi, (%edx)
+	addl	$4, %edx
+	decl	%ecx
+
+L(alignment_ok):
+	C %esi keeps the full remaining limb count; after the shrl halves
+	C %ecx to qwords, -4(...,%esi,4) addresses the final odd limb
+	movl	%ecx, %esi
+	shrl	%ecx
+	jnz	L(still_two_or_more)
+
+	movl	(%ebx), %ecx
+	popl	%esi
+ifelse(M4_i_neg_src2,1,`notl	%ecx')
+	M4_i	(%eax), %ecx
+ifelse(M4_i_neg_dst,1,`	notl	%ecx')
+	popl	%ebx
+	movl	%ecx, (%edx)
+	ret
+
+
+L(still_two_or_more):
+ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
+	pcmpeqd	%mm7, %mm7	C all ones
+')
+
+	ALIGN(16)
+L(top):
+	C eax	src1
+	C ebx	src2
+	C ecx	counter
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+	C
+	C carry bit is low of size
+
+	movq	-8(%ebx,%ecx,8), %mm0
+ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
+	M4_p	-8(%eax,%ecx,8), %mm0
+ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
+	movq	%mm0, -8(%edx,%ecx,8)
+
+	loop	L(top)
+
+
+	C carry from the shrl above survives the mmx loop; set means an odd
+	C limb remains
+	jnc	L(no_extra)
+
+	movl	-4(%ebx,%esi,4), %ebx
+ifelse(M4_i_neg_src2,1,`notl	%ebx')
+	M4_i	-4(%eax,%esi,4), %ebx
+ifelse(M4_i_neg_dst,1,`	notl	%ebx')
+	movl	%ebx, -4(%edx,%esi,4)
+L(no_extra):
+
+	popl	%esi
+	popl	%ebx
+	emms_or_femms
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/lshift.asm b/rts/gmp/mpn/x86/k6/mmx/lshift.asm
new file mode 100644
index 0000000000..f1dc83db46
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/lshift.asm
@@ -0,0 +1,122 @@
+dnl AMD K6 mpn_lshift -- mpn left shift.
+dnl
+dnl K6: 3.0 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shldl( %cl, %edx, %eax) C return value
+
+ shll %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+ nop C avoid bad cache line crossing
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx,%eax,4), %edx C src high limb
+ negl %ecx
+
+ movd PARAM_SHIFT, %mm6
+ addl $32, %ecx C 32-shift
+
+ shrl %cl, %edx
+
+ movd %ecx, %mm7
+ movl PARAM_DST, %ecx
+
+L(top):
+ C eax counter, size-1 to 1
+ C ebx src
+ C ecx dst
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ movd %mm0, 4(%ecx,%eax,4)
+ jnz L(top)
+
+
+ movd (%ebx), %mm0
+ popl %ebx
+
+ psllq %mm6, %mm0
+ movl %edx, %eax
+
+ movd %mm0, (%ecx)
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/popham.asm b/rts/gmp/mpn/x86/k6/mmx/popham.asm
new file mode 100644
index 0000000000..2c619252bb
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/popham.asm
@@ -0,0 +1,238 @@
+dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
+dnl hamming distance.
+dnl
+dnl popcount hamdist
+dnl K6-2: 9.0 11.5 cycles/limb
+dnl K6: 12.5 13.0
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here isn't optimal, but it's already a 2x speedup over the plain
+C integer mpn/generic/popcount.c,hamdist.c.
+
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
+')m4exit(1)')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+
+ DATA
+ ALIGN(8)
+
+define(LS,
+m4_assert_numargs(1)
+`LF(M4_function,`$1')')
+
+LS(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+
+LS(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+
+LS(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+
+LS(rodata_000000FF000000FF):
+ .long 0x000000FF
+ .long 0x000000FF
+')
+
+ .text
+ ALIGN(32)
+
+POP(`ifdef(`PIC', `
+ C avoid shrl crossing a 32-byte boundary
+ nop')')
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ orl %ecx, %ecx
+ jz L(zero)
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %eax
+ movl $0x33333333, %edx
+
+ movd %eax, %mm7
+ movd %edx, %mm6
+
+ movl $0x0F0F0F0F, %eax
+ movl $0x000000FF, %edx
+
+ punpckldq %mm7, %mm7
+ punpckldq %mm6, %mm6
+
+ movd %eax, %mm5
+ movd %edx, %mm4
+
+ punpckldq %mm5, %mm5
+ punpckldq %mm4, %mm4
+',`
+
+ movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq LS(rodata_3333333333333333), %mm6
+ movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
+ movq LS(rodata_000000FF000000FF), %mm4
+')
+
+define(REG_AAAAAAAAAAAAAAAA, %mm7)
+define(REG_3333333333333333, %mm6)
+define(REG_0F0F0F0F0F0F0F0F, %mm5)
+define(REG_000000FF000000FF, %mm4)
+
+
+ movl PARAM_SRC, %eax
+HAM(` movl PARAM_SRC2, %edx')
+
+ pxor %mm2, %mm2 C total
+
+ shrl %ecx
+ jnc L(top)
+
+Zdisp( movd, 0,(%eax,%ecx,8), %mm1)
+
+HAM(`
+Zdisp( movd, 0,(%edx,%ecx,8), %mm0)
+ pxor %mm0, %mm1
+')
+
+ incl %ecx
+ jmp L(loaded)
+
+
+ ALIGN(16)
+POP(` nop C alignment to avoid crossing 32-byte boundaries')
+
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, qwords, decrementing
+ C edx [hamdist] src2
+ C
+ C mm0 (scratch)
+ C mm1 (scratch)
+ C mm2 total (low dword)
+ C mm3
+ C mm4 \
+ C mm5 | special constants
+ C mm6 |
+ C mm7 /
+
+ movq -8(%eax,%ecx,8), %mm1
+HAM(` pxor -8(%edx,%ecx,8), %mm1')
+
+L(loaded):
+ movq %mm1, %mm0
+ pand REG_AAAAAAAAAAAAAAAA, %mm1
+
+ psrlq $1, %mm1
+HAM(` nop C code alignment')
+
+ psubd %mm1, %mm0 C bit pairs
+HAM(` nop C code alignment')
+
+
+ movq %mm0, %mm1
+ psrlq $2, %mm0
+
+ pand REG_3333333333333333, %mm0
+ pand REG_3333333333333333, %mm1
+
+ paddd %mm1, %mm0 C nibbles
+
+
+ movq %mm0, %mm1
+ psrlq $4, %mm0
+
+ pand REG_0F0F0F0F0F0F0F0F, %mm0
+ pand REG_0F0F0F0F0F0F0F0F, %mm1
+
+ paddd %mm1, %mm0 C bytes
+
+ movq %mm0, %mm1
+ psrlq $8, %mm0
+
+
+ paddb %mm1, %mm0 C words
+
+
+ movq %mm0, %mm1
+ psrlq $16, %mm0
+
+ paddd %mm1, %mm0 C dwords
+
+ pand REG_000000FF000000FF, %mm0
+
+ paddd %mm0, %mm2 C low to total
+ psrlq $32, %mm0
+
+ paddd %mm0, %mm2 C high to total
+ loop L(top)
+
+
+
+ movd %mm2, %eax
+ emms_or_femms
+ ret
+
+L(zero):
+ movl $0, %eax
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/rshift.asm b/rts/gmp/mpn/x86/k6/mmx/rshift.asm
new file mode 100644
index 0000000000..cc5948f26c
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/rshift.asm
@@ -0,0 +1,122 @@
+dnl AMD K6 mpn_rshift -- mpn right shift.
+dnl
+dnl K6: 3.0 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shrdl( %cl, %edx, %eax) C return value
+
+ shrl %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx), %edx C src low limb
+ negl %ecx
+
+ addl $32, %ecx C 32-shift
+ movd PARAM_SHIFT, %mm6
+
+ shll %cl, %edx C retval
+ movl PARAM_DST, %ecx
+
+ leal (%ebx,%eax,4), %ebx
+
+ leal -4(%ecx,%eax,4), %ecx
+ negl %eax
+
+
+L(simple):
+ C eax counter (negative)
+ C ebx &src[size-1]
+ C ecx &dst[size-1]
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+
+Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
+ jnz L(simple)
+
+
+ movq %mm0, (%ecx)
+ movl %edx, %eax
+
+ popl %ebx
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mul_1.asm b/rts/gmp/mpn/x86/k6/mul_1.asm
new file mode 100644
index 0000000000..c2220fe4ca
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mul_1.asm
@@ -0,0 +1,272 @@
+dnl AMD K6 mpn_mul_1 -- mpn by limb multiply.
+dnl
+dnl K6: 6.25 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier, mp_limb_t carry);
+C
+C Multiply src,size by mult and store the result in dst,size.
+C Return the carry limb from the top of the result.
+C
+C mpn_mul_1c() accepts an initial carry for the calculation, it's added into
+C the low limb of the result.
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_mul_1c)
+ pushl %esi
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %esi
+ jmp LF(mpn_mul_1,start_nc)
+EPILOGUE()
+
+
+PROLOGUE(mpn_mul_1)
+ push %esi
+deflit(`FRAME',4)
+ xorl %esi, %esi C initial carry
+
+L(start_nc):
+ mov PARAM_SIZE, %ecx
+ push %ebx
+FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ push %edi
+FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ pushl %ebp
+FRAME_pushl()
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ jae L(unroll)
+
+
+ C code offset 0x22 here, close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx src
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst
+ C ebp multiplier
+ C
+ C this loop 8 cycles/limb
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi)
+ addl $4, %edi
+
+ loop L(simple)
+
+
+ popl %ebp
+
+ popl %edi
+ popl %ebx
+
+ movl %esi, %eax
+ popl %esi
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C The code for each limb is 6 cycles, with instruction decoding being the
+C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
+C cycles/limb in total.
+C
+C The secret ingredient to get 6.25 is to start the loop with the mul and
+C have the load/store pair at the end. Rotating the load/store to the top
+C is a 0.5 c/l slowdown.  (Some address generation effect probably.)
+C
+C The whole unrolled loop fits nicely in exactly 80 bytes.
+
+
+ ALIGN(16) C already aligned to 16 here actually
+L(unroll):
+ movl (%ebx), %eax
+ leal -16(%ebx,%ecx,4), %ebx
+
+ leal -16(%edi,%ecx,4), %edi
+ subl $4, %ecx
+
+ negl %ecx
+
+
+ ALIGN(16) C one byte nop for this alignment
+L(top):
+ C eax scratch
+ C ebx &src[size-4]
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi &dst[size-4]
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ movl 4(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 4(%edi,%ecx,4)
+ movl 8(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 8(%edi,%ecx,4)
+ movl 12(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 12(%edi,%ecx,4)
+ movl 16(%ebx,%ecx,4), %eax
+
+
+ addl $4, %ecx
+ js L(top)
+
+
+
+ C eax next src limb
+ C ebx &src[size-4]
+ C ecx 0 to 3 representing respectively 4 to 1 further limbs
+ C edx
+ C esi carry
+ C edi &dst[size-4]
+
+ testb $2, %cl
+ jnz L(finish_not_two)
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ movl 4(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 4(%edi,%ecx,4)
+ movl 8(%ebx,%ecx,4), %eax
+
+ addl $2, %ecx
+L(finish_not_two):
+
+
+ testb $1, %cl
+ jnz L(finish_not_one)
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 8(%edi)
+ movl 12(%ebx), %eax
+L(finish_not_one):
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ popl %ebp
+
+ adcl $0, %edx
+
+ movl %eax, 12(%edi)
+ popl %edi
+
+ popl %ebx
+ movl %edx, %eax
+
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mul_basecase.asm b/rts/gmp/mpn/x86/k6/mul_basecase.asm
new file mode 100644
index 0000000000..1f5a3a4b4b
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mul_basecase.asm
@@ -0,0 +1,600 @@
+dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
+dnl
+dnl K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
+dnl unrolling).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6: UNROLL_COUNT cycles/product (approx)
+dnl 8 9.75
+dnl 16 9.3
+dnl 32 9.3
+dnl Maximum possible with the current code is 32.
+dnl
+dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which
+dnl might explain its good performance.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() entry code only
+C once. The saving is about 10-20% on typical sizes coming from the
+C Karatsuba multiply code.
+C
+C Future:
+C
+C The unrolled loop could be shared by mpn_addmul_1, with some extra stack
+C setups and maybe 2 or 3 wasted cycles at the end. Code saving would be
+C 256 bytes.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+ movl (%eax), %eax C yp low limb
+
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two_limbs)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ movl (%edx), %edx C xp low limb
+ movl PARAM_WP, %ecx
+
+ mull %edx
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+ decl PARAM_YSIZE
+ pushl %ebx
+deflit(`FRAME',4)
+
+ movl PARAM_WP, %ebx
+ pushl %esi
+deflit(`FRAME',8)
+
+ movl %eax, %ecx C yp low limb
+ movl (%edx), %eax C xp low limb
+
+ movl %edx, %esi C xp
+ jnz L(two_by_two)
+
+
+ C two limbs by one limb
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+ movl %esi, 4(%ebx)
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ popl %esi
+
+ popl %ebx
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(two_by_two):
+ C eax xp low limb
+ C ebx wp
+ C ecx yp low limb
+ C edx
+ C esi xp
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ mull %ecx C xp[0] * yp[0]
+
+ push %edi
+deflit(`FRAME',12)
+ movl %eax, (%ebx)
+
+ movl 4(%esi), %eax
+ movl %edx, %edi C carry, for wp[1]
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+
+ movl %edi, 4(%ebx)
+ movl 4(%ecx), %ecx C yp[1]
+
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+
+ adcl $0, %edx
+
+ movl (%esi), %eax C xp[0]
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ adcl %edx, %edi
+ adcl $0, %esi
+
+ movl %edi, 8(%ebx)
+ popl %edi
+
+ movl %esi, 12(%ebx)
+ popl %esi
+
+ popl %ebx
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two_limbs):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline. Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 many times). A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 10-20
+C limb operations the Karatsuba code calls here with.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',0)
+
+ pushl %edi defframe_pushl(SAVE_EDI)
+ pushl %ebp defframe_pushl(SAVE_EBP)
+
+ movl PARAM_WP, %edi
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl %eax, %ebp
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ leal (%edx,%ecx,4), %ebx C xp end
+ xorl %esi, %esi
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx xp end
+ C ecx counter, negative
+ C edx scratch
+ C esi carry
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%ebx,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ incl %ecx
+
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+ movl %esi, (%edi) C final carry
+
+ movl PARAM_XSIZE, %ecx
+ decl %edx
+
+ jnz L(ysize_more_than_one_limb)
+
+ popl %ebx
+ popl %esi
+ popl %ebp
+ popl %edi
+ ret
+
+
+L(ysize_more_than_one_limb):
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+C Simple addmul loop.
+C
+C Using ebx and edi pointing at the ends of their respective locations saves
+C a couple of instructions in the outer loop. The inner loop is still 11
+C cycles, the same as the simple loop in aorsmul_1.asm.
+
+ C eax yp
+ C ebx xp end
+ C ecx xsize
+ C edx ysize-1
+ C esi
+ C edi wp end of mul1
+ C ebp
+
+ movl 4(%eax), %ebp C multiplier
+ negl %ecx
+
+ movl %ecx, PARAM_XSIZE C -xsize
+ xorl %esi, %esi C initial carry
+
+ leal 4(%eax,%edx,4), %eax C yp end
+ negl %edx
+
+ movl %eax, PARAM_YP
+ movl %edx, PARAM_YSIZE
+
+ jmp L(simple_outer_entry)
+
+
+ C aligning here saves a couple of cycles
+ ALIGN(16)
+L(simple_outer_top):
+ C edx ysize counter, negative
+
+ movl PARAM_YP, %eax C yp end
+ xorl %esi, %esi C carry
+
+ movl PARAM_XSIZE, %ecx C -xsize
+ movl %edx, PARAM_YSIZE
+
+ movl (%eax,%edx,4), %ebp C yp limb multiplier
+L(simple_outer_entry):
+ addl $4, %edi
+
+
+L(simple_inner):
+ C eax scratch
+ C ebx xp end
+ C ecx counter, negative
+ C edx scratch
+ C esi carry
+ C edi wp end of this addmul
+ C ebp multiplier
+
+ movl (%ebx,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl $0, %edx
+ addl %eax, (%edi,%ecx,4)
+ adcl %edx, %esi
+
+ incl %ecx
+ jnz L(simple_inner)
+
+
+ movl PARAM_YSIZE, %edx
+ movl %esi, (%edi)
+
+ incl %edx
+ jnz L(simple_outer_top)
+
+
+ popl %ebx
+ popl %esi
+ popl %ebp
+ popl %edi
+ ret
+
+
+C -----------------------------------------------------------------------------
+C Unrolled loop.
+C
+C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
+C some comments.
+C
+C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
+C 0, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
+C is entered.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop. This can't just be fetched through the xp
+C pointer because of the offset applied to it.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
+C to give the starting point in the destination for each unrolled loop (this
+C point is one limb upwards for each limb of yp processed).
+C
+C Having PARAM_YSIZE count negative to zero means it's not necessary to
+C store new values of PARAM_YP and PARAM_WP on each loop. Those values on
+C the stack remain constant and on each loop an leal adjusts them with the
+C PARAM_YSIZE counter value.
+
+
+defframe(VAR_COUNTER, -20)
+defframe(VAR_COUNTER_INIT, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_XP_LOW, -32)
+deflit(VAR_STACK_SPACE, 16)
+
+dnl For some strange reason using (%esp) instead of 0(%esp) is a touch
+dnl slower in this code, hence the defframe empty-if-zero feature is
+dnl disabled.
+dnl
+dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the
+dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects
+dnl anything isn't clear.
+dnl
+define(`defframe_empty_if_zero_disabled',1)
+
+L(unroll):
+ C eax yp (not used)
+ C ebx xp end (not used)
+ C ecx xsize
+ C edx ysize-1
+ C esi
+ C edi wp end of mul1 (not used)
+ C ebp
+deflit(`FRAME', 16)
+
+ leal -2(%ecx), %ebp C one limb processed at start,
+ decl %ecx C and ebp is one less
+
+ shrl $UNROLL_LOG2, %ebp
+ negl %ecx
+
+ subl $VAR_STACK_SPACE, %esp
+deflit(`FRAME', 16+VAR_STACK_SPACE)
+ andl $UNROLL_MASK, %ecx
+
+ movl %ecx, %esi
+ shll $4, %ecx
+
+ movl %ebp, VAR_COUNTER_INIT
+ negl %esi
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_entry) (%ecx,%esi,1), %ecx
+')
+
+ movl PARAM_XP, %ebx
+ movl %ebp, VAR_COUNTER
+
+ movl PARAM_WP, %edi
+ movl %ecx, VAR_JMP
+
+ movl (%ebx), %eax
+ leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1
+
+ leal (%ebx,%esi,4), %ebx C xp adjust for unrolling
+
+ movl %eax, VAR_XP_LOW
+
+ movl %ebx, PARAM_XP
+ movl PARAM_YP, %ebx
+
+ leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing
+ movl 4(%ebx), %ebp C multiplier (yp second limb)
+
+ leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing
+
+ movl %ecx, PARAM_WP
+
+ leal 1(%esi), %ecx C adjust parity for decl %ecx above
+
+ movl %ebx, PARAM_YP
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%ecx,%esi,1), %ecx
+ addl $L(unroll_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ C Aligning here saves a couple of cycles per loop. Using 32 doesn't
+ C cost any extra space, since the inner unrolled loop below is
+ C aligned to 32.
+ ALIGN(32)
+L(unroll_outer_top):
+ C edx ysize
+
+ movl PARAM_YP, %eax
+ movl %edx, PARAM_YSIZE C incremented ysize counter
+
+ movl PARAM_WP, %edi
+
+ movl VAR_COUNTER_INIT, %ebx
+ movl (%eax,%edx,4), %ebp C next multiplier
+
+ movl PARAM_XSIZE, %ecx
+ leal (%edi,%edx,4), %edi C adjust wp for where we are in yp
+
+ movl VAR_XP_LOW, %eax
+ movl %ebx, VAR_COUNTER
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ C using testb is a tiny bit faster than testl
+ testb $1, %cl
+
+ movl %eax, %ecx C low carry
+ movl VAR_JMP, %eax
+
+ movl %edx, %esi C high carry
+ movl PARAM_XP, %ebx
+
+ jnz L(unroll_noswap)
+ movl %ecx, %esi C high,low carry other way around
+
+ movl %edx, %ecx
+L(unroll_noswap):
+
+ jmp *%eax
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_top):
+ C eax scratch
+ C ebx xp
+ C ecx carry low
+ C edx scratch
+ C esi carry high
+ C edi wp
+ C ebp multiplier
+ C VAR_COUNTER loop counter
+ C
+ C 15 code bytes each limb
+
+ leal UNROLL_BYTES(%edi), %edi
+
+L(unroll_entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4))
+ deflit(`disp1', eval(disp0 + 4))
+ deflit(`disp2', eval(disp1 + 4))
+
+ movl disp1(%ebx), %eax
+ mull %ebp
+Zdisp( addl, %ecx, disp0,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+
+ movl disp2(%ebx), %eax
+ mull %ebp
+ addl %esi, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%ebx), %ebx
+
+ jns L(unroll_top)
+
+
+ movl PARAM_YSIZE, %edx
+ addl %ecx, UNROLL_BYTES(%edi)
+
+ adcl $0, %esi
+
+ incl %edx
+ movl %esi, UNROLL_BYTES+4(%edi)
+
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+
+ addl $FRAME, %esp
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/sqr_basecase.asm b/rts/gmp/mpn/x86/k6/sqr_basecase.asm
new file mode 100644
index 0000000000..70d49b3e57
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/sqr_basecase.asm
@@ -0,0 +1,672 @@
+dnl AMD K6 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
+dnl product (measured on the speed difference between 17 and 33 limbs,
+dnl which is roughly the Karatsuba recursing range).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this
+dnl code supports. This value is used only by the tune program to know
+dnl what it can go up to. (An attempt to compile with a bigger value will
+dnl trigger some m4_assert()s in the code, making the build fail.)
+dnl
+dnl The value is determined by requiring the displacements in the unrolled
+dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of
+dnl 63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66.
+
+deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
+
+
+dnl Allow a value from the tune program to override config.m4.
+
+ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
+`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
+
+
+dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The
+dnl number required is determined by KARATSUBA_SQR_THRESHOLD, since
+dnl mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD.
+dnl
+dnl The first addmul is the biggest, and this takes the second least
+dnl significant limb and multiplies it by the third least significant and
+dnl up. Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1
+dnl limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3.
+
+m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
+deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed
+C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 35x35 that do need all of it will
+C at least be getting value for money, because 35x35 spends something like
+C 5780 cycles here.
+C
+C Different values of UNROLL_COUNT give slightly different speeds, between
+C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
+C This isn't a big difference, but it's presumably some alignment effect
+C which if understood could give a simple speedup.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %ecx
+ je L(two_limbs)
+
+ movl PARAM_DST, %edx
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+
+ movl (%eax), %eax
+ movl %edx, %ecx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+
+ pushl %ebx
+ movl %eax, %ebx C src
+deflit(`FRAME',4)
+
+ movl (%ebx), %eax
+ movl PARAM_DST, %ecx
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx)
+
+ mull %eax C src[1]^2
+
+ movl %eax, 8(%ecx)
+ movl (%ebx), %eax
+
+ movl %edx, 12(%ecx)
+ movl 4(%ebx), %edx
+
+ mull %edx C src[0]*src[1]
+
+ addl %eax, 4(%ecx)
+
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+
+ popl %ebx
+ addl %eax, 4(%ecx)
+
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+deflit(`FRAME',0)
+ cmpl $4, %ecx
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+ C eax src
+ C ecx size
+ C edx dst
+
+ pushl %ebx
+ movl %eax, %ebx C src
+
+ movl (%ebx), %eax
+ movl %edx, %ecx C dst
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx)
+ pushl %esi
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl 8(%ebx), %eax
+
+ movl %edx, 12(%ecx)
+ pushl %edi
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl (%ebx), %eax
+
+ movl %edx, 20(%ecx)
+ movl 4(%ebx), %edx
+
+ mull %edx C src[0] * src[1]
+
+ movl %eax, %esi
+ movl (%ebx), %eax
+
+ movl %edx, %edi
+ movl 8(%ebx), %edx
+
+ pushl %ebp
+ xorl %ebp, %ebp
+
+ mull %edx C src[0] * src[2]
+
+ addl %eax, %edi
+ movl 4(%ebx), %eax
+
+ adcl %edx, %ebp
+
+ movl 8(%ebx), %edx
+
+ mull %edx C src[1] * src[2]
+
+ addl %eax, %ebp
+
+ adcl $0, %edx
+
+
+ C eax will be dst[5]
+ C ebx
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ xorl %eax, %eax
+ addl %esi, %esi
+ adcl %edi, %edi
+ adcl %ebp, %ebp
+ adcl %edx, %edx
+ adcl $0, %eax
+
+ addl %esi, 4(%ecx)
+ adcl %edi, 8(%ecx)
+ adcl %ebp, 12(%ecx)
+
+ popl %ebp
+ popl %edi
+
+ adcl %edx, 16(%ecx)
+
+ popl %esi
+ popl %ebx
+
+ adcl %eax, 20(%ecx)
+ ASSERT(nc)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP, -24)
+deflit(STACK_SPACE, 24)
+
+ ALIGN(16)
+L(four_or_more):
+
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C
+C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
+C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
+C a 5780 cycle operation, which is not surprising since the loop here is 8
+C c/l and mpn_mul_1 is 6.25 c/l.
+
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ leal 4(%edx), %edi
+
+ movl %ebx, SAVE_EBX
+ leal 4(%eax), %ebx
+
+ movl %esi, SAVE_ESI
+ xorl %esi, %esi
+
+ movl %ebp, SAVE_EBP
+
+ C eax
+ C ebx src+4
+ C ecx size
+ C edx
+ C esi
+ C edi dst+4
+ C ebp
+
+ movl (%eax), %ebp C multiplier
+ leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary
+
+
+ ALIGN(16)
+L(mul_1):
+ C eax scratch
+ C ebx src ptr
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst ptr
+ C ebp multiplier
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi)
+ addl $4, %edi
+
+ loop L(mul_1)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K6 doesn't do any branch prediction on indirect jumps, which is good
+C actually because it's a different target each time. The unrolled addmul
+C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
+C the indirect jump is quickly recovered.
+
+
+dnl This value is also implicitly encoded in a shift and add.
+dnl
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl With the unmodified &src[size] and &dst[size] pointers, the
+dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl values up to 31. Above that an offset must be added to them.
+dnl
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+ C eax
+ C ebx &src[size]
+ C ecx
+ C edx
+ C esi carry
+ C edi &dst[size]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, (%edi)
+
+ subl $4, %ecx
+ jz L(corner)
+
+ movl %ecx, %edx
+ifelse(OFFSET,0,,
+` subl $OFFSET, %ebx')
+
+ shll $4, %ecx
+ifelse(OFFSET,0,,
+` subl $OFFSET, %edi')
+
+ negl %ecx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+ negl %edx
+
+
+ C The calculated jump mustn't be before the start of the available
+ C code. This is the limitation UNROLL_COUNT puts on the src operand
+ C size, but checked here using the jump address directly.
+ C
+ ASSERT(ae,`
+ movl_text_address( L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx
+ ')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx &src[size], constant
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi high limb to store
+ C edi dst ptr, high of last addmul
+ C ebp
+
+ movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier
+ movl %edx, VAR_COUNTER
+
+ movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand
+
+ mull %ebp
+
+ testb $1, %cl
+
+ movl %edx, %esi C high carry
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ leal CODE_BYTES_PER_LIMB(%edx), %edx
+
+ movl %edx, VAR_JMP
+ leal 4(%edi), %edi
+
+ C A branch-free version of this using some xors was found to be a
+ C touch slower than just a conditional jump, despite the jump
+ C switching between taken and not taken on every loop.
+
+ifelse(eval(UNROLL_COUNT%2),0,
+ jz,jnz) L(unroll_noswap)
+ movl %esi, %eax C high,low carry other way around
+
+ movl %ecx, %esi
+ movl %eax, %ecx
+L(unroll_noswap):
+
+ jmp *%edx
+
+
+ C Must be on an even address here so the low bit of the jump address
+ C will indicate which way around ecx/esi should start.
+ C
+ C An attempt was made at padding here to get the end of the unrolled
+ C code to come out on a good alignment, to save padding before
+ C L(corner). This worked, but turned out to run slower than just an
+	C ALIGN(2).  The reason for this is not clear; it might be related
+ C to the different speeds on different UNROLL_COUNTs noted above.
+
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax scratch
+ C ebx src
+ C ecx carry low
+ C edx scratch
+ C esi carry high
+ C edi dst
+ C ebp multiplier
+ C
+ C 15 code bytes each limb
+ C ecx/esi swapped on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src - 4))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%ebx), %eax)
+ mull %ebp
+Zdisp( addl, %esi, disp_dst,(%edi))
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+',`
+ dnl this one comes out last
+Zdisp( movl, disp_src,(%ebx), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+')
+')
+L(unroll_inner_end):
+
+ addl %esi, -4+OFFSET(%edi)
+
+ movl VAR_COUNTER, %edx
+ jadcl0( %ecx)
+
+ movl %ecx, m4_empty_if_zero(OFFSET)(%edi)
+ movl VAR_JMP, %ecx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %ebx
+ addl $OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(corner):
+ C ebx &src[size]
+ C edi &dst[2*size-5]
+
+ movl -12(%ebx), %ebp
+
+ movl -8(%ebx), %eax
+ movl %eax, %ecx
+
+ mull %ebp
+
+ addl %eax, -4(%edi)
+ adcl $0, %edx
+
+ movl -4(%ebx), %eax
+ movl %edx, %esi
+ movl %eax, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ adcl $0, %edx
+
+ addl %eax, (%edi)
+ adcl $0, %edx
+
+ movl %edx, %esi
+ movl %ebx, %eax
+
+ mull %ecx
+
+ addl %esi, %eax
+ movl %eax, 4(%edi)
+
+ adcl $0, %edx
+
+ movl %edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+C The loop measures about 6 cycles/iteration, though it looks like it should
+C decode in 5.
+
+L(lshift_start):
+ movl PARAM_SIZE, %ecx
+
+ movl PARAM_DST, %edi
+ subl $1, %ecx C size-1 and clear carry
+
+ movl PARAM_SRC, %ebx
+ movl %ecx, %edx
+
+ xorl %eax, %eax C ready for adcl
+
+
+ ALIGN(16)
+L(lshift):
+ C eax
+ C ebx src (for later use)
+ C ecx counter, decrementing
+ C edx size-1 (for later use)
+ C esi
+ C edi dst, incrementing
+ C ebp
+
+ rcll 4(%edi)
+ rcll 8(%edi)
+ leal 8(%edi), %edi
+ loop L(lshift)
+
+
+ adcl %eax, %eax
+
+ movl %eax, 4(%edi) C dst most significant limb
+ movl (%ebx), %eax C src[0]
+
+ leal 4(%ebx,%edx,4), %ebx C &src[size]
+ subl %edx, %ecx C -(size-1)
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't yet been set at all, and just gets the
+C low limb of src[0]^2.
+
+
+ mull %eax
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+
+
+ ALIGN(16)
+L(diag):
+ C eax scratch
+ C ebx &src[size]
+ C ecx counter, negative
+ C edx carry
+ C esi scratch
+ C edi dst[2*size-2]
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ movl %edx, %esi
+
+ mull %eax
+
+ addl %esi, 4(%edi,%ecx,8)
+ adcl %eax, 8(%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ addl %edx, 4(%edi) C dst most significant limb
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ addl (%esp), %ecx
+ addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+ addl %edx, %ecx
+ ret
+')
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/README b/rts/gmp/mpn/x86/k7/README
new file mode 100644
index 0000000000..c34315c401
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/README
@@ -0,0 +1,145 @@
+
+ AMD K7 MPN SUBROUTINES
+
+
+This directory contains code optimized for the AMD Athlon CPU.
+
+The mmx subdirectory has routines using MMX instructions. All Athlons have
+MMX, the separate directory is just so that configure can omit it if the
+assembler doesn't support MMX.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache.
+
+ cycles/limb
+ mpn_add/sub_n 1.6
+
+ mpn_copyi 0.75 or 1.0 \ varying with data alignment
+ mpn_copyd 0.75 or 1.0 /
+
+ mpn_divrem_1 17.0 integer part, 15.0 fractional part
+ mpn_mod_1 17.0
+ mpn_divexact_by3 8.0
+
+ mpn_l/rshift 1.2
+
+ mpn_mul_1 3.4
+ mpn_addmul/submul_1 3.9
+
+ mpn_mul_basecase 4.42 cycles/crossproduct (approx)
+
+ mpn_popcount 5.0
+ mpn_hamdist 6.0
+
+Prefetching of sources hasn't yet been tried.
+
+
+
+NOTES
+
+cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available.
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+Unsigned "mul"s can be issued every 3 cycles. This suggests 3 is a limit on
+the speed of the multiplication routines. The documentation shows mul
+executing in IEU0 (or maybe in IEU0 and IEU1 together), so it might be that,
+to get near 3 cycles code has to be arranged so that nothing else is issued
+to IEU0. A busy IEU0 could explain why some code takes 4 cycles and other
+apparently equivalent code takes 5.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead. The unrolling is
+configurable up to 32 limbs/loop for most routines and up to 64 for some.
+The K7 has 64k L1 code cache so quite big unrolling is allowable.
+
+Computed jumps into the unrolling are used to handle sizes not a multiple of
+the unrolling. An attractive feature of this is that times increase
+smoothly with operand size, but it may be that some routines should just
+have simple loops to finish up, especially when PIC adds between 2 and 16
+cycles to get %eip.
+
+Position independent code is implemented using a call to get %eip for the
+computed jumps and a ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory.
+
+Branch prediction, in absence of any history, will guess forward jumps are
+not taken and backward jumps are taken. Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three direct-path instructions which have no
+successive dependencies. K7 always decodes three and has out-of-order
+execution, but the groupings show what slots might be available and what
+dependency chains exist.
+
+When there's vector-path instructions an effort is made to get triplets of
+direct-path instructions in between them, even if there's dependencies,
+since this maximizes decoding throughput and might save a cycle or two if
+decoding is the limiting factor.
+
+
+
+INSTRUCTIONS
+
+adcl direct
+divl 39 cycles back-to-back
+lodsl,etc vector
+loop 1 cycle vector (decl/jnz opens up one decode slot)
+movd reg vector
+movd mem direct
+mull issue every 3 cycles, latency 4 cycles low word, 6 cycles high word
+popl vector (use movl for more than one pop)
+pushl direct, will pair with a load
+shrdl %cl vector, 3 cycles, seems to be 3 decode too
+xorl r,r false read dependency recognised
+
+
+
+REFERENCES
+
+"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number
+22007, revision E, November 1999. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22007.pdf
+
+"3DNow Technology Manual", AMD publication number 21928F/0-August 1999.
+This describes the femms and prefetch instructions. Available on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21928.pdf
+
+"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD
+publication number 22466, revision B, August 1999. This describes
+instructions added in the Athlon processor, such as pswapd and the extra
+prefetch forms. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22466.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999. This has some notes on general Athlon optimizations as well as
+3DNow. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/k7/aors_n.asm b/rts/gmp/mpn/x86/k7/aors_n.asm
new file mode 100644
index 0000000000..85fa9d3036
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/aors_n.asm
@@ -0,0 +1,250 @@
+dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
+dnl
+dnl K7: 1.64 cycles/limb (at 16 limb/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 8 1.9
+dnl 16 1.64
+dnl 32 1.7
+dnl 64 2.0
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result (1
+C or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C This code runs at 1.64 cycles/limb, which is probably the best possible
+C with plain integer operations. Each limb is 2 loads and 1 store, and in
+C one cycle the K7 can do two loads, or a load and a store, leading to 1.5
+C c/l.
+
+dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBP, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+defframe(SAVE_EDI, -16)
+deflit(STACK_SPACE, 16)
+
+ .text
+ ALIGN(32)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function_nc)
+ movl PARAM_CARRY, %eax
+ jmp LF(M4_function_n,start)
+EPILOGUE()
+
+PROLOGUE(M4_function_n)
+
+ xorl %eax, %eax C carry
+L(start):
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ movl %ebx, SAVE_EBX
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_SRC2, %edx
+ movl PARAM_SRC1, %ebx
+ jae L(unroll)
+
+ movl PARAM_DST, %edi
+ leal (%ebx,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %edx
+
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+ shrl %eax
+
+	C This loop is in a single 16 byte code block already, so no
+	C alignment necessary.
+L(simple):
+ C eax scratch
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ M4_inst (%edx,%ecx,4), %eax
+ movl %eax, (%edi,%ecx,4)
+ incl %ecx
+ jnz L(simple)
+
+ movl $0, %eax
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ setc %al
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ C This is at 0x55, close enough to aligned.
+L(unroll):
+deflit(`FRAME',STACK_SPACE)
+ movl %ebp, SAVE_EBP
+ andl $-2, %ecx C size low bit masked out
+ andl $1, PARAM_SIZE C size low bit kept
+
+ movl %ecx, %edi
+ decl %ecx
+ movl PARAM_DST, %ebp
+
+ shrl $UNROLL_LOG2, %ecx
+ negl %edi
+ movl %esi, SAVE_ESI
+
+ andl $UNROLL_MASK, %edi
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
+')
+ negl %edi
+ shrl %eax
+
+ leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
+ leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
+
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%edi,%edi,8), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax zero
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi scratch (was computed jump)
+ C edi dst
+ C ebp scratch
+
+ leal UNROLL_BYTES(%edx), %edx
+
+L(entry):
+deflit(CHUNK_COUNT, 2)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%ebx), %esi)
+ movl disp1(%ebx), %ebp
+Zdisp( M4_inst,disp0,(%edx), %esi)
+Zdisp( movl, %esi, disp0,(%edi))
+ M4_inst disp1(%edx), %ebp
+ movl %ebp, disp1(%edi)
+')
+
+ decl %ecx
+ leal UNROLL_BYTES(%ebx), %ebx
+ leal UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+ mov PARAM_SIZE, %esi
+ movl SAVE_EBP, %ebp
+ movl $0, %eax
+
+ decl %esi
+ js L(even)
+
+ movl (%ebx), %ecx
+ M4_inst UNROLL_BYTES(%edx), %ecx
+ movl %ecx, (%edi)
+L(even):
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ setc %al
+
+ movl SAVE_ESI, %esi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/aorsmul_1.asm b/rts/gmp/mpn/x86/k7/aorsmul_1.asm
new file mode 100644
index 0000000000..9f9c3daaf4
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/aorsmul_1.asm
@@ -0,0 +1,364 @@
+dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+dnl
+dnl K7: 3.9 cycles/limb.
+dnl
+dnl Future: It should be possible to avoid the separate mul after the
+dnl unrolled loop by moving the movl/adcl to the top.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 4.42
+dnl 8 4.16
+dnl 16 3.9
+dnl 32 3.9
+dnl 64 3.87
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1',`
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1',`
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(SAVE_SIZE, 16)
+
+ .text
+ ALIGN(32)
+PROLOGUE(M4_function_1)
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+ xorl %ecx, %ecx
+
+ decl %edx
+ jnz LF(M4_function_1c,start_1)
+
+ movl (%eax), %eax
+ movl PARAM_DST, %ecx
+
+ mull PARAM_MULTIPLIER
+
+ M4_inst %eax, (%ecx)
+ adcl $0, %edx
+ movl %edx, %eax
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(M4_function_1c)
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+
+ decl %edx
+ jnz L(more_than_one_limb)
+
+ movl (%eax), %eax
+ movl PARAM_DST, %ecx
+
+ mull PARAM_MULTIPLIER
+
+ addl PARAM_CARRY, %eax
+
+ adcl $0, %edx
+ M4_inst %eax, (%ecx)
+
+ adcl $0, %edx
+ movl %edx, %eax
+
+ ret
+
+
+ C offset 0x44 so close enough to aligned
+L(more_than_one_limb):
+ movl PARAM_CARRY, %ecx
+L(start_1):
+ C eax src
+ C ecx initial carry
+ C edx size-1
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',16)
+
+ movl %ebx, SAVE_EBX
+ movl %esi, SAVE_ESI
+ movl %edx, %ebx C size-1
+
+ movl PARAM_SRC, %esi
+ movl %ebp, SAVE_EBP
+ cmpl $UNROLL_THRESHOLD, %edx
+
+ movl PARAM_MULTIPLIER, %ebp
+ movl %edi, SAVE_EDI
+
+ movl (%esi), %eax C src low limb
+ movl PARAM_DST, %edi
+ ja L(unroll)
+
+
+ C simple loop
+
+ leal 4(%esi,%ebx,4), %esi C point one limb past last
+ leal (%edi,%ebx,4), %edi C point at last limb
+ negl %ebx
+
+ C The movl to load the next source limb is done well ahead of the
+ C mul. This is necessary for full speed, and leads to one limb
+ C handled separately at the end.
+
+L(simple):
+ C eax src limb
+ C ebx loop counter
+ C ecx carry limb
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ecx
+ adcl $0, %edx
+
+ M4_inst %ecx, (%edi,%ebx,4)
+ movl (%esi,%ebx,4), %eax
+ adcl $0, %edx
+
+ incl %ebx
+ movl %edx, %ecx
+ jnz L(simple)
+
+
+ mull %ebp
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+
+ addl %eax, %ecx
+ adcl $0, %edx
+
+ M4_inst %ecx, (%edi)
+ adcl $0, %edx
+ movl SAVE_EDI, %edi
+
+ addl $SAVE_SIZE, %esp
+ movl %edx, %eax
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax src low limb
+ C ebx size-1
+ C ecx carry
+ C edx size-1
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+dnl overlapping with parameters no longer needed
+define(VAR_COUNTER,`PARAM_SIZE')
+define(VAR_JUMP, `PARAM_MULTIPLIER')
+
+ subl $2, %ebx C (size-2)-1
+ decl %edx C size-2
+
+ shrl $UNROLL_LOG2, %ebx
+ negl %edx
+
+ movl %ebx, VAR_COUNTER
+ andl $UNROLL_MASK, %edx
+
+ movl %edx, %ebx
+ shll $4, %edx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edx,%ebx,1), %edx
+')
+ negl %ebx
+ movl %edx, VAR_JUMP
+
+ mull %ebp
+
+ addl %eax, %ecx C initial carry, becomes low carry
+ adcl $0, %edx
+ testb $1, %bl
+
+ movl 4(%esi), %eax C src second limb
+ leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi
+
+ movl %edx, %ebx C high carry
+ cmovnz( %ecx, %ebx) C high,low carry other way around
+ cmovnz( %edx, %ecx)
+
+ jmp *VAR_JUMP
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%edx,%ebx,1), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+C This code uses a "two carry limbs" scheme. At the top of the loop the
+C carries are ebx=lo, ecx=hi, then they swap for each limb processed. For
+C the computed jump an odd size means they start one way around, an even
+C size the other. Either way one limb is handled separately at the start of
+C the loop.
+C
+C The positioning of the movl to load the next source limb is important.
+C Moving it after the adcl with a view to avoiding a separate mul at the end
+C of the loop slows the code down.
+
+ ALIGN(32)
+L(top):
+ C eax src limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src+8
+ C edi dst
+ C ebp multiplier
+ C
+ C VAR_COUNTER loop counter
+ C
+ C 17 bytes each limb
+
+L(entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+ mull %ebp
+
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+Zdisp( movl, disp0,(%esi), %eax)
+ adcl %edx, %ecx
+
+
+ mull %ebp
+
+ M4_inst %ebx, disp1(%edi)
+ movl $0, %ebx
+
+ adcl %eax, %ecx
+
+ movl disp1(%esi), %eax
+ adcl %edx, %ebx
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+ leal UNROLL_BYTES(%edi), %edi
+
+ jns L(top)
+
+
+ C eax src limb
+ C ebx carry high
+ C ecx carry low
+ C edx
+ C esi
+ C edi dst (points at second last limb)
+ C ebp multiplier
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 4))
+
+ mull %ebp
+
+ M4_inst %ecx, disp0(%edi)
+ movl SAVE_EBP, %ebp
+
+ adcl %ebx, %eax
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ adcl $0, %edx
+ M4_inst %eax, disp1(%edi)
+ movl SAVE_EDI, %edi
+
+ adcl $0, %edx
+ addl $SAVE_SIZE, %esp
+
+ movl %edx, %eax
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/diveby3.asm b/rts/gmp/mpn/x86/k7/diveby3.asm
new file mode 100644
index 0000000000..57684958a5
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/diveby3.asm
@@ -0,0 +1,131 @@
+dnl AMD K7 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl K7: 8.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+dnl ceil(b/3) and floor(b*2/3) where b=2^32
+deflit(ONE_THIRD_CEIL, 0x55555556)
+deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+C Each quotient limb is produced by multiplying with INVERSE_3, the
+C multiplicative inverse of 3 mod 2^32; comparing the product against
+C ONE_THIRD_CEIL and TWO_THIRDS_FLOOR recovers the borrows, which are
+C carried into the next limb through %ebx.
+
+ movl PARAM_SRC, %ecx
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ movl PARAM_CARRY, %ebx
+ pushl %ebp defframe_pushl(SAVE_EBP)
+
+ movl PARAM_SIZE, %ebp
+ pushl %edi defframe_pushl(SAVE_EDI)
+
+ movl (%ecx), %eax C src low limb
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl PARAM_DST, %edi
+ movl $TWO_THIRDS_FLOOR, %esi
+ leal -4(%ecx,%ebp,4), %ecx C &src[size-1]
+
+ subl %ebx, %eax C apply carry-in to the low limb
+
+ setc %bl C borrow from the carry-in subtract
+ decl %ebp
+ jz L(last)
+
+ leal (%edi,%ebp,4), %edi C &dst[size-1]
+ negl %ebp
+
+
+ ALIGN(16)
+L(top):
+ C eax src limb, carry subtracted
+ C ebx carry limb (0 or 1)
+ C ecx &src[size-1]
+ C edx scratch
+ C esi TWO_THIRDS_FLOOR
+ C edi &dst[size-1]
+ C ebp counter, limbs, negative
+
+ imull $INVERSE_3, %eax, %edx
+
+ movl 4(%ecx,%ebp,4), %eax C next src limb
+ cmpl $ONE_THIRD_CEIL, %edx
+
+ sbbl $-1, %ebx C +1 if result>=ceil(b/3)
+ cmpl %edx, %esi
+
+ sbbl %ebx, %eax C and further 1 if result>=ceil(b*2/3)
+ movl %edx, (%edi,%ebp,4)
+ incl %ebp
+
+ setc %bl C new carry
+ jnz L(top)
+
+
+
+L(last):
+ C eax src limb, carry subtracted
+ C ebx carry limb (0 or 1)
+ C ecx &src[size-1]
+ C edx scratch
+ C esi multiplier
+ C edi &dst[size-1]
+ C ebp
+
+ imull $INVERSE_3, %eax
+
+ cmpl $ONE_THIRD_CEIL, %eax
+ movl %eax, (%edi)
+ movl SAVE_EBP, %ebp
+
+ sbbl $-1, %ebx C +1 if eax>=ceil(b/3)
+ cmpl %eax, %esi
+ movl $0, %eax
+
+ adcl %ebx, %eax C further +1 if eax>=ceil(b*2/3)
+ movl SAVE_EDI, %edi
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/gmp-mparam.h b/rts/gmp/mpn/x86/k7/gmp-mparam.h
new file mode 100644
index 0000000000..c3bba0afc4
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/gmp-mparam.h
@@ -0,0 +1,100 @@
+/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Limb size and basic integer widths for 32-bit x86.  */
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+/* the low limb is ready after 4 cycles, but normally it's the high limb
+ which is of interest, and that comes out after 6 cycles */
+#ifndef UMUL_TIME
+#define UMUL_TIME 6 /* cycles */
+#endif
+
+/* AMD doco says 40, but it measures 39 back-to-back */
+#ifndef UDIV_TIME
+#define UDIV_TIME 39 /* cycles */
+#endif
+
+/* using bsf */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 7 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+/* Algorithm crossover thresholds measured on K7; every value is
+ overridable via the #ifndef guards.  */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 26
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 177
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 52
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 173
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 76
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 114
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 34
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 54
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 720, 1440, 2944, 7680, 18432, 57344, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 736
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 6912
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 784, 1696, 3200, 7680, 18432, 57344, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 800
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 8448
+#endif
diff --git a/rts/gmp/mpn/x86/k7/mmx/copyd.asm b/rts/gmp/mpn/x86/k7/mmx/copyd.asm
new file mode 100644
index 0000000000..33214daa1f
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/copyd.asm
@@ -0,0 +1,136 @@
+dnl AMD K7 mpn_copyd -- copy limb vector, decrementing.
+dnl
+dnl alignment dst/src, A=0mod8 N=4mod8
+dnl A/A A/N N/A N/N
+dnl K7 0.75 1.0 1.0 0.75
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The various comments in mpn/x86/k7/copyi.asm apply here too.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+define(SAVE_ESI,`PARAM_SRC')
+
+dnl minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_copyd)
+
+C Copy working from the top of the vectors downwards.  The unrolled loop
+C moves 4 limbs per iteration with two 64-bit movq pairs, hence the emms
+C before returning.
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ jae L(unroll)
+
+ orl %ecx, %ecx
+ jz L(simple_done)
+
+L(simple):
+ C eax src
+ C ebx scratch
+ C ecx counter
+ C edx dst
+ C
+ C this loop is 2 cycles/limb
+
+ movl -4(%eax,%ecx,4), %ebx
+ movl %ebx, -4(%edx,%ecx,4)
+ decl %ecx
+ jnz L(simple)
+
+L(simple_done):
+ movl SAVE_EBX, %ebx
+ ret
+
+
+L(unroll):
+ movl %esi, SAVE_ESI
+ leal (%eax,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %esi
+
+ andl %esi, %ebx C common alignment bits of src and dst ends
+ movl SAVE_ESI, %esi
+ subl $4, %ecx C size-4
+
+ testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
+ jz L(aligned)
+
+ C both src and dst unaligned, process one limb to align them
+ movl 12(%eax,%ecx,4), %ebx
+ movl %ebx, 12(%edx,%ecx,4)
+ decl %ecx
+L(aligned):
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, limbs
+ C edx dst
+
+ movq 8(%eax,%ecx,4), %mm0
+ movq (%eax,%ecx,4), %mm1
+ subl $4, %ecx
+ movq %mm0, 16+8(%edx,%ecx,4)
+ movq %mm1, 16(%edx,%ecx,4)
+ jns L(top)
+
+
+ C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining
+
+ testb $2, %cl
+ jz L(finish_not_two)
+
+ movq 8(%eax,%ecx,4), %mm0
+ movq %mm0, 8(%edx,%ecx,4)
+L(finish_not_two):
+
+ testb $1, %cl
+ jz L(done)
+
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+
+L(done):
+ movl SAVE_EBX, %ebx
+ emms
+ ret
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/copyi.asm b/rts/gmp/mpn/x86/k7/mmx/copyi.asm
new file mode 100644
index 0000000000..b234a1628c
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/copyi.asm
@@ -0,0 +1,147 @@
+dnl AMD K7 mpn_copyi -- copy limb vector, incrementing.
+dnl
+dnl alignment dst/src, A=0mod8 N=4mod8
+dnl A/A A/N N/A N/N
+dnl K7 0.75 1.0 1.0 0.75
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size.
+C
+C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
+C 1.33 c/l.
+C
+C The K7 can do two loads, or two stores, or a load and a store, in one
+C cycle, so if those are 64-bit operations then 0.5 c/l should be possible,
+C however nothing under 0.7 c/l is known.
+C
+C If both source and destination are unaligned then one limb is processed at
+C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
+C used unaligned it would be 1.5 c/l.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+
+dnl minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+C The unrolled loop copies 4 limbs per iteration with two movq pairs and
+C a positive (upwards) counter; emms is executed before returning since
+C MMX registers are used.
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ jae L(unroll)
+
+ orl %ecx, %ecx
+ jz L(simple_done)
+
+L(simple):
+ C eax src, incrementing
+ C ebx scratch
+ C ecx counter
+ C edx dst, incrementing
+ C
+ C this loop is 2 cycles/limb
+
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+ decl %ecx
+ leal 4(%eax), %eax
+ leal 4(%edx), %edx
+ jnz L(simple)
+
+L(simple_done):
+ movl SAVE_EBX, %ebx
+ ret
+
+
+L(unroll):
+ movl %eax, %ebx
+ leal -12(%eax,%ecx,4), %eax C src end - 12
+ subl $3, %ecx C size-3
+
+ andl %edx, %ebx C common alignment bits of src and dst
+ leal (%edx,%ecx,4), %edx C dst end - 12
+ negl %ecx
+
+ testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
+ jz L(aligned)
+
+ C both src and dst unaligned, process one limb to align them
+ movl (%eax,%ecx,4), %ebx
+ movl %ebx, (%edx,%ecx,4)
+ incl %ecx
+L(aligned):
+
+
+ ALIGN(16)
+L(top):
+ C eax src end - 12
+ C ebx
+ C ecx counter, negative, limbs
+ C edx dst end - 12
+
+ movq (%eax,%ecx,4), %mm0
+ movq 8(%eax,%ecx,4), %mm1
+ addl $4, %ecx
+ movq %mm0, -16(%edx,%ecx,4)
+ movq %mm1, -16+8(%edx,%ecx,4)
+ ja L(top) C jump no carry and not zero
+
+
+ C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
+
+ testb $2, %cl
+ jnz L(finish_not_two)
+
+ movq (%eax,%ecx,4), %mm0
+ movq %mm0, (%edx,%ecx,4)
+L(finish_not_two):
+
+ testb $1, %cl
+ jnz L(done)
+
+ movl 8(%eax), %ebx
+ movl %ebx, 8(%edx)
+
+L(done):
+ movl SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm b/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm
new file mode 100644
index 0000000000..483ad6a9a1
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm
@@ -0,0 +1,718 @@
+dnl AMD K7 mpn_divrem_1 -- mpn by limb division.
+dnl
+dnl K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C The "and"s shown in the paper are done here with "cmov"s. "m" is written
+C for m', and "d" for d_norm, which won't cause any confusion since it's
+C only the normalized divisor that's of any use in the code. "b" is written
+C for 2^N, the size of a limb, N being 32 here.
+C
+C mpn_divrem_1 avoids one division if the src high limb is less than the
+C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal
+C circumstances that will be a very rare event.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+
+
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The inverse takes about 50 cycles to calculate, but after that the
+dnl multiply is 17 c/l versus division at 42 c/l.
+dnl
+dnl At 3 limbs the mul is a touch faster than div on the integer part, and
+dnl even more so on the fractional part.
+
+deflit(MUL_THRESHOLD, 3)
+
+
+defframe(PARAM_CARRY, 24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC, -28)
+defframe(VAR_DST, -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+C Same as mpn_divrem_1 below, but with an explicit initial carry: load
+C the carry and parameters, then enter mpn_divrem_1 at its start_1c
+C label (no zero-carry or high<divisor shortcut is attempted here).
+
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+ jmp LF(mpn_divrem_1,start_1c)
+
+EPILOGUE()
+
+
+ C offset 0x31, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+C Store quotient limbs for the size src limbs at dst[xsize..xsize+size-1],
+C then xsize "fraction" limbs at dst[0..xsize-1], the final remainder
+C being unnormalized by VAR_NORM at fraction_done and returned in eax.
+C Small size+xsize uses plain divl loops, larger uses the
+C multiply-by-inverse method (see MUL_THRESHOLD comments above).
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ orl %ecx, %ecx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+
+ jz L(no_skip_div)
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C one less div if high<divisor
+ jnb L(no_skip_div)
+
+ movl $0, (%edi,%ecx,4) C dst high limb
+ decl %ecx C size-1
+ movl %eax, %edx C src high limb as initial carry
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+
+C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
+C It'd be possible to write them out without the looping, but no speedup
+C would be expected.
+C
+C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
+C integer part, but curiously not on the fractional part, where %ebp is a
+C (fixed) couple of cycles faster.
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer)
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx scratch (remainder)
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl PARAM_DIVISOR
+
+ movl %eax, (%edi,%ecx,4)
+ decl %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ movl PARAM_DST, %edi
+ orl %ebx, %ebx
+ jnz L(divide_fraction)
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+ movl %edx, %eax
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx scratch (remainder)
+ C esi
+ C edi dst
+ C ebp divisor
+
+ movl $0, %eax
+
+ divl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ decl %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ leal 12(%edi), %ebx
+ leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
+
+ movl %edi, VAR_DST
+ movl %ebx, VAR_DST_STOP
+
+ movl %ecx, %ebx C size
+ movl $31, %ecx
+
+ movl %edx, %edi C carry
+ movl $-1, %edx
+
+ C
+
+ xorl %eax, %ecx C l
+ incl %eax C 32-l
+
+ shll %cl, %ebp C d normalized
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ orl %ebx, %ebx C size
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ jz L(start_zero)
+ movl %eax, VAR_SRC
+ cmpl $1, %ebx
+
+ movl 8(%eax), %esi C src high limb
+ jz L(start_one)
+
+L(start_two_or_more):
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ cmpl $2, %ebx
+ je L(integer_two_left)
+ jmp L(integer_top)
+
+
+L(start_one):
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shll %cl, %esi C n10 = high << l
+ movl %eax, VAR_SRC
+ jmp L(integer_one_left)
+
+
+L(start_zero):
+ shll %cl, %edi C n2 = carry << l
+ movl $0, %esi C n10 = 0
+
+ C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
+ C must have xsize!=0
+ jmp L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
+C execution. The instruction scheduling is important, with various
+C apparently equivalent forms running 1 to 5 cycles slower.
+C
+C A lower bound for the time would seem to be 16 cycles, based on the
+C following successive dependencies.
+C
+C cycles
+C n2+n1 1
+C mul 6
+C q1+1 1
+C mul 6
+C sub 1
+C addback 1
+C ---
+C 16
+C
+C This chain is what the loop has already, but 16 cycles isn't achieved.
+C K7 has enough decode, and probably enough execute (depending maybe on what
+C a mul actually consumes), but nothing running under 17 has been found.
+C
+C In theory n2+n1 could be done in the sub and addback stages (by
+C calculating both n2 and n2+n1 there), but lack of registers makes this an
+C unlikely proposition.
+C
+C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow
+C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
+C chain, and nothing better than 18 cycles has been found when using it.
+C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
+C be an extremely rare event.
+C
+C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
+C if some special data is coming out with this always, the q1_ff special
+C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
+C induce the q1_ff case, for speed measurements or testing. Note that
+C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
+C
+C The instruction groupings and empty comments show the cycles for a naive
+C in-order view of the code (conveniently ignoring the load latency on
+C VAR_INVERSE). This shows some of where the time is going, but is nonsense
+C to the extent that out-of-order execution rearranges it. In this case
+C there's 19 cycles shown, but it executes at 17.
+
+ ALIGN(16)
+L(integer_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl VAR_SRC, %ecx
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next limb and the one below it
+ subl $4, %ecx
+
+ movl %ecx, VAR_SRC
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+ movl VAR_DST, %ecx
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+
+ leal -4(%ecx), %ecx
+
+ C
+
+ subl %eax, %esi
+ movl VAR_DST_STOP, %eax
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+ cmpl %eax, %ecx
+
+ movl %ebx, (%ecx)
+ movl %ecx, VAR_DST
+ jne L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case. This makes the code a bit smaller and simpler, and
+C costs only 1 cycle (each).
+
+L(integer_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl PARAM_SRC, %ecx
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd (%ecx), %mm0 C src low limb
+
+ movl VAR_DST_STOP, %ecx
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+
+ movl %ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ movl VAR_DST_STOP, %ecx
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx C q1 if q1+1 overflowed
+
+ mull %ebx
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+
+ movl %ebx, -8(%ecx)
+ subl $8, %ecx
+
+
+
+L(integer_none):
+ cmpl $0, PARAM_XSIZE
+ jne L(fraction_some)
+
+ movl %edi, %eax
+L(fraction_done):
+ movl VAR_NORM, %ecx
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ shrl %cl, %eax
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx
+ C edx
+ C esi n10
+ C edi n2
+ C ebp divisor
+
+ movl VAR_DST, %ecx
+ movl VAR_DST_STOP, %edx
+ subl $4, %ecx
+
+ psrlq %mm7, %mm0
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ movl %ecx, VAR_DST
+
+ movd %mm0, %esi C next n10
+
+ movl $-1, (%ecx)
+ cmpl %ecx, %edx
+ jne L(integer_top)
+
+ jmp L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C Being the fractional part, the "source" limbs are all zero, meaning
+C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
+C
+C The loop runs at 15 cycles. The dependent chain is the same as the
+C general case above, but without the n2+n1 stage (due to n1==0), so 15
+C would seem to be the lower bound.
+C
+C A not entirely obvious simplification is that q1+1 never overflows a limb,
+C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
+C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
+C rnd() means rounding down to a multiple of d.
+C
+C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
+C = m*d + b*d - m - b
+C = floor((b(b-d)-1)/d)*d + b*d - m - b
+C = rnd(b(b-d)-1) + b*d - m - b
+C = rnd(b(b-d)-1 + b*d) - m - b
+C = rnd(b*b-1) - m - b
+C <= (b-2)*b
+C
+C Unchanged from the general case is that the final quotient limb q can be
+C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
+C equation 8.4 of the paper which simplifies as follows when n1==0 and
+C n0==0.
+C
+C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
+C
+C As before, the instruction groupings and empty comments show a naive
+C in-order view of the code, which is made a nonsense by out of order
+C execution. There's 17 cycles shown, but it executes at 15.
+C
+C Rotating the store q and remainder->n2 instructions up to the top of the
+C loop gets the run time down from 16 to 15.
+
+ ALIGN(16)
+L(fraction_some):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi carry
+ C ebp divisor
+
+ movl PARAM_DST, %esi
+ movl VAR_DST_STOP, %ecx
+ movl %edi, %eax
+
+ subl $8, %ecx
+
+ jmp L(fraction_entry)
+
+
+ ALIGN(16)
+L(fraction_top):
+ C eax n2 carry, then scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst, decrementing
+ C edx scratch
+ C esi dst stop point
+ C edi (will be n2)
+ C ebp divisor
+
+ movl %ebx, (%ecx) C previous q
+ movl %eax, %edi C remainder->n2
+
+L(fraction_entry):
+ mull VAR_INVERSE C m*n2
+
+ movl %ebp, %eax C d
+ subl $4, %ecx C dst
+ leal 1(%edi), %ebx
+
+ C
+
+ C
+
+ C
+
+ C
+
+ addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
+
+ mull %ebx C (q1+1)*d
+
+ C
+
+ C
+
+ C
+
+ negl %eax C low of n - (q1+1)*d
+
+ C
+
+ sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
+ leal (%ebp,%eax), %edx
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+ cmpl %esi, %ecx
+
+ jne L(fraction_top)
+
+
+ movl %ebx, (%ecx)
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/lshift.asm b/rts/gmp/mpn/x86/k7/mmx/lshift.asm
new file mode 100644
index 0000000000..4d17c881ec
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/lshift.asm
@@ -0,0 +1,472 @@
+dnl AMD K7 mpn_lshift -- mpn left shift.
+dnl
+dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 1.51
+dnl 8 1.26
+dnl 16 1.21
+dnl 32 1.2
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right. The bits shifted out at the left are
+C the return value.
+C
+C The comments in mpn_rshift apply here too.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_SRC, %edx
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+ movl PARAM_SHIFT, %ecx
+ movl %edi, SAVE_EDI
+
+ movl PARAM_DST, %edi
+ decl %eax
+ jnz L(more_than_one_limb)
+
+ movl (%edx), %edx
+
+ shldl( %cl, %edx, %eax) C eax was decremented to zero
+
+ shll %cl, %edx
+
+ movl %edx, (%edi)
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ movd PARAM_SHIFT, %mm6
+ movd (%edx,%eax,4), %mm5 C src high limb
+ cmp $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+ negl %ecx
+ movd (%edx), %mm4 C src low limb
+
+ addl $32, %ecx
+
+ movd %ecx, %mm7
+
+L(simple_top):
+ C eax loop counter, limbs
+ C ebx
+ C ecx
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm4 src low limb
+ C mm5 src high limb
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%edx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ movd %mm0, 4(%edi,%eax,4)
+ jnz L(simple_top)
+
+
+ psllq %mm6, %mm5
+ psllq %mm6, %mm4
+
+ psrlq $32, %mm5
+ movd %mm4, (%edi) C dst low limb
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx (saved)
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm5 src high limb, for return value
+ C mm6 lshift
+
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
+ leal -4(%edx,%eax,4), %edx C &src[size-2]
+
+ testb $4, %dl
+ movq (%edx), %mm1 C src high qword
+
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process high limb (marked xxx) separately to
+ C make it so
+ C
+ C source -4(edx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | xxx |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+ C
+ C dest -4(edi,%eax,4)
+ C |
+ C +-------+-------+--
+ C | xxx | |
+ C +-------+-------+--
+
+ psllq %mm6, %mm1
+ subl $4, %edx
+ movl %eax, PARAM_SIZE C size-1
+
+ psrlq $32, %mm1
+ decl %eax C size-2 is new size-1
+
+ movd %mm1, 4(%edi,%eax,4)
+ movq (%edx), %mm1 C new src high qword
+L(start_src_aligned):
+
+
+ leal -4(%edi,%eax,4), %edi C &dst[size-2]
+ psllq %mm6, %mm5
+
+ testl $4, %edi
+ psrlq $32, %mm5 C return value
+
+ jz L(start_dst_aligned)
+
+
+ C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
+ C shift is 32 bits extra. High limb of dst (marked xxx) handled
+ C here separately.
+ C
+ C source %edx
+ C +-------+-------+--
+ C | mm1 |
+ C +-------+-------+--
+ C 0mod8 4mod8
+ C
+ C dest %edi
+ C +-------+-------+-------+--
+ C | xxx |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+
+ movq %mm1, %mm0
+ psllq %mm6, %mm1
+ addl $32, %ecx C shift+32
+
+ psrlq $32, %mm1
+
+ movd %mm1, 4(%edi)
+ movq %mm0, %mm1
+ subl $4, %edi
+
+ movd %ecx, %mm6 C new lshift
+L(start_dst_aligned):
+
+ decl %eax C size-2, two last limbs handled at end
+ movq %mm1, %mm2 C copy of src high qword
+ negl %ecx
+
+ andl $-2, %eax C round size down to even
+ addl $64, %ecx
+
+ movl %eax, %ebx
+ negl %eax
+
+ andl $UNROLL_MASK, %eax
+ decl %ebx
+
+ shll %eax
+
+ movd %ecx, %mm7 C rshift = 64-lshift
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%eax,%eax,4), %esi
+')
+ shrl $UNROLL_LOG2, %ebx C loop counter
+
+ leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+ movl PARAM_SIZE, %eax C for use at end
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%eax,%eax,4), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax size (for use at end)
+ C ebx loop counter
+ C ecx rshift
+ C edx src
+ C esi computed jump
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm1 \ carry (alternating, mm2 first)
+ C mm2 /
+ C mm6 lshift
+ C mm7 rshift
+ C
+ C 10 code bytes/limb
+ C
+ C The two chunks differ in whether mm1 or mm2 hold the carry.
+ C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 - 8))
+
+ movq disp0(%edx), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm2, %mm0
+ movq %mm0, disp0(%edi)
+
+
+ movq disp1(%edx), %mm0
+ psllq %mm6, %mm1
+
+ movq %mm0, %mm2
+ psrlq %mm7, %mm0
+
+ por %mm1, %mm0
+ movq %mm0, disp1(%edi)
+')
+
+ subl $UNROLL_BYTES, %edx
+ subl $UNROLL_BYTES, %edi
+ decl %ebx
+
+ jns L(top)
+
+
+
+define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
+
+L(end):
+ testb $1, %al
+ movl SAVE_EBX, %ebx
+ psllq %mm6, %mm2 C wanted left shifted in all cases below
+
+ movd %mm5, %eax
+
+ movl SAVE_ESI, %esi
+ jz L(end_even)
+
+
+L(end_odd):
+
+ C Size odd, destination was aligned.
+ C
+ C source edx+8 edx+4
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edi
+ C --+---------------+---------------+-------+
+ C | written | | |
+ C --+---------------+---------------+-------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size odd, destination was unaligned.
+ C
+ C source edx+8 edx+4
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edi
+ C --+---------------+---------------+
+ C | written | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at (%edi), and in the aligned case
+ C there's an extra limb of dst to be formed from that extra src limb
+ C left shifted.
+
+ movd disp(4) (%edx), %mm0
+ testb $32, %cl
+
+ movq %mm0, %mm1
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+ psllq %mm6, %mm1
+
+ por %mm2, %mm0
+
+ movq %mm0, disp(0) (%edi)
+ jz L(end_odd_unaligned)
+ movd %mm1, disp(-4) (%edi)
+L(end_odd_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+L(end_even):
+
+ C Size even, destination was aligned.
+ C
+ C source edx+8
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edi
+ C --+---------------+---------------+
+ C | written | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size even, destination was unaligned.
+ C
+ C source edx+8
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edi+4
+ C --+---------------+-------+
+ C | written | |
+ C --+---------------+-------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C The movq for the aligned case overwrites the movd for the
+ C unaligned case.
+
+ movq %mm2, %mm0
+ psrlq $32, %mm2
+
+ testb $32, %cl
+ movd %mm2, disp(4) (%edi)
+
+ jz L(end_even_unaligned)
+ movq %mm0, disp(0) (%edi)
+L(end_even_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/mod_1.asm b/rts/gmp/mpn/x86/k7/mmx/mod_1.asm
new file mode 100644
index 0000000000..545ca56ddf
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/mod_1.asm
@@ -0,0 +1,457 @@
+dnl AMD K7 mpn_mod_1 -- mpn by limb remainder.
+dnl
+dnl K7: 17.0 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C The code here is the same as mpn_divrem_1, but with the quotient
+C discarded.  See mpn/x86/k7/mmx/divrem_1.asm for some comments.
+
+
+dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+dnl
+dnl The inverse takes about 50 cycles to calculate, but after that the
+dnl multiply is 17 c/l versus division at 41 c/l.
+dnl
+dnl Using mul or div is about the same speed at 3 limbs, so the threshold
+dnl is set to 4 to get the smaller div code used at 3.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC_STOP,-28)
+
+deflit(STACK_SPACE, 28)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ jmp LF(mpn_mod_1,start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(32)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C carry flag if high<divisor
+
+ cmovc( %eax, %edx) C src high limb as initial carry
+ sbbl $0, %ecx C size-1 to skip one div
+ jz L(divide_done)
+
+
+ ALIGN(16)
+L(start_1c):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ cmpl $MUL_THRESHOLD, %ecx
+ jae L(mul_by_inverse)
+
+
+
+C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
+C but it's already fast and compact, and there's nothing to gain by
+C expanding it out.
+C
+C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx scratch (remainder)
+ C esi src
+ C edi
+ C ebp
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl PARAM_DIVISOR
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ movl %edx, %eax
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ movl %ebx, SAVE_EBX
+ leal -4(%esi), %ebx
+
+ movl %ebx, VAR_SRC_STOP
+ movl %edi, SAVE_EDI
+
+ movl %ecx, %ebx C size
+ movl $31, %ecx
+
+ movl %edx, %edi C carry
+ movl $-1, %edx
+
+ C
+
+ xorl %eax, %ecx C l
+ incl %eax C 32-l
+
+ shll %cl, %ebp C d normalized
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ C
+
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl 8(%eax), %esi C src high limb
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ movl %eax, %ecx C &src[size-3]
+
+
+ifelse(MUL_THRESHOLD,2,`
+ cmpl $2, %ebx
+ je L(inverse_two_left)
+')
+
+
+C The dependent chain here is the same as in mpn_divrem_1, but a few
+C instructions are saved by not needing to store the quotient limbs.
+C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
+C
+C There's four dummy instructions in the loop, all of which are necessary
+C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed,
+C or changed from load to store or vice versa. They're not completely
+C random, since they correspond to what mpn_divrem_1 has, but there's no
+C obvious reason why they're necessary. Presumably they induce something
+C good in the out of order execution, perhaps through some load/store
+C ordering and/or decoding effects.
+C
+C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On
+C special data that comes out as q1==0xFFFFFFFF always, the loop runs at
+C about 13.5 c/l.
+
+ ALIGN(32)
+L(inverse_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx src pointer, decrementing
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl PARAM_SIZE, %ebx C dummy
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next src limb and the one below it
+ subl $4, %ecx
+
+ movl %ecx, PARAM_SIZE C dummy
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+ nop C dummy
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+ leal 0(%ecx), %ecx C dummy
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl VAR_SRC_STOP, %eax
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ cmpl %eax, %ecx
+ jne L(inverse_top)
+
+
+L(inverse_loop_done):
+
+
+C -----------------------------------------------------------------------------
+
+L(inverse_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx &src[-1]
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src dword)
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd 4(%ecx), %mm0 C src low limb
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+
+
+C One limb left
+
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movl VAR_NORM, %ecx C for final denorm
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ movl SAVE_EBX, %ebx
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ movl %esi, %eax C remainder
+ movl SAVE_ESI, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ leal (%ebp,%eax), %edx
+ movl SAVE_EBP, %ebp
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ movl SAVE_EDI, %edi
+
+ shrl %cl, %eax C denorm remainder
+ addl $STACK_SPACE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx src pointer
+ C edx
+ C esi n10
+ C edi (n2)
+ C ebp divisor
+
+ movl VAR_SRC_STOP, %edx
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi C next n10
+
+ cmpl %ecx, %edx
+ jne L(inverse_top)
+ jmp L(inverse_loop_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/popham.asm b/rts/gmp/mpn/x86/k7/mmx/popham.asm
new file mode 100644
index 0000000000..fa7c8c04a5
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/popham.asm
@@ -0,0 +1,239 @@
+dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
+dnl distance.
+dnl
+dnl K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl Only recent versions of gas know psadbw, in particular gas 2.9.1 on
+dnl FreeBSD 3.3 and 3.4 doesn't recognise it.
+
+define(psadbw_mm4_mm0,
+`ifelse(m4_ifdef_anyof_p(`HAVE_TARGET_CPU_athlon',
+ `HAVE_TARGET_CPU_pentium3'),1,
+ `.byte 0x0f,0xf6,0xc4 C psadbw %mm4, %mm0',
+
+`m4_warning(`warning, using simulated and only partly functional psadbw, use for testing only
+') C this works enough for the sum of bytes done below, making it
+ C possible to test on an older cpu
+ leal -8(%esp), %esp
+ movq %mm4, (%esp)
+ movq %mm0, %mm4
+forloop(i,1,7,
+` psrlq $ 8, %mm4
+ paddb %mm4, %mm0
+')
+ pushl $ 0
+ pushl $ 0xFF
+ pand (%esp), %mm0
+ movq 8(%esp), %mm4
+ leal 16(%esp), %esp
+')')
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here is almost certainly not optimal, but is already a 3x speedup
+C over the generic C code. The main improvement would be to interleave
+C processing of two qwords in the loop so as to fully exploit the available
+C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
+C
+C The loop is based on the example "Efficient 64-bit population count using
+C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
+C page 158 of rev E (reference in mpn/x86/k7/README).
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
+')')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+
+ DATA
+ ALIGN(8)
+
+define(LS,
+m4_assert_numargs(1)
+`LF(M4_function,`$1')')
+
+LS(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+
+LS(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+
+LS(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+')
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ orl %ecx, %ecx
+ jz L(zero)
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %eax
+ movl $0x33333333, %edx
+
+ movd %eax, %mm7
+ movd %edx, %mm6
+
+ movl $0x0F0F0F0F, %eax
+
+ punpckldq %mm7, %mm7
+ punpckldq %mm6, %mm6
+
+ movd %eax, %mm5
+ movd %edx, %mm4
+
+ punpckldq %mm5, %mm5
+
+',`
+ movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq LS(rodata_3333333333333333), %mm6
+ movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
+')
+ pxor %mm4, %mm4
+
+define(REG_AAAAAAAAAAAAAAAA,%mm7)
+define(REG_3333333333333333,%mm6)
+define(REG_0F0F0F0F0F0F0F0F,%mm5)
+define(REG_0000000000000000,%mm4)
+
+
+ movl PARAM_SRC, %eax
+HAM(` movl PARAM_SRC2, %edx')
+
+ pxor %mm2, %mm2 C total
+
+ shrl %ecx
+ jnc L(top)
+
+ movd (%eax,%ecx,8), %mm1
+
+HAM(` movd 0(%edx,%ecx,8), %mm0
+ pxor %mm0, %mm1
+')
+ orl %ecx, %ecx
+ jmp L(loaded)
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, qwords, decrementing
+ C edx [hamdist] src2
+ C
+ C mm0 (scratch)
+ C mm1 (scratch)
+ C mm2 total (low dword)
+ C mm3
+ C mm4 \
+ C mm5 | special constants
+ C mm6 |
+ C mm7 /
+
+ movq -8(%eax,%ecx,8), %mm1
+
+HAM(` pxor -8(%edx,%ecx,8), %mm1')
+ decl %ecx
+
+L(loaded):
+ movq %mm1, %mm0
+ pand REG_AAAAAAAAAAAAAAAA, %mm1
+
+ psrlq $1, %mm1
+
+ psubd %mm1, %mm0 C bit pairs
+
+
+ movq %mm0, %mm1
+ psrlq $2, %mm0
+
+ pand REG_3333333333333333, %mm0
+ pand REG_3333333333333333, %mm1
+
+ paddd %mm1, %mm0 C nibbles
+
+
+ movq %mm0, %mm1
+ psrlq $4, %mm0
+
+ pand REG_0F0F0F0F0F0F0F0F, %mm0
+ pand REG_0F0F0F0F0F0F0F0F, %mm1
+
+ paddd %mm1, %mm0 C bytes
+
+
+ psadbw_mm4_mm0
+
+ paddd %mm0, %mm2 C add to total
+ jnz L(top)
+
+
+ movd %mm2, %eax
+ emms
+ ret
+
+
+L(zero):
+ movl $0, %eax
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/rshift.asm b/rts/gmp/mpn/x86/k7/mmx/rshift.asm
new file mode 100644
index 0000000000..abb546cd5b
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/rshift.asm
@@ -0,0 +1,471 @@
+dnl AMD K7 mpn_rshift -- mpn right shift.
+dnl
+dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 1.51
+dnl 8 1.26
+dnl 16 1.21
+dnl 32 1.2
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left. The bits shifted out at the right are
+C the return value.
+C
+C This code uses 64-bit MMX operations, which makes it possible to handle
+C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer
+C code, on the other hand, suffers from shrd being a vector path decode and
+C running at 3 cycles back-to-back.
+C
+C Full speed depends on source and destination being aligned, and some hairy
+C setups and finish-ups are done to arrange this for the loop.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_SRC, %edx
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+ movl PARAM_SHIFT, %ecx
+ movl %edi, SAVE_EDI
+
+ movl PARAM_DST, %edi
+ decl %eax
+ jnz L(more_than_one_limb)
+
+ movl (%edx), %edx C src limb
+
+ shrdl( %cl, %edx, %eax) C eax was decremented to zero
+
+ shrl %cl, %edx
+
+ movl %edx, (%edi) C dst limb
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ movd PARAM_SHIFT, %mm6 C rshift
+ movd (%edx), %mm5 C src low limb
+ cmp $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+ leal (%edx,%eax,4), %edx C &src[size-1]
+ leal -4(%edi,%eax,4), %edi C &dst[size-2]
+
+ movd (%edx), %mm4 C src high limb
+ negl %eax
+
+
+L(simple_top):
+ C eax loop counter, limbs, negative
+ C ebx
+ C ecx shift
+ C edx carry
+ C edx &src[size-1]
+ C edi &dst[size-2]
+ C ebp
+ C
+ C mm0 scratch
+ C mm4 src high limb
+ C mm5 src low limb
+ C mm6 shift
+
+ movq (%edx,%eax,4), %mm0
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+ movd %mm0, (%edi,%eax,4)
+ jnz L(simple_top)
+
+
+ psllq $32, %mm5
+ psrlq %mm6, %mm4
+
+ psrlq %mm6, %mm5
+ movd %mm4, 4(%edi) C dst high limb
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm5 src low limb
+ C mm6 rshift
+
+ testb $4, %dl
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
+
+ psllq $32, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process low limb separately (marked xxx) and
+ C step src and dst by one limb, making src aligned.
+ C
+ C source edx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+ C
+ C dest edi
+ C --+-------+-------+
+ C | | xxx |
+ C --+-------+-------+
+
+ movq (%edx), %mm0 C src low two limbs
+ addl $4, %edx
+ movl %eax, PARAM_SIZE C size-1
+
+ addl $4, %edi
+ decl %eax C size-2 is new size-1
+
+ psrlq %mm6, %mm0
+ movl %edi, PARAM_DST C new dst
+
+ movd %mm0, -4(%edi)
+L(start_src_aligned):
+
+
+ movq (%edx), %mm1 C src low two limbs
+ decl %eax C size-2, two last limbs handled at end
+ testl $4, %edi
+
+ psrlq %mm6, %mm5
+ jz L(start_dst_aligned)
+
+
+ C dst isn't aligned, add 4 to make it so, and pretend the shift is
+ C 32 bits extra. Low limb of dst (marked xxx) handled here separately.
+ C
+ C source edx
+ C --+-------+-------+
+ C | mm1 |
+ C --+-------+-------+
+ C 4mod8 0mod8
+ C
+ C dest edi
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+
+ movq %mm1, %mm0
+ psrlq %mm6, %mm1
+ addl $32, %ecx C shift+32
+
+ movd %mm1, (%edi)
+ movq %mm0, %mm1
+ addl $4, %edi C new dst
+
+ movd %ecx, %mm6
+L(start_dst_aligned):
+
+
+ movq %mm1, %mm2 C copy of src low two limbs
+ negl %ecx
+ andl $-2, %eax C round size down to even
+
+ movl %eax, %ebx
+ negl %eax
+ addl $64, %ecx
+
+ andl $UNROLL_MASK, %eax
+ decl %ebx
+
+ shll %eax
+
+ movd %ecx, %mm7 C lshift = 64-rshift
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%eax,%eax,4), %esi
+ negl %eax
+')
+ shrl $UNROLL_LOG2, %ebx C loop counter
+
+ leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+ movl PARAM_SIZE, %eax C for use at end
+
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%eax,%eax,4), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+ negl %eax
+
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(64)
+L(top):
+ C eax size, for use at end
+ C ebx loop counter
+ C ecx lshift
+ C edx src
+ C esi was computed jump
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm1 \ carry (alternating)
+ C mm2 /
+ C mm6 rshift
+ C mm7 lshift
+ C
+ C 10 code bytes/limb
+ C
+ C The two chunks differ in whether mm1 or mm2 hold the carry.
+ C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 8))
+
+ movq disp0(%edx), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm2, %mm0
+ movq %mm0, disp0(%edi)
+
+
+ movq disp1(%edx), %mm0
+ psrlq %mm6, %mm1
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm1, %mm0
+ movq %mm0, disp1(%edi)
+')
+
+ addl $UNROLL_BYTES, %edx
+ addl $UNROLL_BYTES, %edi
+ decl %ebx
+
+ jns L(top)
+
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 8))
+
+ testb $1, %al
+ psrlq %mm6, %mm2 C wanted rshifted in all cases below
+ movl SAVE_ESI, %esi
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EBX, %ebx
+ jz L(end_even)
+
+
+ C Size odd, destination was aligned.
+ C
+ C source
+ C edx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edi
+ C +-------+---------------+---------------+--
+ C | | | written |
+ C +-------+---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size odd, destination was unaligned.
+ C
+ C source
+ C edx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edi
+ C +---------------+---------------+--
+ C | | written |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword to store, and in the aligned case there's
+ C a further extra limb of dst to be formed.
+
+
+ movd disp0(%edx), %mm0
+ movq %mm0, %mm1
+
+ psllq %mm7, %mm0
+ testb $32, %cl
+
+ por %mm2, %mm0
+ psrlq %mm6, %mm1
+
+ movq %mm0, disp0(%edi)
+ jz L(finish_odd_unaligned)
+
+ movd %mm1, disp1(%edi)
+L(finish_odd_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+L(end_even):
+
+ C Size even, destination was aligned.
+ C
+ C source
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edi
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size even, destination was unaligned.
+ C
+ C source
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edi
+ C +-------+---------------+--
+ C | | mm3 |
+ C +-------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = 64-(shift+32)
+
+
+ C The movd for the unaligned case is the same data as the movq for
+ C the aligned case, it's just a choice between whether one or two
+ C limbs should be written.
+
+
+ testb $32, %cl
+ movd %mm2, disp0(%edi)
+
+ jz L(end_even_unaligned)
+
+ movq %mm2, disp0(%edi)
+L(end_even_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mul_1.asm b/rts/gmp/mpn/x86/k7/mul_1.asm
new file mode 100644
index 0000000000..07f7085b10
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mul_1.asm
@@ -0,0 +1,265 @@
+dnl AMD K7 mpn_mul_1 -- mpn by limb multiply.
+dnl
+dnl K7: 3.4 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 8 3.9
+dnl 16 3.4
+dnl 32 3.4
+dnl 64 3.35
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier, mp_limb_t carry);
+C
+C Multiply src,size by mult and store the result in dst,size.
+C Return the carry limb from the top of the result.
+C
+C mpn_mul_1c() accepts an initial carry for the calculation, it's added into
+C the low limb of the destination.
+C
+C Variations on the unrolled loop have been tried, with the current
+C registers or with the counter on the stack to free up ecx. The current
+C code is the fastest found.
+C
+C An interesting effect is that removing the stores "movl %ebx, disp0(%edi)"
+C from the unrolled loop actually slows it down to 5.0 cycles/limb. Code
+C with this change can be tested on sizes of the form UNROLL_COUNT*n+1
+C without having to change the computed jump. There's obviously something
+C fishy going on, perhaps with what execution units the mul needs.
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBP, -4)
+defframe(SAVE_EDI, -8)
+defframe(SAVE_ESI, -12)
+defframe(SAVE_EBX, -16)
+deflit(STACK_SPACE, 16)
+
+dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 7)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_mul_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ jmp LF(mpn_mul_1,start_nc)
+EPILOGUE()
+
+
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+ xorl %edx, %edx C initial carry
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME', STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ movl %ebx, SAVE_EBX
+ movl %edx, %ebx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_DST, %edi
+ movl %ebp, SAVE_EBP
+ jae L(unroll)
+
+ leal (%esi,%ecx,4), %esi
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+
+ movl PARAM_MULTIPLIER, %ebp
+
+L(simple):
+ C eax scratch
+ C ebx carry
+ C ecx counter (negative)
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(simple)
+
+ movl %ebx, %eax
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C The mov to load the next source limb is done well ahead of the mul, this
+C is necessary for full speed. It leads to one limb handled separately
+C after the loop.
+C
+C When unrolling to 32 or more, an offset of +4 is used on the src pointer,
+C to avoid having an 0x80 displacement in the code for the last limb in the
+C unrolled loop. This is for a fair comparison between 16 and 32 unrolling.
+
+ifelse(eval(UNROLL_COUNT >= 32),1,`
+deflit(SRC_OFFSET,4)
+',`
+deflit(SRC_OFFSET,)
+')
+
+ C this is offset 0x62, so close enough to aligned
+L(unroll):
+ C eax
+ C ebx initial carry
+ C ecx size
+ C edx
+ C esi src
+ C edi dst
+ C ebp
+deflit(`FRAME', STACK_SPACE)
+
+ leal -1(%ecx), %edx C one limb handled at end
+ leal -2(%ecx), %ecx C and ecx is one less than edx
+ movl %ebp, SAVE_EBP
+
+ negl %edx
+ shrl $UNROLL_LOG2, %ecx C unrolled loop counter
+ movl (%esi), %eax C src low limb
+
+ andl $UNROLL_MASK, %edx
+ movl PARAM_DST, %edi
+
+ movl %edx, %ebp
+ shll $4, %edx
+
+ C 17 code bytes per limb
+ifdef(`PIC',`
+ call L(add_eip_to_edx)
+L(here):
+',`
+ leal L(entry) (%edx,%ebp), %edx
+')
+ negl %ebp
+
+ leal ifelse(UNROLL_BYTES,256,128+) SRC_OFFSET(%esi,%ebp,4), %esi
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebp,4), %edi
+ movl PARAM_MULTIPLIER, %ebp
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(add_eip_to_edx):
+ C See README.family about old gas bugs
+ leal (%edx,%ebp), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret
+')
+
+
+C ----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax next src limb
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi src+4
+ C edi dst
+ C ebp multiplier
+ C
+ C 17 code bytes per limb processed
+
+L(entry):
+forloop(i, 0, UNROLL_COUNT-1, `
+ deflit(`disp_dst', eval(i*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp_src', eval(disp_dst + 4-(SRC_OFFSET-0)))
+
+ mull %ebp
+
+ addl %eax, %ebx
+Zdisp( movl, disp_src,(%esi), %eax)
+Zdisp( movl, %ebx, disp_dst,(%edi))
+
+ movl $0, %ebx
+ adcl %edx, %ebx
+')
+
+ decl %ecx
+
+ leal UNROLL_BYTES(%esi), %esi
+ leal UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+
+ mull %ebp
+
+ addl %eax, %ebx
+ movl $0, %eax
+ movl SAVE_ESI, %esi
+
+ movl %ebx, disp0(%edi)
+ movl SAVE_EBX, %ebx
+ movl SAVE_EDI, %edi
+
+ adcl %edx, %eax
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mul_basecase.asm b/rts/gmp/mpn/x86/k7/mul_basecase.asm
new file mode 100644
index 0000000000..c4be62e633
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mul_basecase.asm
@@ -0,0 +1,593 @@
+dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
+dnl
+dnl K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
+dnl limbs/loop unrolling).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
+dnl 8 4.67
+dnl 16 4.59
+dnl 32 4.42
+dnl Maximum possible with the current code is 32.
+dnl
+dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get
+dnl done with a straight run through a block of code, no inner loop. Using
+dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once. The saving is 15-25% on typical sizes coming from
+C the Karatsuba multiply code.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+ movl (%eax), %eax C yp low limb
+
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ mull (%edx)
+
+ movl PARAM_WP, %ecx
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+ decl PARAM_YSIZE
+ pushl %ebx defframe_pushl(`SAVE_EBX')
+ movl %eax, %ecx C yp low limb
+
+ movl PARAM_WP, %ebx
+ pushl %esi defframe_pushl(`SAVE_ESI')
+ movl %edx, %esi C xp
+
+ movl (%edx), %eax C xp low limb
+ jnz L(two_by_two)
+
+
+ C two limbs by one limb
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+
+ movl %esi, 4(%ebx)
+ movl SAVE_ESI, %esi
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C Could load yp earlier into another register.
+
+ ALIGN(16)
+L(two_by_two):
+ C eax xp low limb
+ C ebx wp
+ C ecx yp low limb
+ C edx
+ C esi xp
+ C edi
+ C ebp
+
+dnl FRAME carries on from previous
+
+ mull %ecx C xp[0] * yp[0]
+
+ push %edi defframe_pushl(`SAVE_EDI')
+ movl %edx, %edi C carry, for wp[1]
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+ movl 4(%ecx), %ecx C yp[1]
+ movl %edi, 4(%ebx)
+
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+
+ adcl $0, %edx
+ movl (%esi), %eax C xp[0]
+
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ adcl %edx, %edi
+ movl %edi, 8(%ebx)
+
+ adcl $0, %esi
+ movl SAVE_EDI, %edi
+ movl %esi, 12(%ebx)
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline. Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 many times). A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 13-26
+C limb operations the Karatsuba code calls here with.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+
+dnl FRAME doesn't carry on from previous, no pushes yet here
+defframe(`SAVE_EBX',-4)
+defframe(`SAVE_ESI',-8)
+defframe(`SAVE_EDI',-12)
+defframe(`SAVE_EBP',-16)
+deflit(`FRAME',0)
+
+ subl $16, %esp
+deflit(`FRAME',16)
+
+ movl %edi, SAVE_EDI
+ movl PARAM_WP, %edi
+
+ movl %ebx, SAVE_EBX
+ movl %ebp, SAVE_EBP
+ movl %eax, %ebp
+
+ movl %esi, SAVE_ESI
+ xorl %ebx, %ebx
+ leal (%edx,%ecx,4), %esi C xp end
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi xp end
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+ movl PARAM_XSIZE, %ecx
+
+ movl %ebx, (%edi) C final carry
+ decl %edx
+
+ jnz L(ysize_more_than_one)
+
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_EBP, %ebp
+ movl SAVE_ESI, %esi
+ addl $FRAME, %esp
+
+ ret
+
+
+L(ysize_more_than_one):
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+ C simple addmul looping
+ C
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ leal 4(%eax,%edx,4), %ebp C yp end
+ negl %ecx
+ negl %edx
+
+ movl (%esi,%ecx,4), %eax C xp low limb
+ movl %edx, PARAM_YSIZE C -(ysize-1)
+ incl %ecx
+
+ xorl %ebx, %ebx C initial carry
+ movl %ecx, PARAM_XSIZE C -(xsize-1)
+ movl %ebp, PARAM_YP
+
+ movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
+ jmp L(simple_outer_entry)
+
+
+ C this is offset 0x121 so close enough to aligned
+L(simple_outer_top):
+ C ebp ysize counter, negative
+
+ movl PARAM_YP, %edx
+ movl PARAM_XSIZE, %ecx C -(xsize-1)
+ xorl %ebx, %ebx C carry
+
+ movl %ebp, PARAM_YSIZE
+ addl $4, %edi C next position in wp
+
+ movl (%edx,%ebp,4), %ebp C yp limb - multiplier
+ movl -4(%esi,%ecx,4), %eax C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner):
+ C eax xp limb
+ C ebx carry limb
+ C ecx loop counter (negative)
+ C edx scratch
+ C esi xp end
+ C edi wp end
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ebx
+ adcl $0, %edx
+
+ addl %ebx, (%edi,%ecx,4)
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %edx
+
+ incl %ecx
+ movl %edx, %ebx
+ jnz L(simple_inner)
+
+
+ mull %ebp
+
+ movl PARAM_YSIZE, %ebp
+ addl %eax, %ebx
+
+ adcl $0, %edx
+ addl %ebx, (%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, 4(%edi)
+ jnz L(simple_outer_top)
+
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp. This is used to adjust back xp and wp, and rshifted
+C to given an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
+C outer loop to take care of xp, wp and the inner loop counter.
+
+defframe(VAR_COUNTER, -20)
+defframe(VAR_ADJUST, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_XP_LOW, -32)
+deflit(VAR_EXTRA_SPACE, 16)
+
+
+L(unroll):
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ movl PARAM_XP, %esi
+ movl 4(%eax), %ebp C multiplier (yp second limb)
+ leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
+
+ movl PARAM_WP, %edi
+ movl %eax, PARAM_YP
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
+ decl %ecx C xsize-1
+
+ movl (%esi), %eax C xp low limb
+ andl $-UNROLL_MASK-1, %ebx
+ negl %ecx
+
+ subl $VAR_EXTRA_SPACE, %esp
+deflit(`FRAME',16+VAR_EXTRA_SPACE)
+ negl %ebx
+ andl $UNROLL_MASK, %ecx
+
+ movl %ebx, VAR_ADJUST
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ sarl $UNROLL_LOG2, %ebx
+
+ C 17 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_entry) (%ecx,%edx,1), %ecx
+')
+ negl %edx
+
+ movl %eax, VAR_XP_LOW
+ movl %ecx, VAR_JMP
+ leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling,
+ leal 4(%esi,%edx,4), %esi C and start at second limb
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%ecx,%edx,1), %ecx
+ addl $L(unroll_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret
+')
+
+
+C --------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_outer_top):
+ C ebp ysize counter, negative
+
+ movl VAR_ADJUST, %ebx
+ movl PARAM_YP, %edx
+
+ movl VAR_XP_LOW, %eax
+ movl %ebp, PARAM_YSIZE C store incremented ysize counter
+
+ leal 4(%edi,%ebx,4), %edi
+ leal (%esi,%ebx,4), %esi
+ sarl $UNROLL_LOG2, %ebx
+
+ movl (%edx,%ebp,4), %ebp C yp next multiplier
+ movl VAR_JMP, %ecx
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ testb $1, %cl C and clear carry bit
+ movl %ebx, VAR_COUNTER
+ movl $0, %ebx
+
+ movl $0, %ecx
+ cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb
+ cmovnz( %eax, %ebx)
+
+ C Extra fetch of VAR_JMP is bad, but registers are tight
+ jmp *VAR_JMP
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_top):
+ C eax xp limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi xp+8
+ C edi wp
+ C ebp yp multiplier limb
+ C
+ C VAR_COUNTER loop counter, negative
+ C
+ C 17 bytes each limb
+
+L(unroll_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ adcl %edx, %ebx
+
+ mull %ebp
+
+Zdisp( addl, %ecx, disp0,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+
+ movl disp1(%esi), %eax
+ adcl %edx, %ecx
+
+ mull %ebp
+
+ addl %ebx, disp1(%edi)
+ movl $0, %ebx
+
+ adcl %eax, %ecx
+')
+
+
+ incl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+ leal UNROLL_BYTES(%edi), %edi
+
+ jnz L(unroll_top)
+
+
+ C eax
+ C ebx zero
+ C ecx low
+ C edx high
+ C esi
+ C edi wp, pointing at second last limb)
+ C ebp
+ C
+ C carry flag to be added to high
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 4))
+
+ movl PARAM_YSIZE, %ebp
+ adcl $0, %edx
+ addl %ecx, disp0(%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, disp1(%edi)
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/sqr_basecase.asm b/rts/gmp/mpn/x86/k7/sqr_basecase.asm
new file mode 100644
index 0000000000..84861ea66b
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/sqr_basecase.asm
@@ -0,0 +1,627 @@
+dnl AMD K7 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
+dnl (measured on the speed difference between 25 and 50 limbs, which is
+dnl roughly the Karatsuba recursing range).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
+dnl some comments.
+
+deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
+
+ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
+`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
+deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C With a KARATSUBA_SQR_THRESHOLD around 50 this code is about 1500 bytes,
+C which is quite a bit, but is considered good value since squares big
+C enough to use most of the code will be spending quite a few cycles in it.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+ cmpl $2, %ecx
+
+ movl PARAM_DST, %edx
+ je L(two_limbs)
+ ja L(three_or_more)
+
+
+C------------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ecx size
+ C edx dst
+
+ movl (%eax), %eax
+ movl %edx, %ecx
+
+ mull %eax
+
+ movl %edx, 4(%ecx)
+ movl %eax, (%ecx)
+ ret
+
+
+C------------------------------------------------------------------------------
+C
+C Using the read/modify/write "add"s seems to be faster than saving and
+C restoring registers. Perhaps the loads for the first set hide under the
+C mul latency and the second gets store to load forwarding.
+
+ ALIGN(16)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+deflit(`FRAME',0)
+
+ pushl %ebx FRAME_pushl()
+ movl %eax, %ebx C src
+ movl (%eax), %eax
+
+ movl %edx, %ecx C dst
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx) C dst[1]
+
+ mull %eax C src[1]^2
+
+ movl %eax, 8(%ecx) C dst[2]
+ movl (%ebx), %eax
+
+ movl %edx, 12(%ecx) C dst[3]
+
+ mull 4(%ebx) C src[0]*src[1]
+
+ popl %ebx
+
+ addl %eax, 4(%ecx)
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+ ASSERT(nc)
+
+ addl %eax, 4(%ecx)
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+ ASSERT(nc)
+
+ ret
+
+
+C------------------------------------------------------------------------------
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(STACK_SPACE, 16)
+
+L(three_or_more):
+ subl $STACK_SPACE, %esp
+ cmpl $4, %ecx
+ jae L(four_or_more)
+deflit(`FRAME',STACK_SPACE)
+
+
+C------------------------------------------------------------------------------
+C Three limbs
+C
+C Writing out the loads and stores separately at the end of this code comes
+C out about 10 cycles faster than using adcls to memory.
+
+ C eax src
+ C ecx size
+ C edx dst
+
+ movl %ebx, SAVE_EBX
+ movl %eax, %ebx C src
+ movl (%eax), %eax
+
+ movl %edx, %ecx C dst
+ movl %esi, SAVE_ESI
+ movl %edi, SAVE_EDI
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+ movl %edx, 4(%ecx)
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl 8(%ebx), %eax
+ movl %edx, 12(%ecx)
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl (%ebx), %eax
+ movl %edx, 20(%ecx)
+
+ mull 4(%ebx) C src[0] * src[1]
+
+ movl %eax, %esi
+ movl (%ebx), %eax
+ movl %edx, %edi
+
+ mull 8(%ebx) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %ebp, SAVE_EBP
+ movl $0, %ebp
+
+ movl 4(%ebx), %eax
+ adcl %edx, %ebp
+
+ mull 8(%ebx) C src[1] * src[2]
+
+ xorl %ebx, %ebx
+ addl %eax, %ebp
+
+ adcl $0, %edx
+
+ C eax
+ C ebx zero, will be dst[5]
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %esi, %esi
+
+ adcl %edi, %edi
+ movl 4(%ecx), %eax
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+
+ adcl $0, %ebx
+ addl %eax, %esi
+ movl 8(%ecx), %eax
+
+ adcl %eax, %edi
+ movl 12(%ecx), %eax
+ movl %esi, 4(%ecx)
+
+ adcl %eax, %ebp
+ movl 16(%ecx), %eax
+ movl %edi, 8(%ecx)
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+
+ adcl %eax, %edx
+ movl 20(%ecx), %eax
+ movl %ebp, 12(%ecx)
+
+ adcl %ebx, %eax
+ ASSERT(nc)
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+
+ movl %edx, 16(%ecx)
+ movl %eax, 20(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+C------------------------------------------------------------------------------
+L(four_or_more):
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C Further products are added in rather than stored.
+
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+defframe(`VAR_COUNTER',-20)
+defframe(`VAR_JMP', -24)
+deflit(EXTRA_STACK_SPACE, 8)
+
+ movl %ebx, SAVE_EBX
+ movl %edi, SAVE_EDI
+ leal (%edx,%ecx,4), %edi C &dst[size]
+
+ movl %esi, SAVE_ESI
+ movl %ebp, SAVE_EBP
+ leal (%eax,%ecx,4), %esi C &src[size]
+
+ movl (%eax), %ebp C multiplier
+ movl $0, %ebx
+ decl %ecx
+
+ negl %ecx
+ subl $EXTRA_STACK_SPACE, %esp
+FRAME_subl_esp(EXTRA_STACK_SPACE)
+
+L(mul_1):
+ C eax scratch
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul_1)
+
+
+C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two products, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as in mpn_addmul_1, see that routine for
+C some comments.
+C
+C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K7 does branch prediction on indirect jumps, which is bad since it's a
+C different target each time. There seems no way to avoid this.
+
+dnl This value also hard coded in some shifts and adds
+deflit(CODE_BYTES_PER_LIMB, 17)
+
+dnl With the unmodified &src[size] and &dst[size] pointers, the
+dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl values up to 31, but above that an offset must be added to them.
+
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+dnl Because the last chunk of code is generated differently, a label placed
+dnl at the end doesn't work. Instead calculate the implied end using the
+dnl start and how many chunks of code there are.
+
+deflit(UNROLL_INNER_END,
+`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
+
+ C eax
+ C ebx carry
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, (%edi)
+
+ subl $4, %ecx
+ jz L(corner)
+
+ negl %ecx
+ifelse(OFFSET,0,,`subl $OFFSET, %edi')
+ifelse(OFFSET,0,,`subl $OFFSET, %esi')
+
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+
+
+ C The calculated jump mustn't come out to before the start of the
+ C code available. This is the limit UNROLL_COUNT puts on the src
+ C operand size, but checked here directly using the jump address.
+ ASSERT(ae,
+ `movl_text_address(L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx')
+
+
+C------------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx high limb to store
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi &src[size], constant
+ C edi dst ptr, high of last addmul
+ C ebp
+
+ movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier
+ movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand
+
+ movl %edx, VAR_COUNTER
+
+ mull %ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
+
+ testb $1, %cl
+ movl %edx, %ebx C high carry
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ cmovX( %ebx, %ecx) C high carry reverse
+ cmovX( %eax, %ebx) C low carry reverse
+
+ leal CODE_BYTES_PER_LIMB(%edx), %eax
+ xorl %edx, %edx
+ leal 4(%edi), %edi
+
+ movl %eax, VAR_JMP
+
+ jmp *%eax
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ addl (%esp), %ecx
+ addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
+ addl %edx, %ecx
+ ret
+')
+
+
+ C Must be an even address to preserve the significance of the low
+ C bit of the jump address indicating which way around ecx/ebx should
+ C start.
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax next limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src - 4))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%esi), %eax)
+ adcl %edx, %ebx
+
+ mull %ebp
+
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+',`
+ dnl this bit comes out last
+Zdisp( movl, disp_src,(%esi), %eax)
+ adcl %edx, %ecx
+
+ mull %ebp
+
+dnl Zdisp( addl %ebx, disp_src,(%edi))
+ addl %ebx, disp_dst(%edi)
+ifelse(forloop_last,0,
+` movl $0, %ebx')
+
+ adcl %eax, %ecx
+')
+')
+
+ C eax next limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ adcl $0, %edx
+ addl %ecx, -4+OFFSET(%edi)
+ movl VAR_JMP, %ecx
+
+ adcl $0, %edx
+
+ movl %edx, m4_empty_if_zero(OFFSET) (%edi)
+ movl VAR_COUNTER, %edx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %esi
+ addl $OFFSET, %edi
+')
+
+
+C------------------------------------------------------------------------------
+L(corner):
+ C esi &src[size]
+ C edi &dst[2*size-5]
+
+ movl -12(%esi), %ebp
+ movl -8(%esi), %eax
+ movl %eax, %ecx
+
+ mull %ebp
+
+ addl %eax, -4(%edi)
+ movl -4(%esi), %eax
+
+ adcl $0, %edx
+ movl %edx, %ebx
+ movl %eax, %esi
+
+ mull %ebp
+
+ addl %ebx, %eax
+
+ adcl $0, %edx
+ addl %eax, (%edi)
+ movl %esi, %eax
+
+ adcl $0, %edx
+ movl %edx, %ebx
+
+ mull %ecx
+
+ addl %ebx, %eax
+ movl %eax, 4(%edi)
+
+ adcl $0, %edx
+ movl %edx, 8(%edi)
+
+
+
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift_start):
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edi
+ xorl %ecx, %ecx C clear carry
+
+ leal (%edi,%eax,8), %edi
+ notl %eax C -size-1, preserve carry
+
+ leal 2(%eax), %eax C -(size-1)
+
+L(lshift):
+ C eax counter, negative
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi dst, pointing just after last limb
+ C ebp
+
+ rcll -4(%edi,%eax,8)
+ rcll (%edi,%eax,8)
+ incl %eax
+ jnz L(lshift)
+
+ setc %al
+
+ movl PARAM_SRC, %esi
+ movl %eax, -4(%edi) C dst most significant limb
+
+ movl PARAM_SIZE, %ecx
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+ movl (%esi), %eax C src[0]
+
+ mull %eax
+
+ leal (%esi,%ecx,4), %esi C src point just after last limb
+ negl %ecx
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+ incl %ecx
+
+L(diag):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, negative
+ C edx carry
+ C esi src just after last limb
+ C edi dst just after last limb
+ C ebp
+
+ movl (%esi,%ecx,4), %eax
+ movl %edx, %ebx
+
+ mull %eax
+
+ addl %ebx, -4(%edi,%ecx,8)
+ adcl %eax, (%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ addl %edx, -4(%edi) C dst most significant limb
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/lshift.asm b/rts/gmp/mpn/x86/lshift.asm
new file mode 100644
index 0000000000..4735335cbe
--- /dev/null
+++ b/rts/gmp/mpn/x86/lshift.asm
@@ -0,0 +1,90 @@
+dnl x86 mpn_lshift -- mpn left shift.
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+deflit(`FRAME',12)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%edx
+ movl PARAM_SHIFT,%ecx
+
+ subl $4,%esi C adjust src
+
+ movl (%esi,%edx,4),%ebx C read most significant limb
+ xorl %eax,%eax
+ shldl( %cl, %ebx, %eax) C compute carry limb
+ decl %edx
+ jz L(end)
+ pushl %eax C push carry limb onto stack
+ testb $1,%dl
+ jnz L(1) C enter loop in the middle
+ movl %ebx,%eax
+
+ ALIGN(8)
+L(oop): movl (%esi,%edx,4),%ebx C load next lower limb
+ shldl( %cl, %ebx, %eax) C compute result limb
+ movl %eax,(%edi,%edx,4) C store it
+ decl %edx
+L(1): movl (%esi,%edx,4),%eax
+ shldl( %cl, %eax, %ebx)
+ movl %ebx,(%edi,%edx,4)
+ decl %edx
+ jnz L(oop)
+
+ shll %cl,%eax C compute least significant limb
+ movl %eax,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+L(end): shll %cl,%ebx C compute least significant limb
+ movl %ebx,(%edi) C store it
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mod_1.asm b/rts/gmp/mpn/x86/mod_1.asm
new file mode 100644
index 0000000000..3908161b3e
--- /dev/null
+++ b/rts/gmp/mpn/x86/mod_1.asm
@@ -0,0 +1,141 @@
+dnl x86 mpn_mod_1 -- mpn by limb remainder.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl cycles/limb
+dnl K6 20
+dnl P5 44
+dnl P6 39
+dnl 486 approx 42 maybe
+dnl
+dnl The following have their own optimized mod_1 implementations, but for
+dnl reference the code here runs as follows.
+dnl
+dnl P6MMX 39
+dnl K7 41
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C Divide src,size by divisor and return the remainder. The quotient is
+C discarded.
+C
+C See mpn/x86/divrem_1.asm for some comments.
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %esi
+ orl %ecx, %ecx
+
+ movl PARAM_CARRY, %edx
+ jnz LF(mpn_mod_1,top)
+
+ popl %esi
+ movl %edx, %eax
+
+ popl %ebx
+
+ ret
+
+EPILOGUE()
+
+
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ pushl %esi FRAME_pushl()
+
+ orl %ecx, %ecx
+ jz L(done_zero)
+
+ movl PARAM_DIVISOR, %esi
+ movl -4(%ebx,%ecx,4), %eax C src high limb
+
+ cmpl %esi, %eax
+
+ sbbl %edx, %edx C -1 if high<divisor
+
+ addl %edx, %ecx C skip one division if high<divisor
+ jz L(done_eax)
+
+ andl %eax, %edx C carry if high<divisor
+
+
+L(top):
+ C eax scratch (quotient)
+ C ebx src
+ C ecx counter
+ C edx carry (remainder)
+ C esi divisor
+ C edi
+ C ebp
+
+ movl -4(%ebx,%ecx,4), %eax
+
+ divl %esi
+
+ loop_or_decljnz L(top)
+
+
+ movl %edx, %eax
+L(done_eax):
+ popl %esi
+
+ popl %ebx
+
+ ret
+
+
+L(done_zero):
+ popl %esi
+ xorl %eax, %eax
+
+ popl %ebx
+
+ ret
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mul_1.asm b/rts/gmp/mpn/x86/mul_1.asm
new file mode 100644
index 0000000000..8817f291bc
--- /dev/null
+++ b/rts/gmp/mpn/x86/mul_1.asm
@@ -0,0 +1,130 @@
+dnl x86 mpn_mul_1 (for 386, 486, and Pentium Pro) -- Multiply a limb vector
+dnl with a limb and store the result in a second limb vector.
+dnl
+dnl cycles/limb
+dnl P6: 5.5
+dnl
+dnl The following CPUs have their own optimized code, but for reference the
+dnl code here runs as follows.
+dnl
+dnl cycles/limb
+dnl P5: 12.5
+dnl K6: 10.5
+dnl K7: 4.5
+
+
+dnl Copyright (C) 1992, 1994, 1997, 1998, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ecx
+
+ xorl %ebx,%ebx
+ andl $3,%ecx
+ jz L(end0)
+
+L(oop0):
+ movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ leal 4(%esi),%esi
+ addl %ebx,%eax
+ movl $0,%ebx
+ adcl %ebx,%edx
+ movl %eax,(%edi)
+ movl %edx,%ebx C propagate carry into cylimb
+
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oop0)
+
+L(end0):
+ movl PARAM_SIZE,%ecx
+ shrl $2,%ecx
+ jz L(end)
+
+
+ ALIGN(8)
+L(oop): movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ addl %eax,%ebx
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 4(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebx,(%edi)
+ addl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl 8(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebp,4(%edi)
+ addl %eax,%ebx C new lo + cylimb
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 12(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebx,8(%edi)
+ addl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl %ebp,12(%edi)
+
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ecx
+ jnz L(oop)
+
+L(end): movl %ebx,%eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mul_basecase.asm b/rts/gmp/mpn/x86/mul_basecase.asm
new file mode 100644
index 0000000000..3a9b73895b
--- /dev/null
+++ b/rts/gmp/mpn/x86/mul_basecase.asm
@@ -0,0 +1,209 @@
+dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
+dnl in a third limb vector.
+
+
+dnl Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C This was written in haste since the Pentium optimized code that was used
+C for all x86 machines was slow for the Pentium II. This code would benefit
+C from some cleanup.
+C
+C To shave off some percentage of the run-time, one should make 4 variants
+C of the Louter loop, for the four different outcomes of un mod 4. That
+C would avoid Loop0 altogether. Code expansion would be > 4-fold for that
+C part of the function, but since it is not very large, that would be
+C acceptable.
+C
+C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
+C unknown.
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+defframe(VAR_MULTIPLIER, -4)
+defframe(VAR_COUNTER, -8)
+deflit(VAR_STACK_SPACE, 8)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ subl $VAR_STACK_SPACE,%esp
+ pushl %esi
+ pushl %ebp
+ pushl %edi
+deflit(`FRAME',eval(VAR_STACK_SPACE+12))
+
+ movl PARAM_XP,%esi
+ movl PARAM_WP,%edi
+ movl PARAM_YP,%ebp
+
+ movl (%esi),%eax C load xp[0]
+ mull (%ebp) C multiply by yp[0]
+ movl %eax,(%edi) C store to wp[0]
+ movl PARAM_XSIZE,%ecx C xsize
+ decl %ecx C If xsize = 1, ysize = 1 too
+ jz L(done)
+
+ pushl %ebx
+FRAME_pushl()
+ movl %edx,%ebx
+
+ leal 4(%esi),%esi
+ leal 4(%edi),%edi
+
+L(oopM):
+ movl (%esi),%eax C load next limb at xp[j]
+ leal 4(%esi),%esi
+ mull (%ebp)
+ addl %ebx,%eax
+ movl %edx,%ebx
+ adcl $0,%ebx
+ movl %eax,(%edi)
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oopM)
+
+ movl %ebx,(%edi) C most significant limb of product
+ addl $4,%edi C increment wp
+ movl PARAM_XSIZE,%eax
+ shll $2,%eax
+ subl %eax,%edi
+ subl %eax,%esi
+
+ movl PARAM_YSIZE,%eax C ysize
+ decl %eax
+ jz L(skip)
+ movl %eax,VAR_COUNTER C set index i to ysize
+
+L(outer):
+ movl PARAM_YP,%ebp C yp
+ addl $4,%ebp C make ebp point to next v limb
+ movl %ebp,PARAM_YP
+ movl (%ebp),%eax C copy y limb ...
+ movl %eax,VAR_MULTIPLIER C ... to stack slot
+ movl PARAM_XSIZE,%ecx
+
+ xorl %ebx,%ebx
+ andl $3,%ecx
+ jz L(end0)
+
+L(oop0):
+ movl (%esi),%eax
+ mull VAR_MULTIPLIER
+ leal 4(%esi),%esi
+ addl %ebx,%eax
+ movl $0,%ebx
+ adcl %ebx,%edx
+ addl %eax,(%edi)
+ adcl %edx,%ebx C propagate carry into cylimb
+
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oop0)
+
+L(end0):
+ movl PARAM_XSIZE,%ecx
+ shrl $2,%ecx
+ jz L(endX)
+
+ ALIGN(8)
+L(oopX):
+ movl (%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %eax,%ebx
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 4(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebx,(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl 8(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebp,4(%edi)
+ adcl %eax,%ebx C new lo + cylimb
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 12(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebx,8(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ addl %ebp,12(%edi)
+ adcl $0,%ebx C propagate carry into cylimb
+
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ecx
+ jnz L(oopX)
+
+L(endX):
+ movl %ebx,(%edi)
+ addl $4,%edi
+
+ C we incremented wp and xp in the loop above; compensate
+ movl PARAM_XSIZE,%eax
+ shll $2,%eax
+ subl %eax,%edi
+ subl %eax,%esi
+
+ movl VAR_COUNTER,%eax
+ decl %eax
+ movl %eax,VAR_COUNTER
+ jnz L(outer)
+
+L(skip):
+ popl %ebx
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $8,%esp
+ ret
+
+L(done):
+ movl %edx,4(%edi) C store to wp[1]
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $8,%esp
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/README b/rts/gmp/mpn/x86/p6/README
new file mode 100644
index 0000000000..7dbc905a0d
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/README
@@ -0,0 +1,95 @@
+
+ INTEL P6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for Intel P6 class CPUs, meaning
+PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories
+have routines using MMX instructions.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+Some of these might be able to be improved.
+
+ cycles/limb
+
+ mpn_add_n/sub_n 3.7
+
+ mpn_copyi 0.75
+ mpn_copyd 2.4
+
+ mpn_divrem_1 39.0
+ mpn_mod_1 39.0
+ mpn_divexact_by3 8.5
+
+ mpn_mul_1 5.5
+ mpn_addmul/submul_1 6.35
+
+ mpn_l/rshift 2.5
+
+ mpn_mul_basecase 8.2 cycles/crossproduct (approx)
+ mpn_sqr_basecase 4.0 cycles/crossproduct (approx)
+ or 7.75 cycles/triangleproduct (approx)
+
+Pentium II and III have MMX and get the following improvements.
+
+ mpn_divrem_1 25.0 integer part, 17.5 fractional part
+ mpn_mod_1 24.0
+
+ mpn_l/rshift 1.75
+
+
+
+
+NOTES
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Mispredicted branches have a penalty of between 9 and 15 cycles, and even up
+to 26 cycles depending how far speculative execution has gone. The 9 cycle
+minimum penalty comes from the issue pipeline being 9 stages.
+
+A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4,
+5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3
+cycles per 16 byte block.
+
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three instructions with no successive
+dependencies, and with only the first being a multiple micro-op.
+
+P6 has out-of-order execution, so the groupings are really only showing
+dependent paths where some shuffling might allow some latencies to be
+hidden.
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated
+02/99, order number 245127 (order number 730795-001 is in the document too).
+Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/245127.htm
+
+"Intel Architecture Optimization Manual", 1997, order number 242816. This
+is an older document mostly about P5 and not as good as the above.
+Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/p6/aorsmul_1.asm b/rts/gmp/mpn/x86/p6/aorsmul_1.asm
new file mode 100644
index 0000000000..feb364ec0b
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/aorsmul_1.asm
@@ -0,0 +1,300 @@
+dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+dnl
+dnl P6: 6.35 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl P6 UNROLL_COUNT cycles/limb
+dnl 8 6.7
+dnl 16 6.35
+dnl 32 6.3
+dnl 64 6.3
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+C
+C This code is pretty much the same as the K6 code. The unrolled loop is
+C the same, but there's just a few scheduling tweaks in the setups and the
+C simple loop.
+C
+C A number of variations have been tried for the unrolled loop, with one or
+C two carries, and with loads scheduled earlier, but nothing faster than 6
+C cycles/limb has been found.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+ pushl %ebx
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %ebx
+ jmp LF(M4_function_1,start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+ push %ebx
+deflit(`FRAME',4)
+ xorl %ebx, %ebx C initial carry
+
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ pushl %esi
+deflit(`FRAME',8)
+
+ movl PARAM_SRC, %esi
+ pushl %edi
+deflit(`FRAME',12)
+
+ movl PARAM_DST, %edi
+ pushl %ebp
+deflit(`FRAME',16)
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_MULTIPLIER, %ebp
+ jae L(unroll)
+
+
+ C simple loop
+ C this is offset 0x22, so close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ movl (%esi), %eax
+ addl $4, %edi
+
+ mull %ebp
+
+ addl %ebx, %eax
+ adcl $0, %edx
+
+ M4_inst %eax, -4(%edi)
+ movl %edx, %ebx
+
+ adcl $0, %ebx
+ decl %ecx
+
+ leal 4(%esi), %esi
+ jnz L(simple)
+
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ movl %ebx, %eax
+
+ popl %ebx
+ ret
+
+
+
+C------------------------------------------------------------------------------
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers when doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %ebx is necessary only for the
+C mpn_add/submul_1c entry points. Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl overlapping with parameters already fetched
+define(VAR_COUNTER,`PARAM_SIZE')
+define(VAR_JUMP, `PARAM_DST')
+
+ C this is offset 0x43, so close enough to aligned
+L(unroll):
+ C eax
+ C ebx initial carry
+ C ecx size
+ C edx
+ C esi src
+ C edi dst
+ C ebp
+
+ movl %ecx, %edx
+ decl %ecx
+
+ subl $2, %edx
+ negl %ecx
+
+ shrl $UNROLL_LOG2, %edx
+ andl $UNROLL_MASK, %ecx
+
+ movl %edx, VAR_COUNTER
+ movl %ecx, %edx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ shll $4, %edx
+ negl %ecx
+
+ leal L(entry) (%edx,%ecx,1), %edx
+')
+ movl (%esi), %eax C src low limb
+
+ movl %edx, VAR_JUMP
+ leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
+
+ mull %ebp
+
+ addl %ebx, %eax C initial carry (from _1c)
+ adcl $0, %edx
+
+ movl %edx, %ebx C high carry
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
+
+ movl VAR_JUMP, %edx
+ testl $1, %ecx
+ movl %eax, %ecx C low carry
+
+ cmovnz( %ebx, %ecx) C high,low carry other way around
+ cmovnz( %eax, %ebx)
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ shll $4, %edx
+ negl %ecx
+
+ C See README.family about old gas bugs
+ leal (%edx,%ecx,1), %edx
+ addl $L(entry)-L(here), %edx
+
+ addl (%esp), %edx
+
+ ret
+')
+
+
+C -----------------------------------------------------------
+ ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+ C eax scratch
+ C ebx carry hi
+ C ecx carry lo
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+ C
+ C VAR_COUNTER loop counter
+ C
+ C 15 code bytes per limb
+
+ addl $UNROLL_BYTES, %edi
+
+L(entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ mull %ebp
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ adcl %eax, %ebx
+ movl %edx, %ecx
+ adcl $0, %ecx
+
+ movl disp1(%esi), %eax
+ mull %ebp
+ M4_inst %ebx, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %ebx
+ adcl $0, %ebx
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+
+ jns L(top)
+
+
+deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
+
+ M4_inst %ecx, disp0(%edi)
+ movl %ebx, %eax
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ popl %ebx
+ adcl $0, %eax
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/diveby3.asm b/rts/gmp/mpn/x86/p6/diveby3.asm
new file mode 100644
index 0000000000..a77703ea89
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/diveby3.asm
@@ -0,0 +1,37 @@
+dnl Intel P6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl P6: 8.5 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl The P5 code runs well on P6, in fact better than anything else found so
+dnl far. An imul is 4 cycles, meaning the two cmp/sbbl pairs on the
+dnl dependent path are taking 4.5 cycles.
+dnl
+dnl The destination cache line prefetching is unnecessary on P6, but
+dnl removing it is a 2 cycle slowdown (approx), so it must be inducing
+dnl something good in the out of order execution.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_divexact_by3c)
+include_mpn(`x86/pentium/diveby3.asm')
diff --git a/rts/gmp/mpn/x86/p6/gmp-mparam.h b/rts/gmp/mpn/x86/p6/gmp-mparam.h
new file mode 100644
index 0000000000..d7bfb6d60c
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/gmp-mparam.h
@@ -0,0 +1,96 @@
+/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 5 /* cycles */
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 39 /* cycles */
+#endif
+
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 2 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 23
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 139
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 52
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 166
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 116
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 66
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 20
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 54
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 592, 1440, 2688, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 608
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 5888
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 656, 1504, 2944, 6656, 18432, 57344, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 672
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 5888
+#endif
diff --git a/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm
new file mode 100644
index 0000000000..f1b011b623
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm
@@ -0,0 +1,677 @@
+dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
+dnl
+dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
+C see that file for some comments. It's likely what's here can be improved.
+
+
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The different speeds of the integer and fraction parts means that using
+dnl xsize+size isn't quite right. The threshold wants to be a bit higher
+dnl for the integer part and a bit lower for the fraction part. (Or what's
+dnl really wanted is to speed up the integer part!)
+dnl
+dnl The threshold is set to make the integer part right. At 4 limbs the
+dnl div and mul are about the same there, but on the fractional part the
+dnl mul is much faster.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_CARRY, 24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC, -28)
+defframe(VAR_DST, -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ leal -4(%edi,%ebx,4), %edi
+ jmp LF(mpn_divrem_1,start_1c)
+
+EPILOGUE()
+
+
+ C offset 0x31, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ orl %ecx, %ecx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+ jz L(no_skip_div)
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+ cmpl %ebp, %eax C one less div if high<divisor
+ jnb L(no_skip_div)
+
+ movl $0, (%edi,%ecx,4) C dst high limb
+ decl %ecx C size-1
+ movl %eax, %edx C src high limb as initial carry
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer)
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx scratch (remainder)
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl %ebp
+
+ movl %eax, (%edi,%ecx,4)
+ decl %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ movl PARAM_DST, %edi
+ orl %ebx, %ebx
+ jnz L(divide_fraction)
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ movl %edx, %eax
+
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx scratch (remainder)
+ C esi
+ C edi dst
+ C ebp divisor
+
+ movl $0, %eax
+
+ divl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ decl %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal 12(%edi), %ebx
+
+ movl %ebx, VAR_DST_STOP
+ leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
+
+ movl %edi, VAR_DST
+ movl %ecx, %ebx C size
+
+ bsrl %ebp, %ecx C 31-l
+ movl %edx, %edi C carry
+
+ leal 1(%ecx), %eax C 32-l
+ xorl $31, %ecx C l
+
+ movl %ecx, VAR_NORM
+ movl $-1, %edx
+
+ shll %cl, %ebp C d normalized
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ movl %eax, VAR_INVERSE
+ orl %ebx, %ebx C size
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl %eax, VAR_SRC
+ jz L(start_zero)
+
+ movl 8(%eax), %esi C src high limb
+ cmpl $1, %ebx
+ jz L(start_one)
+
+L(start_two_or_more):
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ cmpl $2, %ebx
+ je L(integer_two_left)
+ jmp L(integer_top)
+
+
+L(start_one):
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shll %cl, %esi C n10 = high << l
+ jmp L(integer_one_left)
+
+
+L(start_zero):
+ shll %cl, %edi C n2 = carry << l
+ movl $0, %esi C n10 = 0
+
+ C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
+ C must have xsize!=0
+ jmp L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C This loop runs at about 25 cycles, which is probably sub-optimal, and
+C certainly more than the dependent chain would suggest. A better loop, or
+C a better rough analysis of what's possible, would be welcomed.
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C uops
+C n2+n1 1 (addl)
+C mul 5
+C q1+1 3 (addl/adcl)
+C mul 5
+C sub 3 (subl/sbbl)
+C addback 2 (cmov)
+C ---
+C 19
+C
+C Lack of registers hinders explicit scheduling and it might be that the
+C normal out of order execution isn't able to hide enough under the mul
+C latencies.
+C
+C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
+C cmov (and takes one uop off the dependent chain). A sarl/andl/addl
+C combination was tried for the addback (despite the fact it would lengthen
+C the dependent chain) but found to be no faster.
+
+
+ ALIGN(16)
+L(integer_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp d
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl VAR_SRC, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+ movq (%ecx), %mm0 C next src limb and the one below it
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ subl $4, %ecx
+
+ movl %ecx, VAR_SRC
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ movl %ebp, %eax C d
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+
+ mull %ebx C (q1+1)*d
+
+ movl VAR_DST, %ecx
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl VAR_DST_STOP, %eax
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+ sbbl $0, %ebx C q
+ subl $4, %ecx
+
+ movl %ebx, (%ecx)
+ cmpl %eax, %ecx
+
+ movl %ecx, VAR_DST
+ jne L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case. This make the code a bit smaller and simpler, and
+C costs only 2 cycles (each).
+
+L(integer_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl PARAM_SRC, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd (%ecx), %mm0 C src low limb
+
+ movl VAR_DST_STOP, %ecx
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+ sbbl $0, %ebx C q
+
+ movl %ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl VAR_DST_STOP, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx C q1 if q1+1 overflowed
+
+ mull %ebx
+
+ C
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl PARAM_XSIZE, %eax
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+
+ sbbl $0, %ebx C q
+
+ movl %ebx, -8(%ecx)
+ subl $8, %ecx
+
+
+
+ orl %eax, %eax C xsize
+ jnz L(fraction_some)
+
+ movl %edi, %eax
+L(fraction_done):
+ movl VAR_NORM, %ecx
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ shrl %cl, %eax
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx
+ C edx
+ C esi n10
+ C edi n2
+ C ebp divisor
+
+ movl VAR_DST, %ecx
+ movl VAR_DST_STOP, %edx
+ subl $4, %ecx
+
+ movl %ecx, VAR_DST
+ psrlq %mm7, %mm0
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+
+ movl $-1, (%ecx)
+ movd %mm0, %esi C next n10
+
+ cmpl %ecx, %edx
+ jne L(integer_top)
+
+ jmp L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C uops
+C mul 5
+C q1+1 1 (addl)
+C mul 5
+C sub 3 (negl/sbbl)
+C addback 2 (cmov)
+C ---
+C 16
+C
+C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for
+C the addback was found to be a touch slower.
+
+
+ ALIGN(16)
+L(fraction_some):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi carry
+ C ebp divisor
+
+ movl PARAM_DST, %esi
+ movl VAR_DST_STOP, %ecx
+ movl %edi, %eax
+
+ subl $8, %ecx
+
+
+ ALIGN(16)
+L(fraction_top):
+ C eax n2, then scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst, decrementing
+ C edx scratch
+ C esi dst stop point
+ C edi n2
+ C ebp divisor
+
+ mull VAR_INVERSE C m*n2
+
+ movl %ebp, %eax C d
+ subl $4, %ecx C dst
+ leal 1(%edi), %ebx
+
+ C
+
+ C
+
+ C
+
+ addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
+
+ mull %ebx C (q1+1)*d
+
+ C
+
+ C
+
+ C
+
+ C
+
+ negl %eax C low of n - (q1+1)*d
+
+ sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
+ leal (%ebp,%eax), %edx
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+
+ sbbl $0, %ebx C q
+ movl %eax, %edi C remainder->n2
+ cmpl %esi, %ecx
+
+ movl %ebx, (%ecx) C previous q
+ jne L(fraction_top)
+
+
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/mmx/mod_1.asm b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm
new file mode 100644
index 0000000000..e7d8d94d33
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm
@@ -0,0 +1,444 @@
+dnl Intel Pentium-II mpn_mod_1 -- mpn by limb remainder.
+dnl
+dnl P6MMX: 24.0 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C The code here very similar to mpn_divrem_1, but with the quotient
+C discarded. What's here probably isn't optimal.
+C
+C See mpn/x86/p6/mmx/divrem_1.c and mpn/x86/k7/mmx/mod_1.asm for some
+C comments.
+
+
+dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC_STOP,-28)
+
+deflit(STACK_SPACE, 28)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ jmp LF(mpn_mod_1,start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl $0, %edx C initial carry (if can't skip a div)
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C carry flag if high<divisor
+
+ cmovc( %eax, %edx) C src high limb as initial carry
+ sbbl $0, %ecx C size-1 to skip one div
+ jz L(divide_done)
+
+
+ ALIGN(16)
+L(start_1c):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ cmpl $MUL_THRESHOLD, %ecx
+ jae L(mul_by_inverse)
+
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx scratch (remainder)
+ C esi src
+ C edi
+ C ebp
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl %ebp
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl %edx, %eax
+
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ movl %ebx, SAVE_EBX
+ leal -4(%esi), %ebx
+
+ movl %ebx, VAR_SRC_STOP
+ movl %ecx, %ebx C size
+
+ movl %edi, SAVE_EDI
+ movl %edx, %edi C carry
+
+ bsrl %ebp, %ecx C 31-l
+ movl $-1, %edx
+
+ leal 1(%ecx), %eax C 32-l
+ xorl $31, %ecx C l
+
+ movl %ecx, VAR_NORM
+ shll %cl, %ebp C d normalized
+
+ movd %eax, %mm7
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ C
+
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl 8(%eax), %esi C src high limb
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ movl %eax, %ecx C &src[size-3]
+
+
+ifelse(MUL_THRESHOLD,2,`
+ cmpl $2, %ebx
+ je L(inverse_two_left)
+')
+
+
+C The dependent chain here is the same as in mpn_divrem_1, but a few
+C instructions are saved by not needing to store the quotient limbs. This
+C gets it down to 24 c/l, which is still a bit away from a theoretical 19
+C c/l.
+
+ ALIGN(16)
+L(inverse_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx src pointer, decrementing
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next src limb and the one below it
+ subl $4, %ecx
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+ movl VAR_SRC_STOP, %ebx
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+ cmpl %ebx, %ecx
+
+ jne L(inverse_top)
+
+
+L(inverse_loop_done):
+
+
+C -----------------------------------------------------------------------------
+
+L(inverse_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx &src[-1]
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src dword)
+ C mm7 rshift
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd 4(%ecx), %mm0 C src low limb
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+ movl %ebp, %eax C d
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+
+C One limb left
+
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movl VAR_NORM, %ecx C for final denorm
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+ movl %ebp, %eax C d
+
+ mull %ebx C (q1+1)*d
+
+ movl SAVE_EBX, %ebx
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ leal (%ebp,%esi), %edx
+ movl SAVE_EBP, %ebp
+
+ movl %esi, %eax C remainder
+ movl SAVE_ESI, %esi
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ movl SAVE_EDI, %edi
+
+ shrl %cl, %eax C denorm remainder
+ addl $STACK_SPACE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx src pointer
+ C edx
+ C esi n10
+ C edi (n2)
+ C ebp divisor
+
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ movl VAR_SRC_STOP, %edx
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi C next n10
+ cmpl %ecx, %edx
+ jne L(inverse_top)
+
+ jmp L(inverse_loop_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/mmx/popham.asm b/rts/gmp/mpn/x86/p6/mmx/popham.asm
new file mode 100644
index 0000000000..50f9a11218
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/mmx/popham.asm
@@ -0,0 +1,31 @@
+dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and
+dnl hamming distance.
+dnl
+dnl P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb
+dnl (approx)
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/p6/p3mmx/popham.asm b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm
new file mode 100644
index 0000000000..e63fbf334b
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm
@@ -0,0 +1,30 @@
+dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and
+dnl hamming distance.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Haven't actually measured it, but the K7 code with the psadbw should be
+dnl good on P-III.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/p6/sqr_basecase.asm b/rts/gmp/mpn/x86/p6/sqr_basecase.asm
new file mode 100644
index 0000000000..174c78406a
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/sqr_basecase.asm
@@ -0,0 +1,641 @@
+dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
+dnl product (measured on the speed difference between 20 and 40 limbs,
+dnl which is the Karatsuba recursing range).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
+dnl a description. The only difference here is that UNROLL_COUNT can go up
+dnl to 64 (not 63) making KARATSUBA_SQR_THRESHOLD_MAX 67.
+
+deflit(KARATSUBA_SQR_THRESHOLD_MAX, 67)
+
+ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
+`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
+deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed so
+C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 40x40 that do use the full
+C unrolling will least be making good use of it, because 40x40 will take
+C something like 7000 cycles.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+ je L(two_limbs)
+
+ movl (%eax), %eax
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src limb
+ C ebx
+ C ecx dst
+ C edx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+defframe(SAVE_ESI, -4)
+defframe(SAVE_EBX, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(`STACK_SPACE',16)
+
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl %eax, %esi
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl 4(%esi), %eax
+
+ movl %ebx, SAVE_EBX
+ movl %edx, %ebx C dst[1]
+
+ mull %eax C src[1]^2
+
+ movl %edi, SAVE_EDI
+ movl %eax, %edi C dst[2]
+ movl (%esi), %eax
+
+ movl %ebp, SAVE_EBP
+ movl %edx, %ebp C dst[3]
+
+ mull 4(%esi) C src[0]*src[1]
+
+ addl %eax, %ebx
+ movl SAVE_ESI, %esi
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %ebx, %eax
+ movl SAVE_EBX, %ebx
+
+ adcl %edi, %edx
+ movl SAVE_EDI, %edi
+
+ adcl $0, %ebp
+
+ movl %eax, 4(%ecx)
+
+ movl %ebp, 12(%ecx)
+ movl SAVE_EBP, %ebp
+
+ movl %edx, 8(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx size
+deflit(`FRAME',0)
+
+ pushl %esi defframe_pushl(`SAVE_ESI')
+ cmpl $4, %edx
+
+ movl PARAM_SRC, %esi
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx
+ C esi src
+ C edi
+ C ebp
+
+ pushl %ebp defframe_pushl(`SAVE_EBP')
+ pushl %edi defframe_pushl(`SAVE_EDI')
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ movl 4(%esi), %eax
+ xorl %ebp, %ebp
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl %edx, 12(%ecx)
+ movl 8(%esi), %eax
+
+ pushl %ebx defframe_pushl(`SAVE_EBX')
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl %edx, 20(%ecx)
+
+ movl (%esi), %eax
+
+ mull 4(%esi) C src[0] * src[1]
+
+ movl %eax, %ebx
+ movl %edx, %edi
+
+ movl (%esi), %eax
+
+ mull 8(%esi) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %edx, %ebp
+
+ adcl $0, %ebp
+ movl 4(%esi), %eax
+
+ mull 8(%esi) C src[1] * src[2]
+
+ xorl %esi, %esi
+ addl %eax, %ebp
+
+ C eax
+ C ebx dst[1]
+ C ecx dst
+ C edx dst[4]
+ C esi zero, will be dst[5]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %ebx, %ebx
+
+ adcl %edi, %edi
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+ movl 4(%ecx), %eax
+
+ adcl $0, %esi
+ addl %ebx, %eax
+
+ movl %eax, 4(%ecx)
+ movl 8(%ecx), %eax
+
+ adcl %edi, %eax
+ movl 12(%ecx), %ebx
+
+ adcl %ebp, %ebx
+ movl 16(%ecx), %edi
+
+ movl %eax, 8(%ecx)
+ movl SAVE_EBP, %ebp
+
+ movl %ebx, 12(%ecx)
+ movl SAVE_EBX, %ebx
+
+ adcl %edx, %edi
+ movl 20(%ecx), %eax
+
+ movl %edi, 16(%ecx)
+ movl SAVE_EDI, %edi
+
+ adcl %esi, %eax C no carry out of this
+ movl SAVE_ESI, %esi
+
+ movl %eax, 20(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP, -24)
+deflit(`STACK_SPACE',24)
+
+L(four_or_more):
+ C eax src low limb
+ C ebx
+ C ecx
+ C edx size
+ C esi src
+ C edi
+ C ebp
+deflit(`FRAME',4) dnl %esi already pushed
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+ subl $STACK_SPACE-FRAME, %esp
+deflit(`FRAME',STACK_SPACE)
+ movl $1, %ecx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+ subl %edx, %ecx C -(size-1)
+
+ movl %ebp, SAVE_EBP
+ movl $0, %ebx C initial carry
+
+ leal (%esi,%edx,4), %esi C &src[size]
+ movl %eax, %ebp C multiplier
+
+ leal -4(%edi,%edx,4), %edi C &dst[size-1]
+
+
+C This loop runs at just over 6 c/l.
+
+L(mul_1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, limbs, negative, -(size-1) to -1
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size-1]
+ C ebp multiplier
+
+ movl %ebp, %eax
+
+ mull (%esi,%ecx,4)
+
+ addl %ebx, %eax
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ movl %eax, 4(%edi,%ecx,4)
+
+ incl %ecx
+ jnz L(mul_1)
+
+
+ movl %ebx, 4(%edi)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+
+dnl This is also hard-coded in the address calculation below.
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl With &src[size] and &dst[size-1] pointers, the displacements in the
+dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
+dnl that an offset must be added to them.
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>32),1,
+eval((UNROLL_COUNT-32)*4),
+0))
+
+ C eax
+ C ebx carry
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size-1]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+
+ subl $4, %ecx
+ jz L(corner)
+
+ movl %ecx, %edx
+ negl %ecx
+
+ shll $4, %ecx
+ifelse(OFFSET,0,,`subl $OFFSET, %esi')
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+ negl %edx
+
+ifelse(OFFSET,0,,`subl $OFFSET, %edi')
+
+ C The calculated jump mustn't be before the start of the available
+ C code. This is the limit that UNROLL_COUNT puts on the src operand
+ C size, but checked here using the jump address directly.
+
+ ASSERT(ae,
+ `movl_text_address( L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx high limb to store
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi &src[size], constant
+ C edi dst ptr, second highest limb of last addmul
+ C ebp
+
+ movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
+ movl %edx, VAR_COUNTER
+
+ movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
+
+ mull %ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
+
+ testb $1, %cl
+
+ movl %edx, %ebx C high carry
+ leal 4(%edi), %edi
+
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ leal CODE_BYTES_PER_LIMB(%edx), %edx
+
+ cmovX( %ebx, %ecx) C high carry reverse
+ cmovX( %eax, %ebx) C low carry reverse
+ movl %edx, VAR_JMP
+ jmp *%edx
+
+
+ C Must be on an even address here so the low bit of the jump address
+ C will indicate which way around ecx/ebx should start.
+
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax scratch
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src pointer
+ C edi dst pointer
+ C ebp multiplier
+ C
+ C 15 code bytes each limb
+ C ecx/ebx reversed on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ebx, disp_dst,(%edi))
+ adcl %eax, %ecx
+ movl %edx, %ebx
+ adcl $0, %ebx
+',`
+ dnl this one comes out last
+Zdisp( movl, disp_src,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ adcl %eax, %ebx
+ movl %edx, %ecx
+ adcl $0, %ecx
+')
+')
+L(unroll_inner_end):
+
+ addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
+
+ movl VAR_COUNTER, %edx
+ adcl $0, %ecx
+
+ movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
+ movl VAR_JMP, %ecx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %esi
+ addl $OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(corner):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[2*size-5]
+ C ebp
+
+ movl -12(%esi), %eax
+
+ mull -8(%esi)
+
+ addl %eax, (%edi)
+ movl -12(%esi), %eax
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+
+ mull -4(%esi)
+
+ addl %eax, %ebx
+ movl -8(%esi), %eax
+
+ adcl $0, %edx
+
+ addl %ebx, 4(%edi)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+
+ mull -4(%esi)
+
+ movl PARAM_SIZE, %ecx
+ addl %ebx, %eax
+
+ adcl $0, %edx
+
+ movl %eax, 8(%edi)
+
+ movl %edx, 12(%edi)
+ movl PARAM_DST, %edi
+
+
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+
+ subl $1, %ecx C size-1
+ xorl %eax, %eax C ready for final adcl, and clear carry
+
+ movl %ecx, %edx
+ movl PARAM_SRC, %esi
+
+
+L(lshift):
+ C eax
+ C ebx
+ C ecx counter, size-1 to 1
+ C edx size-1 (for later use)
+ C esi src (for later use)
+ C edi dst, incrementing
+ C ebp
+
+ rcll 4(%edi)
+ rcll 8(%edi)
+
+ leal 8(%edi), %edi
+ decl %ecx
+ jnz L(lshift)
+
+
+ adcl %eax, %eax
+
+ movl %eax, 4(%edi) C dst most significant limb
+ movl (%esi), %eax C src[0]
+
+ leal 4(%esi,%edx,4), %esi C &src[size]
+ subl %edx, %ecx C -(size-1)
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+
+ mull %eax
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+
+
+L(diag):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, negative
+ C edx carry
+ C esi &src[size]
+ C edi dst[2*size-2]
+ C ebp
+
+ movl (%esi,%ecx,4), %eax
+ movl %edx, %ebx
+
+ mull %eax
+
+ addl %ebx, 4(%edi,%ecx,8)
+ adcl %eax, 8(%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ addl %edx, 4(%edi) C dst most significant limb
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+ addl (%esp), %ecx
+ addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+ addl %edx, %ecx
+ ret
+')
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/README b/rts/gmp/mpn/x86/pentium/README
new file mode 100644
index 0000000000..3b9ec8ac6f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/README
@@ -0,0 +1,77 @@
+
+ INTEL PENTIUM P5 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium (P5,P54)
+processors. The mmx subdirectory has code for Pentium with MMX (P55).
+
+
+STATUS
+
+ cycles/limb
+
+ mpn_add_n/sub_n 2.375
+
+ mpn_copyi/copyd 1.0
+
+ mpn_divrem_1 44.0
+ mpn_mod_1 44.0
+ mpn_divexact_by3 15.0
+
+ mpn_l/rshift 5.375 normal (6.0 on P54)
+ 1.875 special shift by 1 bit
+
+ mpn_mul_1 13.0
+ mpn_add/submul_1 14.0
+
+ mpn_mul_basecase 14.2 cycles/crossproduct (approx)
+
+ mpn_sqr_basecase 8 cycles/crossproduct (approx)
+ or 15.5 cycles/triangleproduct (approx)
+
+Pentium MMX gets the following improvements
+
+ mpn_l/rshift 1.75
+
+
+1. mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the
+documentation indicates that they should take only 43/8 = 5.375 cycles/limb,
+or 5 cycles/limb asymptotically. The P55 runs them at the expected speed.
+
+2. mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop
+overhead and other delays (cache refill?), they run at or near 2.5 cycles/limb.
+
+3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they
+should. Intel documentation says a mul instruction is 10 cycles, but it
+measures 9 and the routines using it run with it as 9.
+
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+1. Pentium doesn't allocate cache lines on writes, unlike most other modern
+processors. Since the functions in the mpn class do array writes, we have to
+handle allocating the destination cache lines by reading a word from it in the
+loops, to achieve the best performance.
+
+2. Pairing of memory operations requires that the two issued operations refer
+to different cache banks.  The simplest way to ensure this is to read/write
+two words from the same object. If we make operations on different objects,
+they might or might not be to the same cache bank.
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Manual", 1997, order number 242816. This
+is mostly about P5, the parts about P6 aren't relevant. Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/pentium/aors_n.asm b/rts/gmp/mpn/x86/pentium/aors_n.asm
new file mode 100644
index 0000000000..a61082a456
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/aors_n.asm
@@ -0,0 +1,196 @@
+dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+dnl
+dnl P5: 2.375 cycles/limb
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_add_n',`
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(M4_function_nc)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%ebp
+ movl PARAM_SIZE,%ecx
+
+ movl (%ebp),%ebx
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx C zero carry flag
+ jz L(endgo)
+
+ pushl %edx
+FRAME_pushl()
+ movl PARAM_CARRY,%eax
+ shrl $1,%eax C shift bit 0 into carry
+ jmp LF(M4_function_n,oop)
+
+L(endgo):
+deflit(`FRAME',16)
+ movl PARAM_CARRY,%eax
+ shrl $1,%eax C shift bit 0 into carry
+ jmp LF(M4_function_n,end)
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(M4_function_n)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%ebp
+ movl PARAM_SIZE,%ecx
+
+ movl (%ebp),%ebx
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx C zero carry flag
+ jz L(end)
+ pushl %edx
+FRAME_pushl()
+
+ ALIGN(8)
+L(oop): movl 28(%edi),%eax C fetch destination cache line
+ leal 32(%edi),%edi
+
+L(1): movl (%esi),%eax
+ movl 4(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 4(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 8(%ebp),%ebx
+ movl %eax,-32(%edi)
+ movl %edx,-28(%edi)
+
+L(2): movl 8(%esi),%eax
+ movl 12(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 12(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 16(%ebp),%ebx
+ movl %eax,-24(%edi)
+ movl %edx,-20(%edi)
+
+L(3): movl 16(%esi),%eax
+ movl 20(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 20(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 24(%ebp),%ebx
+ movl %eax,-16(%edi)
+ movl %edx,-12(%edi)
+
+L(4): movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 28(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 32(%ebp),%ebx
+ movl %eax,-8(%edi)
+ movl %edx,-4(%edi)
+
+ leal 32(%esi),%esi
+ leal 32(%ebp),%ebp
+ decl %ecx
+ jnz L(oop)
+
+ popl %edx
+FRAME_popl()
+L(end):
+ decl %edx C test %edx w/o clobbering carry
+ js L(end2)
+ incl %edx
+L(oop2):
+ leal 4(%edi),%edi
+ movl (%esi),%eax
+ M4_inst %ebx,%eax
+ movl 4(%ebp),%ebx
+ movl %eax,-4(%edi)
+ leal 4(%esi),%esi
+ leal 4(%ebp),%ebp
+ decl %edx
+ jnz L(oop2)
+L(end2):
+ movl (%esi),%eax
+ M4_inst %ebx,%eax
+ movl %eax,(%edi)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/aorsmul_1.asm b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm
new file mode 100644
index 0000000000..147b55610f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm
@@ -0,0 +1,99 @@
+dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication.
+dnl
+dnl P5: 14.0 cycles/limb
+
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA. */
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(M4_function_1)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST, %edi
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ leal (%edi,%ecx,4), %edi
+ leal (%esi,%ecx,4), %esi
+ negl %ecx
+ xorl %ebx, %ebx
+ ALIGN(8)
+
+L(oop): adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl (%edi,%ecx,4), %ebx
+
+ adcl $0, %edx
+ M4_inst %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(oop)
+
+ adcl $0, %ebx
+ movl %ebx, %eax
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/diveby3.asm b/rts/gmp/mpn/x86/pentium/diveby3.asm
new file mode 100644
index 0000000000..dbac81642f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/diveby3.asm
@@ -0,0 +1,183 @@
+dnl Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl P5: 15.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+dnl ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32
+deflit(ONE_THIRD_CEIL, 0x55555556)
+deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB)
+deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+ movl PARAM_SRC, %ecx
+ movl PARAM_SIZE, %edx
+
+ decl %edx
+ jnz L(two_or_more)
+
+ movl (%ecx), %edx
+ movl PARAM_CARRY, %eax C risk of cache bank clash here
+
+ movl PARAM_DST, %ecx
+ subl %eax, %edx
+
+ sbbl %eax, %eax C 0 or -1
+
+ imull $INVERSE_3, %edx, %edx
+
+ negl %eax C 0 or 1
+ cmpl $ONE_THIRD_CEIL, %edx
+
+ sbbl $-1, %eax C +1 if edx>=ceil(b/3)
+ cmpl $TWO_THIRDS_CEIL, %edx
+
+ sbbl $-1, %eax C +1 if edx>=ceil(b*2/3)
+ movl %edx, (%ecx)
+
+ ret
+
+
+L(two_or_more):
+ C eax
+ C ebx
+ C ecx src
+ C edx size-1
+ C esi
+ C edi
+ C ebp
+
+ pushl %ebx FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ pushl %edi FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ movl PARAM_CARRY, %esi
+
+ movl (%ecx), %eax C src low limb
+ xorl %ebx, %ebx
+
+ sub %esi, %eax
+ movl $TWO_THIRDS_FLOOR, %esi
+
+ leal (%ecx,%edx,4), %ecx C &src[size-1]
+ leal (%edi,%edx,4), %edi C &dst[size-1]
+
+ adcl $0, %ebx C carry, 0 or 1
+ negl %edx C -(size-1)
+
+
+C The loop needs a source limb ready at the top, which leads to one limb
+C handled separately at the end, and the special case above for size==1.
+C There doesn't seem to be any scheduling that would keep the speed but move
+C the source load and carry subtract up to the top.
+C
+C The destination cache line prefetching adds 1 cycle to the loop but is
+C considered worthwhile. The slowdown is a factor of 1.07, but will prevent
+C repeated write-throughs if the destination isn't in L1. A version using
+C an outer loop to prefetch only every 8 limbs (a cache line) proved to be
+C no faster, due to unavoidable branch mispredictions in the inner loop.
+C
+C setc is 2 cycles on P54, so an adcl is used instead. If the movl $0,%ebx
+C could be avoided then the src limb fetch could pair up and save a cycle.
+C This would probably mean going to a two limb loop with the carry limb
+C alternately positive or negative, since an sbbl %ebx,%ebx will leave a
+C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax.
+C
+C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as
+C "cmpl %edx, $n" with the immediate as the second operand.
+C
+C The "4" source displacement is in the loop rather than the setup because
+C this gets L(top) aligned to 8 bytes at no cost.
+
+ ALIGN(8)
+L(top):
+ C eax source limb, carry subtracted
+ C ebx carry (0 or 1)
+ C ecx &src[size-1]
+ C edx counter, limbs, negative
+ C esi TWO_THIRDS_FLOOR
+ C edi &dst[size-1]
+ C ebp scratch (result limb)
+
+ imull $INVERSE_3, %eax, %ebp
+
+ cmpl $ONE_THIRD_CEIL, %ebp
+ movl (%edi,%edx,4), %eax C dst cache line prefetch
+
+ sbbl $-1, %ebx C +1 if ebp>=ceil(b/3)
+ cmpl %ebp, %esi
+
+ movl 4(%ecx,%edx,4), %eax C next src limb
+
+ sbbl %ebx, %eax C and further -1 if ebp>=ceil(b*2/3)
+ movl $0, %ebx
+
+ adcl $0, %ebx C new carry
+ movl %ebp, (%edi,%edx,4)
+
+ incl %edx
+ jnz L(top)
+
+
+
+ imull $INVERSE_3, %eax, %edx
+
+ cmpl $ONE_THIRD_CEIL, %edx
+ movl %edx, (%edi)
+
+ sbbl $-1, %ebx C +1 if edx>=ceil(b/3)
+ cmpl $TWO_THIRDS_CEIL, %edx
+
+ sbbl $-1, %ebx C +1 if edx>=ceil(b*2/3)
+ popl %ebp
+
+ movl %ebx, %eax
+ popl %edi
+
+ popl %esi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/gmp-mparam.h
new file mode 100644
index 0000000000..d3ed3d73ce
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 9 /* cycles */
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 41 /* cycles */
+#endif
+
+/* bsf takes 18-42 cycles, put an average for uniform random numbers */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 14
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 179
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 22
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 153
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 46
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 110
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 25
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 496, 928, 1920, 4608, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 512
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 3840
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 496, 1184, 1920, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 512
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 3840
+#endif
diff --git a/rts/gmp/mpn/x86/pentium/lshift.asm b/rts/gmp/mpn/x86/pentium/lshift.asm
new file mode 100644
index 0000000000..e1e35d4c57
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/lshift.asm
@@ -0,0 +1,236 @@
+dnl Intel Pentium mpn_lshift -- mpn left shift.
+dnl
+dnl cycles/limb
+dnl P5,P54: 6.0
+dnl P55: 5.375
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ebp
+ movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%esi),%eax
+ cmpl %edi,%eax
+ jnc L(special) C jump if s_ptr + 1 >= res_ptr
+ leal (%esi,%ebp,4),%eax
+ cmpl %eax,%edi
+ jnc L(special) C jump if res_ptr >= s_ptr + size
+
+L(normal):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+ xorl %eax,%eax
+ shldl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ shldl( %cl, %ebx, %edx)
+ shldl( %cl, %eax, %ebx)
+ movl %edx,-8(%edi)
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ shldl( %cl, %edx, %eax)
+ shldl( %cl, %ebx, %edx)
+ movl %eax,-16(%edi)
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,-24(%edi)
+ movl %eax,-28(%edi)
+
+ subl $32,%esi
+ subl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shldl( %cl,%eax,%edx)
+ movl %edx,(%edi)
+ movl %eax,%edx
+ subl $4,%esi
+ subl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shll %cl,%edx C compute least significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissible if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+ movl (%esi),%edx
+ addl $4,%esi
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+
+ addl %edx,%edx
+ incl %ebp
+ decl %ebp
+ jz L(Lend)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(Loop):
+ movl 28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ adcl %eax,%eax
+ movl %ebx,(%edi)
+ adcl %edx,%edx
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebx
+ movl 12(%esi),%eax
+ adcl %ebx,%ebx
+ movl %edx,8(%edi)
+ adcl %eax,%eax
+ movl %ebx,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ adcl %edx,%edx
+ movl %eax,16(%edi)
+ adcl %ebx,%ebx
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ adcl %eax,%eax
+ movl %ebx,24(%edi)
+ adcl %edx,%edx
+ movl %eax,28(%edi)
+
+ leal 32(%esi),%esi C use leal not to clobber carry
+ leal 32(%edi),%edi
+ decl %ebp
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebp
+ sbbl %eax,%eax C save carry in %eax
+ andl $7,%ebp
+ jz L(Lend2)
+ addl %eax,%eax C restore carry from eax
+L(Loop2):
+ movl %edx,%ebx
+ movl (%esi),%edx
+ adcl %edx,%edx
+ movl %ebx,(%edi)
+
+ leal 4(%esi),%esi C use leal not to clobber carry
+ leal 4(%edi),%edi
+ decl %ebp
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax C restore carry from eax
+L(L1): movl %edx,(%edi) C store last limb
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
new file mode 100644
index 0000000000..2379077d0c
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 9 /* cycles */
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 41 /* cycles */
+#endif
+
+/* bsf takes 18-42 cycles, put an average for uniform random numbers */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 14
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 99
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 22
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 89
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 40
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 25
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 496, 1056, 1920, 4608, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 512
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 3840
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 496, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 512
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 4352
+#endif
diff --git a/rts/gmp/mpn/x86/pentium/mmx/lshift.asm b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm
new file mode 100644
index 0000000000..2225438658
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm
@@ -0,0 +1,455 @@
+dnl Intel P5 mpn_lshift -- mpn left shift.
+dnl
+dnl P5: 1.75 cycles/limb.
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right. Return the bits shifted out at the
+C left.
+C
+C The comments in mpn_rshift apply here too.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl minimum 5, because the unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_lshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ movl -4(%ebx,%eax,4), %edi C src high limb
+ decl %eax
+
+ jnz L(simple)
+
+ shldl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shll %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx,%eax,4), %mm5 C src high limb
+
+ movd %ecx, %mm6 C lshift
+ negl %ecx
+
+ psllq %mm6, %mm5
+ addl $32, %ecx
+
+ movd %ecx, %mm7
+ psrlq $32, %mm5 C retval
+
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ C
+
+ movd %mm0, 4(%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+
+ movd %mm5, %eax
+ psllq %mm6, %mm0
+
+ popl %edi
+ popl %ebx
+
+ movd %mm0, (%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd -4(%ebx,%eax,4), %mm5 C src high limb
+ leal (%ebx,%eax,4), %edi
+
+ movd %ecx, %mm6 C lshift
+ andl $4, %edi
+
+ psllq %mm6, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process high limb separately (marked xxx) to
+ C make it so.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | xxx | |
+ C +-------+-------+--
+
+ movq -8(%ebx,%eax,4), %mm0 C unaligned load
+
+ psllq %mm6, %mm0
+ decl %eax
+
+ psrlq $32, %mm0
+
+ C
+
+ movd %mm0, (%edx,%eax,4)
+L(start_src_aligned):
+
+ movq -8(%ebx,%eax,4), %mm1 C src high qword
+ leal (%edx,%eax,4), %edi
+
+ andl $4, %edi
+ psrlq $32, %mm5 C return value
+
+ movq -16(%ebx,%eax,4), %mm3 C src second highest qword
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, subtract 4 to make it so, and pretend the shift
+ C is 32 bits extra. High limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | mm1 |
+ C +-------+-------+--
+ C 0mod8 4mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | xxx | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psllq %mm6, %mm0
+
+ movd %ecx, %mm6
+ psrlq $32, %mm0
+
+ C wasted cycle here waiting for %mm0
+
+ movd %mm0, -4(%edx,%eax,4)
+ subl $4, %edx
+L(start_dst_aligned):
+
+
+ psllq %mm6, %mm1
+ negl %ecx C -shift
+
+ addl $64, %ecx C 64-shift
+ movq %mm3, %mm2
+
+ movd %ecx, %mm7
+ subl $8, %eax C size-8
+
+ psrlq %mm7, %mm3
+
+ por %mm1, %mm3 C mm3 ready to store
+ jc L(finish)
+
+
+ C The comments in mpn_rshift apply here too.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from 48(%ebx,%eax,4)
+ C mm3 dst qword ready to store to 56(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 lshift
+ C mm7 rshift
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq (%ebx,%eax,4), %mm3 C
+ psllq %mm6, %mm1 C
+
+ movq %mm0, 16(%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psrlq %mm7, %mm3 C
+ subl $4, %eax
+
+ por %mm1, %mm3 C
+ jnc L(unroll_loop)
+
+
+
+L(finish):
+ C eax -4 to -1 representing respectively 0 to 3 limbs remaining
+
+ testb $2, %al
+
+ jz L(finish_no_two)
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ subl $2, %eax
+L(finish_no_two):
+
+
+ C eax -4 or -3 representing respectively 0 or 1 limbs remaining
+ C
+ C mm2 src prev qword, from 48(%ebx,%eax,4)
+ C mm3 dst qword, for 56(%edx,%eax,4)
+
+ testb $1, %al
+ movd %mm5, %eax C retval
+
+ popl %edi
+ jz L(finish_zero)
+
+
+ C One extra src limb, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4 edx
+ C --+---------------+---------------+-------+
+ C | mm3 | | |
+ C --+---------------+---------------+-------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra src limb, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 4(%edx), and in the aligned case
+ C there's an extra limb of dst to be formed from that extra src limb
+ C left shifted.
+
+
+ movd (%ebx), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm3, 12(%edx)
+ psllq $32, %mm0
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm2, %mm0
+ psllq %mm6, %mm1
+
+ movq %mm0, 4(%edx)
+ psrlq $32, %mm1
+
+ andl $32, %ecx
+ popl %ebx
+
+ jz L(finish_one_unaligned)
+
+ movd %mm1, (%edx)
+L(finish_one_unaligned):
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra src limbs, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra src limbs, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx+4
+ C --+---------------+-------+
+ C | mm3 | |
+ C --+---------------+-------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C The movd for the unaligned case writes the same data to 4(%edx)
+ C that the movq does for the aligned case.
+
+
+ movq %mm3, 8(%edx)
+ andl $32, %ecx
+
+ psllq %mm6, %mm2
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, (%edx)
+L(finish_zero_unaligned):
+
+ psrlq $32, %mm2
+ popl %ebx
+
+ movd %mm5, %eax C retval
+
+ movd %mm2, 4(%edx)
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mmx/popham.asm b/rts/gmp/mpn/x86/pentium/mmx/popham.asm
new file mode 100644
index 0000000000..587a07ab3d
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/popham.asm
@@ -0,0 +1,30 @@
+dnl Intel P55 mpn_popcount, mpn_hamdist -- population count and hamming
+dnl distance.
+dnl
+dnl P55: popcount 11.5 cycles/limb, hamdist 12.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/pentium/mmx/rshift.asm b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm
new file mode 100644
index 0000000000..7672630d57
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm
@@ -0,0 +1,460 @@
+dnl Intel P5 mpn_rshift -- mpn right shift.
+dnl
+dnl P5: 1.75 cycles/limb.
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left. Return the bits shifted out at the
+C right.
+C
+C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
+C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
+C
+C Full speed depends on source and destination being aligned. Unaligned mmx
+C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy
+C setups and finish-ups are done to ensure alignment for the loop.
+C
+C MMX shifts work out a bit faster even for the simple loop.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl Minimum 5, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_rshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ decl %eax
+ movl (%ebx), %edi C src low limb
+
+ jnz L(simple)
+
+ shrdl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shrl %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ leal (%ebx,%eax,4), %ebx C &src[size-1]
+
+ movd %ecx, %mm6 C rshift
+ leal -4(%edx,%eax,4), %edx C &dst[size-2]
+
+ psllq $32, %mm5
+ negl %eax
+
+
+C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
+C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4
+C cycles and would be 8 in a simple loop. Using mmx helps the return value
+C and last limb calculations too.
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx &src[size-1]
+ C ecx return value
+ C edx &dst[size-2]
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+
+ movq (%ebx,%eax,4), %mm0
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+ movd %mm0, (%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+ psrlq %mm6, %mm5 C return value
+
+ psrlq %mm6, %mm0
+ popl %edi
+
+ movd %mm5, %eax
+ popl %ebx
+
+ movd %mm0, 4(%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ movl $4, %edi
+
+ movd %ecx, %mm6 C rshift
+ testl %edi, %ebx
+
+ psllq $32, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process low limb separately (marked xxx) and
+ C step src and dst by one limb, making src aligned.
+ C
+ C source ebx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+ C
+ C dest edx
+ C --+-------+-------+
+ C | | xxx |
+ C --+-------+-------+
+
+ movq (%ebx), %mm0 C unaligned load
+
+ psrlq %mm6, %mm0
+ addl $4, %ebx
+
+ decl %eax
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_src_aligned):
+
+
+ movq (%ebx), %mm1
+ testl %edi, %edx
+
+ psrlq %mm6, %mm5 C retval
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, add 4 to make it so, and pretend the shift is
+ C 32 bits extra. Low limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source ebx
+ C --+-------+-------+
+ C | mm1 |
+ C --+-------+-------+
+ C 4mod8 0mod8
+ C
+ C dest edx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psrlq %mm6, %mm0
+
+ movd %ecx, %mm6
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_dst_aligned):
+
+
+ movq 8(%ebx), %mm3
+ negl %ecx
+
+ movq %mm3, %mm2 C mm2 src qword
+ addl $64, %ecx
+
+ movd %ecx, %mm7
+ psrlq %mm6, %mm1
+
+ leal -12(%ebx,%eax,4), %ebx
+ leal -20(%edx,%eax,4), %edx
+
+ psllq %mm7, %mm3
+ subl $7, %eax C size-7
+
+ por %mm1, %mm3 C mm3 ready to store
+ negl %eax C -(size-7)
+
+ jns L(finish)
+
+
+ C This loop is the important bit, the rest is just support. Careful
+ C instruction scheduling achieves the claimed 1.75 c/l. The
+ C relevant parts of the pairing rules are:
+ C
+ C - mmx loads and stores execute only in the U pipe
+ C - only one mmx shift in a pair
+ C - wait one cycle before storing an mmx register result
+ C - the usual address generation interlock
+ C
+ C Two qword calculations are slightly interleaved. The instructions
+ C marked "C" belong to the second qword, and the "C prev" one is for
+ C the second qword from the previous iteration.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs, negative
+ C ebx &src[size-12]
+ C ecx
+ C edx &dst[size-12]
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from -8(%ebx,%eax,4)
+ C mm3 dst qword ready to store to -8(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 rshift
+ C mm7 lshift
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq 8(%ebx,%eax,4), %mm3 C
+ psrlq %mm6, %mm1 C
+
+ movq %mm0, (%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psllq %mm7, %mm3 C
+ addl $4, %eax
+
+ por %mm1, %mm3 C
+ js L(unroll_loop)
+
+
+L(finish):
+ C eax 0 to 3 representing respectively 3 to 0 limbs remaining
+
+ testb $2, %al
+
+ jnz L(finish_no_two)
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ addl $2, %eax
+L(finish_no_two):
+
+
+ C eax 2 or 3 representing respectively 1 or 0 limbs remaining
+ C
+ C mm2 src prev qword, from -8(%ebx,%eax,4)
+ C mm3 dst qword, for -8(%edx,%eax,4)
+
+ testb $1, %al
+ popl %edi
+
+ movd %mm5, %eax C retval
+ jnz L(finish_zero)
+
+
+ C One extra limb, destination was aligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +-------+---------------+---------------+--
+ C | | | mm3 |
+ C +-------+---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra limb, destination was unaligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 8(%edx), and in the aligned case
+ C there's a further extra limb of dst to be formed.
+
+
+ movd 8(%ebx), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, (%edx)
+ por %mm2, %mm0
+
+ psrlq %mm6, %mm1
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_one_unaligned)
+
+ C dst was aligned, must store one extra limb
+ movd %mm1, 16(%edx)
+L(finish_one_unaligned):
+
+ movq %mm0, 8(%edx)
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra limbs, destination was aligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra limbs, destination was unaligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +-------+---------------+--
+ C | | mm3 |
+ C +-------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = 64-(shift+32)
+
+
+ C The movd for the unaligned case is clearly the same data as the
+ C movq for the aligned case, it's just a choice between whether one
+ C or two limbs should be written.
+
+
+ movq %mm3, 4(%edx)
+ psrlq %mm6, %mm2
+
+ movd %mm2, 12(%edx)
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, 12(%edx)
+L(finish_zero_unaligned):
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mul_1.asm b/rts/gmp/mpn/x86/pentium/mul_1.asm
new file mode 100644
index 0000000000..08639eca09
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mul_1.asm
@@ -0,0 +1,79 @@
+dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication.
+dnl
+dnl P5: 13.0 cycles/limb
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA. */
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST, %edi
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ leal (%edi,%ecx,4), %edi
+ leal (%esi,%ecx,4), %esi
+ negl %ecx
+ xorl %ebx, %ebx
+ ALIGN(8)
+
+L(oop): adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(oop)
+
+ adcl $0, %ebx
+ movl %ebx, %eax
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mul_basecase.asm b/rts/gmp/mpn/x86/pentium/mul_basecase.asm
new file mode 100644
index 0000000000..d9f79a0831
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mul_basecase.asm
@@ -0,0 +1,135 @@
+dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
+dnl
+dnl P5: 14.2 cycles/crossproduct (approx)
+
+
+dnl Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+
+defframe(PARAM_YSIZE, 20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE, 12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+defframe(VAR_COUNTER, -4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_mul_basecase)
+
+ pushl %eax C dummy push for allocating stack slot
+ pushl %esi
+ pushl %ebp
+ pushl %edi
+deflit(`FRAME',16)
+
+ movl PARAM_XP,%esi
+ movl PARAM_WP,%edi
+ movl PARAM_YP,%ebp
+
+ movl (%esi),%eax C load xp[0]
+ mull (%ebp) C multiply by yp[0]
+ movl %eax,(%edi) C store to wp[0]
+ movl PARAM_XSIZE,%ecx C xsize
+ decl %ecx C If xsize = 1, ysize = 1 too
+ jz L(done)
+
+ movl PARAM_XSIZE,%eax
+ pushl %ebx
+FRAME_pushl()
+ movl %edx,%ebx
+ leal (%esi,%eax,4),%esi C make xp point at end
+ leal (%edi,%eax,4),%edi C offset wp by xsize
+ negl %ecx C negate j size/index for inner loop
+ xorl %eax,%eax C clear carry
+
+ ALIGN(8)
+L(oop1): adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax C load next limb at xp[j]
+ mull (%ebp)
+ addl %ebx,%eax
+ movl %eax,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop1)
+
+ adcl $0,%ebx
+ movl PARAM_YSIZE,%eax
+ movl %ebx,(%edi) C most significant limb of product
+ addl $4,%edi C increment wp
+ decl %eax
+ jz L(skip)
+ movl %eax,VAR_COUNTER C set index i to ysize
+
+L(outer):
+ addl $4,%ebp C make ebp point to next y limb
+ movl PARAM_XSIZE,%ecx
+ negl %ecx
+ xorl %ebx,%ebx
+
+ C code at 0x61 here, close enough to aligned
+L(oop2):
+ adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax
+ mull (%ebp)
+ addl %ebx,%eax
+ movl (%edi,%ecx,4),%ebx
+ adcl $0,%edx
+ addl %eax,%ebx
+ movl %ebx,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop2)
+
+ adcl $0,%ebx
+
+ movl %ebx,(%edi)
+ addl $4,%edi
+ movl VAR_COUNTER,%eax
+ decl %eax
+ movl %eax,VAR_COUNTER
+ jnz L(outer)
+
+L(skip):
+ popl %ebx
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $4,%esp
+ ret
+
+L(done):
+ movl %edx,4(%edi) C store to wp[1]
+ popl %edi
+ popl %ebp
+ popl %esi
+ popl %eax C dummy pop for deallocating stack slot
+ ret
+
+EPILOGUE()
+
diff --git a/rts/gmp/mpn/x86/pentium/rshift.asm b/rts/gmp/mpn/x86/pentium/rshift.asm
new file mode 100644
index 0000000000..e8f5ae8ec8
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/rshift.asm
@@ -0,0 +1,236 @@
+dnl Intel Pentium mpn_rshift -- mpn right shift.
+dnl
+dnl cycles/limb
+dnl P5,P54: 6.0
+dnl P55: 5.375
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ebp
+ movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%edi),%eax
+ cmpl %esi,%eax
+ jnc L(special) C jump if res_ptr + 1 >= s_ptr
+ leal (%edi,%ebp,4),%eax
+ cmpl %eax,%esi
+ jnc L(special) C jump if s_ptr >= res_ptr + size
+
+L(normal):
+ movl (%esi),%edx
+ addl $4,%esi
+ xorl %eax,%eax
+ shrdl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl 28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebx
+ movl 12(%esi),%eax
+ shrdl( %cl, %ebx, %edx)
+ shrdl( %cl, %eax, %ebx)
+ movl %edx,8(%edi)
+ movl %ebx,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ shrdl( %cl, %edx, %eax)
+ shrdl( %cl, %ebx, %edx)
+ movl %eax,16(%edi)
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,24(%edi)
+ movl %eax,28(%edi)
+
+ addl $32,%esi
+ addl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shrdl( %cl,%eax,%edx) C compute result limb
+ movl %edx,(%edi)
+ movl %eax,%edx
+ addl $4,%esi
+ addl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shrl %cl,%edx C compute most significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissable if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+
+ shrl %edx
+ incl %ebp
+ decl %ebp
+ jz L(Lend)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(Loop):
+ movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ rcrl %eax
+ movl %ebx,(%edi)
+ rcrl %edx
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ rcrl %ebx
+ movl %edx,-8(%edi)
+ rcrl %eax
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ rcrl %edx
+ movl %eax,-16(%edi)
+ rcrl %ebx
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ rcrl %eax
+ movl %ebx,-24(%edi)
+ rcrl %edx
+ movl %eax,-28(%edi)
+
+ leal -32(%esi),%esi C use leal not to clobber carry
+ leal -32(%edi),%edi
+ decl %ebp
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebp
+ sbbl %eax,%eax C save carry in %eax
+ andl $7,%ebp
+ jz L(Lend2)
+ addl %eax,%eax C restore carry from eax
+L(Loop2):
+ movl %edx,%ebx
+ movl (%esi),%edx
+ rcrl %edx
+ movl %ebx,(%edi)
+
+ leal -4(%esi),%esi C use leal not to clobber carry
+ leal -4(%edi),%edi
+ decl %ebp
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax C restore carry from eax
+L(L1): movl %edx,(%edi) C store last limb
+
+ movl $0,%eax
+ rcrl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/sqr_basecase.asm b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm
new file mode 100644
index 0000000000..c8584df13c
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm
@@ -0,0 +1,520 @@
+dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
+dnl product at around 20x20 limbs.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Calculate src,size squared, storing the result in dst,2*size.
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	.text
+	ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx	C size in limbs
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %edx		C dispatch on operand size
+	movl	PARAM_DST, %ecx
+
+	je	L(two_limbs)
+
+	movl	(%eax), %eax		C fetch src[0]
+	ja	L(three_or_more)
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	mull	%eax			C src[0]^2
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	ret
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebp
+	pushl	%edi
+
+	pushl	%esi
+	pushl	%ebx
+
+	movl	%eax, %ebx		C save src pointer
+	movl	(%eax), %eax
+
+	mull	%eax		C src[0]^2
+
+	movl	%eax, (%ecx)	C dst[0]
+	movl	%edx, %esi	C dst[1]
+
+	movl	4(%ebx), %eax
+
+	mull	%eax		C src[1]^2
+
+	movl	%eax, %edi	C dst[2]
+	movl	%edx, %ebp	C dst[3]
+
+	movl	(%ebx), %eax
+
+	mull	4(%ebx)		C src[0]*src[1]
+
+	addl	%eax, %esi	C add cross product once
+	popl	%ebx
+
+	adcl	%edx, %edi
+
+	adcl	$0, %ebp
+	addl	%esi, %eax	C and a second time, giving 2*src[0]*src[1]
+
+	adcl	%edi, %edx
+	movl	%eax, 4(%ecx)
+
+	adcl	$0, %ebp
+	popl	%esi
+
+	movl	%edx, 8(%ecx)
+	movl	%ebp, 12(%ecx)
+
+	popl	%edi
+	popl	%ebp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(three_or_more):
+	C eax	src low limb
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	cmpl	$4, %edx
+	pushl	%ebx
+deflit(`FRAME',4)
+
+	movl	PARAM_SRC, %ebx
+	jae	L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+	C eax	src low limb
+	C ebx	src
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebp
+	pushl	%edi
+
+	mull	%eax		C src[0] ^ 2
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	movl	4(%ebx), %eax
+	xorl	%ebp, %ebp
+
+	mull	%eax		C src[1] ^ 2
+
+	movl	%eax, 8(%ecx)
+	movl	%edx, 12(%ecx)
+
+	movl	8(%ebx), %eax
+	pushl	%esi		C risk of cache bank clash
+
+	mull	%eax		C src[2] ^ 2
+
+	movl	%eax, 16(%ecx)
+	movl	%edx, 20(%ecx)
+
+	movl	(%ebx), %eax
+
+	mull	4(%ebx)		C src[0] * src[1]
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	movl	(%ebx), %eax
+
+	mull	8(%ebx)		C src[0] * src[2]
+
+	addl	%eax, %edi
+	movl	%edx, %ebp
+
+	adcl	$0, %ebp
+	movl	4(%ebx), %eax
+
+	mull	8(%ebx)		C src[1] * src[2]
+
+	xorl	%ebx, %ebx	C zero for dst[5]
+	addl	%eax, %ebp
+
+	C eax
+	C ebx	zero, will be dst[5]
+	C ecx	dst
+	C edx	dst[4]
+	C esi	dst[1]
+	C edi	dst[2]
+	C ebp	dst[3]
+
+	adcl	$0, %edx
+	addl	%esi, %esi	C double the cross products
+
+	adcl	%edi, %edi
+
+	adcl	%ebp, %ebp
+
+	adcl	%edx, %edx
+	movl	4(%ecx), %eax
+
+	adcl	$0, %ebx
+	addl	%esi, %eax	C add doubled terms onto stored squares
+
+	movl	%eax, 4(%ecx)
+	movl	8(%ecx), %eax
+
+	adcl	%edi, %eax
+	movl	12(%ecx), %esi
+
+	adcl	%ebp, %esi
+	movl	16(%ecx), %edi
+
+	movl	%eax, 8(%ecx)
+	movl	%esi, 12(%ecx)
+
+	adcl	%edx, %edi
+	popl	%esi
+
+	movl	20(%ecx), %eax
+	movl	%edi, 16(%ecx)
+
+	popl	%edi
+	popl	%ebp
+
+	adcl	%ebx, %eax	C no carry out of this
+	popl	%ebx
+
+	movl	%eax, 20(%ecx)
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(four_or_more):
+	C eax	src low limb
+	C ebx	src
+	C ecx	dst
+	C edx	size
+	C esi
+	C edi
+	C ebp
+	C
+	C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+deflit(`FRAME',4)
+
+	pushl	%edi
+FRAME_pushl()
+	pushl	%esi
+FRAME_pushl()
+
+	pushl	%ebp
+FRAME_pushl()
+	leal	(%ecx,%edx,4), %edi	C dst end of this mul1
+
+	leal	(%ebx,%edx,4), %esi	C src end
+	movl	%ebx, %ebp		C src
+
+	negl	%edx			C -size
+	xorl	%ebx, %ebx		C clear carry limb and carry flag
+
+	leal	1(%edx), %ecx		C -(size-1)
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	src
+
+	adcl	$0, %ebx		C fold carry flag into carry limb
+	movl	(%esi,%ecx,4), %eax
+
+	mull	(%ebp)
+
+	addl	%eax, %ebx
+
+	movl	%ebx, (%edi,%ecx,4)
+	incl	%ecx
+
+	movl	%edx, %ebx		C high limb is the next carry
+	jnz	L(mul1)
+
+
+	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
+	C n=1..size-2.
+	C
+	C The last two products, which are the end corner of the product
+	C triangle, are handled separately to save looping overhead.  These
+	C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
+	C If size is 4 then it's only these that need to be done.
+	C
+	C In the outer loop %esi is a constant, and %edi just advances by 1
+	C limb each time.  The size of the operation decreases by 1 limb
+	C each time.
+
+	C eax
+	C ebx	carry (needing carry flag added)
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	adcl	$0, %ebx
+	movl	PARAM_SIZE, %edx
+
+	movl	%ebx, (%edi)		C store final carry of mul1
+	subl	$4, %edx
+
+	negl	%edx
+	jz	L(corner)
+
+
+L(outer):
+	C ebx	previous carry limb to store
+	C edx	outer loop counter (negative)
+	C esi	&src[size]
+	C edi	dst, pointing at stored carry limb of previous loop
+
+	pushl	%edx			C new outer loop counter
+	leal	-2(%edx), %ecx		C inner loop counter
+
+	movl	%ebx, (%edi)
+	addl	$4, %edi
+
+	addl	$4, %ebp		C advance to next src[n]
+	xorl	%ebx, %ebx		C initial carry limb, clear carry flag
+
+L(inner):
+	C eax	scratch
+	C ebx	carry (needing carry flag added)
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	dst end of this addmul
+	C ebp	&src[j]
+
+	adcl	$0, %ebx
+	movl	(%esi,%ecx,4), %eax
+
+	mull	(%ebp)
+
+	addl	%ebx, %eax
+	movl	(%edi,%ecx,4), %ebx
+
+	adcl	$0, %edx
+	addl	%eax, %ebx
+
+	movl	%ebx, (%edi,%ecx,4)
+	incl	%ecx
+
+	movl	%edx, %ebx		C high limb is the next carry
+	jnz	L(inner)
+
+
+	adcl	$0, %ebx
+	popl	%edx			C outer loop counter
+
+	incl	%edx
+	jnz	L(outer)
+
+
+	movl	%ebx, (%edi)
+
+L(corner):
+	C esi	&src[size]
+	C edi	&dst[2*size-4]
+
+	movl	-8(%esi), %eax
+	movl	-4(%edi), %ebx	C risk of data cache bank clash here
+
+	mull	-12(%esi)	C src[size-2]*src[size-3]
+
+	addl	%eax, %ebx
+	movl	%edx, %ecx
+
+	adcl	$0, %ecx
+	movl	-4(%esi), %eax
+
+	mull	-12(%esi)	C src[size-1]*src[size-3]
+
+	addl	%ecx, %eax
+	movl	(%edi), %ecx
+
+	adcl	$0, %edx
+	movl	%ebx, -4(%edi)
+
+	addl	%eax, %ecx
+	movl	%edx, %ebx
+
+	adcl	$0, %ebx
+	movl	-4(%esi), %eax
+
+	mull	-8(%esi)	C src[size-1]*src[size-2]
+
+	movl	%ecx, 0(%edi)
+	addl	%eax, %ebx
+
+	adcl	$0, %edx
+	movl	PARAM_SIZE, %eax
+
+	negl	%eax
+	movl	%ebx, 4(%edi)
+
+	addl	$1, %eax	C -(size-1) and clear carry
+	movl	%edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift):
+	C eax	counter, negative
+	C ebx	next limb
+	C ecx
+	C edx
+	C esi
+	C edi	&dst[2*size-4]
+	C ebp
+
+	movl	12(%edi,%eax,8), %ebx
+
+	rcll	%ebx			C shift left, carry propagating upward
+	movl	16(%edi,%eax,8), %ecx
+
+	rcll	%ecx
+	movl	%ebx, 12(%edi,%eax,8)
+
+	movl	%ecx, 16(%edi,%eax,8)
+	incl	%eax
+
+	jnz	L(lshift)
+
+
+	adcl	%eax, %eax		C high bit out
+	movl	PARAM_SRC, %esi
+
+	movl	PARAM_SIZE, %ecx	C risk of cache bank clash
+	movl	%eax, 12(%edi)		C dst most significant limb
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set yet, and just gets the
+C low limb of src[0]^2.
+
+	movl	(%esi), %eax		C src[0]
+	leal	(%esi,%ecx,4), %esi	C src end
+
+	negl	%ecx
+
+	mull	%eax
+
+	movl	%eax, 16(%edi,%ecx,8)	C dst[0]
+	movl	%edx, %ebx
+
+	addl	$1, %ecx	C -(size-1) and clear carry
+
+L(diag):
+	C eax	scratch (low product)
+	C ebx	carry limb
+	C ecx	counter, negative
+	C edx	scratch (high product)
+	C esi	&src[size]
+	C edi	&dst[2*size-4]
+	C ebp	scratch (fetched dst limbs)
+
+	movl	(%esi,%ecx,4), %eax
+	adcl	$0, %ebx
+
+	mull	%eax
+
+	movl	16-4(%edi,%ecx,8), %ebp
+
+	addl	%ebp, %ebx
+	movl	16(%edi,%ecx,8), %ebp
+
+	adcl	%eax, %ebp
+	movl	%ebx, 16-4(%edi,%ecx,8)
+
+	movl	%ebp, 16(%edi,%ecx,8)
+	incl	%ecx
+
+	movl	%edx, %ebx		C high product carries into next pair
+	jnz	L(diag)
+
+
+	adcl	$0, %edx
+	movl	16-4(%edi), %eax	C dst most significant limb
+
+	addl	%eax, %edx
+	popl	%ebp
+
+	movl	%edx, 16-4(%edi)
+	popl	%esi		C risk of cache bank clash
+
+	popl	%edi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/rshift.asm b/rts/gmp/mpn/x86/rshift.asm
new file mode 100644
index 0000000000..c9881fd966
--- /dev/null
+++ b/rts/gmp/mpn/x86/rshift.asm
@@ -0,0 +1,92 @@
+dnl x86 mpn_rshift -- mpn right shift.
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	.text
+	ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+deflit(`FRAME',12)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%edx
+	movl	PARAM_SHIFT,%ecx
+
+	leal	-4(%edi,%edx,4),%edi	C &dst[size-1]
+	leal	(%esi,%edx,4),%esi	C &src[size]
+	negl	%edx			C -size, counts upward to zero
+
+	movl	(%esi,%edx,4),%ebx	C read least significant limb
+	xorl	%eax,%eax
+	shrdl(	%cl, %ebx, %eax)	C compute carry limb
+	incl	%edx
+	jz	L(end)			C one limb only
+	pushl	%eax			C push carry limb onto stack
+	testb	$1,%dl
+	jnz	L(1)			C enter loop in the middle
+	movl	%ebx,%eax
+
+	ALIGN(8)
+L(oop):	movl	(%esi,%edx,4),%ebx	C load next higher limb
+	shrdl(	%cl, %ebx, %eax)	C compute result limb
+	movl	%eax,(%edi,%edx,4)	C store it
+	incl	%edx
+L(1):	movl	(%esi,%edx,4),%eax	C same, with eax/ebx roles swapped
+	shrdl(	%cl, %eax, %ebx)
+	movl	%ebx,(%edi,%edx,4)
+	incl	%edx
+	jnz	L(oop)
+
+	shrl	%cl,%eax		C compute most significant limb
+	movl	%eax,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+L(end):	shrl	%cl,%ebx		C compute most significant limb
+	movl	%ebx,(%edi)		C store it
+
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/udiv.asm b/rts/gmp/mpn/x86/udiv.asm
new file mode 100644
index 0000000000..9fe022b107
--- /dev/null
+++ b/rts/gmp/mpn/x86/udiv.asm
@@ -0,0 +1,44 @@
+dnl x86 mpn_udiv_qrnnd -- 2 by 1 limb division
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low,
+C mp_limb_t divisor);
+
+defframe(PARAM_DIVISOR, 16)
+defframe(PARAM_LOW, 12)
+defframe(PARAM_HIGH, 8)
+defframe(PARAM_REMPTR, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_udiv_qrnnd)
+deflit(`FRAME',0)
+	movl	PARAM_LOW, %eax		C low dividend limb
+	movl	PARAM_HIGH, %edx	C high dividend limb
+	divl	PARAM_DIVISOR		C edx:eax / divisor (divl faults if quotient overflows a limb)
+	movl	PARAM_REMPTR, %ecx
+	movl	%edx, (%ecx)		C store remainder
+	ret				C quotient returned in %eax
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/umul.asm b/rts/gmp/mpn/x86/umul.asm
new file mode 100644
index 0000000000..3d289d1784
--- /dev/null
+++ b/rts/gmp/mpn/x86/umul.asm
@@ -0,0 +1,43 @@
+dnl mpn_umul_ppmm -- 1x1->2 limb multiplication
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
+
+defframe(PARAM_M2, 12)
+defframe(PARAM_M1, 8)
+defframe(PARAM_LOWPTR, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_umul_ppmm)
+deflit(`FRAME',0)
+	movl	PARAM_LOWPTR, %ecx
+	movl	PARAM_M1, %eax
+	mull	PARAM_M2		C edx:eax = m1 * m2
+	movl	%eax, (%ecx)		C store low limb at *lowptr
+	movl	%edx, %eax		C return high limb
+	ret
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/x86-defs.m4 b/rts/gmp/mpn/x86/x86-defs.m4
new file mode 100644
index 0000000000..2dad698002
--- /dev/null
+++ b/rts/gmp/mpn/x86/x86-defs.m4
@@ -0,0 +1,713 @@
+divert(-1)
+
+dnl m4 macros for x86 assembler.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Notes:
+dnl
+dnl m4 isn't perfect for processing BSD style x86 assembler code, the main
+dnl problems are,
+dnl
+dnl 1. Doing define(foo,123) and then using foo in an addressing mode like
+dnl foo(%ebx) expands as a macro rather than a constant. This is worked
+dnl around by using deflit() from asm-defs.m4, instead of define().
+dnl
+dnl 2. Immediates in macro definitions need a space or `' to stop the $
+dnl looking like a macro parameter. For example,
+dnl
+dnl define(foo, `mov $ 123, %eax')
+dnl
+dnl This is only a problem in macro definitions, not in ordinary text,
+dnl nor in macro parameters like text passed to forloop() or ifdef().
+
+
+deflit(BYTES_PER_MP_LIMB, 4)
+
+
+dnl --------------------------------------------------------------------------
+dnl Replacement PROLOGUE/EPILOGUE with more sophisticated error checking.
+dnl Nesting and overlapping not allowed.
+dnl
+
+
+dnl Usage: PROLOGUE(functionname)
+dnl
+dnl Generate a function prologue. functionname gets GSYM_PREFIX added.
+dnl Examples,
+dnl
+dnl PROLOGUE(mpn_add_n)
+dnl PROLOGUE(somefun)
+
+define(`PROLOGUE',
+m4_assert_numargs(1)
+m4_assert_defined(`PROLOGUE_cpu')
+`ifdef(`PROLOGUE_current_function',
+`m4_error(`PROLOGUE'(`PROLOGUE_current_function') needs an `EPILOGUE'() before `PROLOGUE'($1)
+)')dnl
+m4_file_seen()dnl
+define(`PROLOGUE_current_function',`$1')dnl
+PROLOGUE_cpu(GSYM_PREFIX`'$1)')
+
+
+dnl Usage: EPILOGUE()
+dnl
+dnl Notice the function name is passed to EPILOGUE_cpu(), letting it use $1
+dnl instead of the long PROLOGUE_current_function symbol.
+
+define(`EPILOGUE',
+m4_assert_numargs(0)
+m4_assert_defined(`EPILOGUE_cpu')
+`ifdef(`PROLOGUE_current_function',,
+`m4_error(`EPILOGUE'() with no `PROLOGUE'()
+)')dnl
+EPILOGUE_cpu(GSYM_PREFIX`'PROLOGUE_current_function)`'dnl
+undefine(`PROLOGUE_current_function')')
+
+m4wrap_prepend(
+`ifdef(`PROLOGUE_current_function',
+`m4_error(`EPILOGUE() for PROLOGUE('PROLOGUE_current_function`) never seen
+')')')
+
+
+dnl Usage: PROLOGUE_assert_inside()
+dnl
+dnl Use this unquoted on a line on its own at the start of a macro
+dnl definition to add some code to check the macro is only used inside a
+dnl PROLOGUE/EPILOGUE pair, and that hence PROLOGUE_current_function is
+dnl defined.
+
+define(PROLOGUE_assert_inside,
+m4_assert_numargs(0)
+``PROLOGUE_assert_inside_internal'(m4_doublequote($`'0))`dnl '')
+
+define(PROLOGUE_assert_inside_internal,
+m4_assert_numargs(1)
+`ifdef(`PROLOGUE_current_function',,
+`m4_error(`$1 used outside a PROLOGUE / EPILOGUE pair
+')')')
+
+
+dnl Usage: L(labelname)
+dnl LF(functionname,labelname)
+dnl
+dnl Generate a local label in the current or given function. For LF(),
+dnl functionname gets GSYM_PREFIX added, the same as with PROLOGUE().
+dnl
+dnl For example, in a function mpn_add_n (and with MPN_PREFIX __gmpn),
+dnl
+dnl L(bar) => L__gmpn_add_n__bar
+dnl LF(somefun,bar) => Lsomefun__bar
+dnl
+dnl The funtion name and label name get two underscores between them rather
+dnl than one to guard against clashing with a separate external symbol that
+dnl happened to be called functionname_labelname. (Though this would only
+dnl happen if the local label prefix is is empty.) Underscores are used so
+dnl the whole label will still be a valid C identifier and so can be easily
+dnl used in gdb.
+
+dnl LSYM_PREFIX can be L$, so defn() is used to prevent L expanding as the
+dnl L macro and making an infinite recursion.
+define(LF,
+m4_assert_numargs(2)
+m4_assert_defined(`LSYM_PREFIX')
+`defn(`LSYM_PREFIX')GSYM_PREFIX`'$1`'__$2')
+
+define(`L',
+m4_assert_numargs(1)
+PROLOGUE_assert_inside()
+`LF(PROLOGUE_current_function,`$1')')
+
+
+dnl Called: PROLOGUE_cpu(gsym)
+dnl EPILOGUE_cpu(gsym)
+
+define(PROLOGUE_cpu,
+m4_assert_numargs(1)
+ `GLOBL $1
+ TYPE($1,`function')
+$1:')
+
+define(EPILOGUE_cpu,
+m4_assert_numargs(1)
+` SIZE($1,.-$1)')
+
+
+
+dnl --------------------------------------------------------------------------
+dnl Various x86 macros.
+dnl
+
+
+dnl Usage: ALIGN_OFFSET(bytes,offset)
+dnl
+dnl Align to `offset' away from a multiple of `bytes'.
+dnl
+dnl This is useful for testing, for example align to something very strict
+dnl and see what effect offsets from it have, "ALIGN_OFFSET(256,32)".
+dnl
+dnl Generally you wouldn't execute across the padding, but it's done with
+dnl nop's so it'll work.
+
+define(ALIGN_OFFSET,
+m4_assert_numargs(2)
+`ALIGN($1)
+forloop(`i',1,$2,` nop
+')')
+
+
+dnl Usage: defframe(name,offset)
+dnl
+dnl Make a definition like the following with which to access a parameter
+dnl or variable on the stack.
+dnl
+dnl define(name,`FRAME+offset(%esp)')
+dnl
+dnl Actually m4_empty_if_zero(FRAME+offset) is used, which will save one
+dnl byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp).
+dnl Use define(`defframe_empty_if_zero_disabled',1) if for some reason the
+dnl zero offset is wanted.
+dnl
+dnl The new macro also gets a check that when it's used FRAME is actually
+dnl defined, and that the final %esp offset isn't negative, which would
+dnl mean an attempt to access something below the current %esp.
+dnl
+dnl deflit() is used rather than a plain define(), so the new macro won't
+dnl delete any following parenthesized expression. name(%edi) will come
+dnl out say as 16(%esp)(%edi). This isn't valid assembler and should
+dnl provoke an error, which is better than silently giving just 16(%esp).
+dnl
+dnl See README.family for more on the suggested way to access the stack
+dnl frame.
+
+define(defframe,
+m4_assert_numargs(2)
+`deflit(`$1',
+m4_assert_defined(`FRAME')
+`defframe_check_notbelow(`$1',$2,FRAME)dnl
+defframe_empty_if_zero(FRAME+($2))(%esp)')')
+
+dnl Called: defframe_empty_if_zero(expression)
+define(defframe_empty_if_zero,
+`ifelse(defframe_empty_if_zero_disabled,1,
+`eval($1)',
+`m4_empty_if_zero($1)')')
+
+dnl Called: defframe_check_notbelow(`name',offset,FRAME)
+define(defframe_check_notbelow,
+m4_assert_numargs(3)
+`ifelse(eval(($3)+($2)<0),1,
+`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes
+')')')
+
+
+dnl Usage: FRAME_pushl()
+dnl FRAME_popl()
+dnl FRAME_addl_esp(n)
+dnl FRAME_subl_esp(n)
+dnl
+dnl Adjust FRAME appropriately for a pushl or popl, or for an addl or subl
+dnl %esp of n bytes.
+dnl
+dnl Using these macros is completely optional. Sometimes it makes more
+dnl sense to put explicit deflit(`FRAME',N) forms, especially when there's
+dnl jumps and different sequences of FRAME values need to be used in
+dnl different places.
+
+define(FRAME_pushl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+4))')
+
+define(FRAME_popl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-4))')
+
+define(FRAME_addl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-($1)))')
+
+define(FRAME_subl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+($1)))')
+
+
+dnl Usage: defframe_pushl(name)
+dnl
+dnl Do a combination of a FRAME_pushl() and a defframe() to name the stack
+dnl location just pushed. This should come after a pushl instruction.
+dnl Putting it on the same line works and avoids lengthening the code. For
+dnl example,
+dnl
+dnl pushl %eax defframe_pushl(VAR_COUNTER)
+dnl
+dnl Notice the defframe() is done with an unquoted -FRAME thus giving its
+dnl current value without tracking future changes.
+
+define(defframe_pushl,
+`FRAME_pushl()defframe(`$1',-FRAME)')
+
+
+dnl --------------------------------------------------------------------------
+dnl Assembler instruction macros.
+dnl
+
+
+dnl Usage: emms_or_femms
+dnl femms_available_p
+dnl
+dnl femms_available_p expands to 1 or 0 according to whether the AMD 3DNow
+dnl femms instruction is available. emms_or_femms expands to femms if
+dnl available, or emms if not.
+dnl
+dnl emms_or_femms is meant for use in the K6 directory where plain K6
+dnl (without femms) and K6-2 and K6-3 (with a slightly faster femms) are
+dnl supported together.
+dnl
+dnl On K7 femms is no longer faster and is just an alias for emms, so plain
+dnl emms may as well be used.
+
+define(femms_available_p,
+m4_assert_numargs(-1)
+`m4_ifdef_anyof_p(
+ `HAVE_TARGET_CPU_k62',
+ `HAVE_TARGET_CPU_k63',
+ `HAVE_TARGET_CPU_athlon')')
+
+define(emms_or_femms,
+m4_assert_numargs(-1)
+`ifelse(femms_available_p,1,`femms',`emms')')
+
+
+dnl Usage: femms
+dnl
+dnl The gas 2.9.1 that comes with FreeBSD 3.4 doesn't support femms, so the
+dnl following is a replacement using .byte.
+dnl
+dnl If femms isn't available, an emms is generated instead, for convenience
+dnl when testing on a machine without femms.
+
+define(femms,
+m4_assert_numargs(-1)
+`ifelse(femms_available_p,1,
+`.byte 15,14 C AMD 3DNow femms',
+`emms`'dnl
+m4_warning(`warning, using emms in place of femms, use for testing only
+')')')
+
+
+dnl Usage: jadcl0(op)
+dnl
+dnl Issue a jnc/incl as a substitute for adcl $0,op. This isn't an exact
+dnl replacement, since it doesn't set the flags like adcl does.
+dnl
+dnl This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and
+dnl mpn_sqr_basecase because on K6 an adcl is slow, the branch
+dnl misprediction penalty is small, and the multiply algorithm used leads
+dnl to a carry bit on average only 1/4 of the time.
+dnl
+dnl jadcl0_disabled can be set to 1 to instead issue an ordinary adcl for
+dnl comparison. For example,
+dnl
+dnl define(`jadcl0_disabled',1)
+dnl
+dnl When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is
+dnl the same size as an adcl. This makes it possible to use the exact same
+dnl computed jump code when testing the relative speed of jnc/incl and adcl
+dnl with jadcl0_disabled.
+
+define(jadcl0,
+m4_assert_numargs(1)
+`ifelse(jadcl0_disabled,1,
+ `adcl $`'0, $1',
+ `jnc 1f
+ incl $1
+1:dnl')')
+
+
+dnl Usage: cmov_available_p
+dnl
+dnl Expand to 1 if cmov is available, 0 if not.
+
+define(cmov_available_p,
+`m4_ifdef_anyof_p(
+ `HAVE_TARGET_CPU_pentiumpro',
+ `HAVE_TARGET_CPU_pentium2',
+ `HAVE_TARGET_CPU_pentium3',
+ `HAVE_TARGET_CPU_athlon')')
+
+
+dnl Usage: x86_lookup(target, key,value, key,value, ...)
+dnl x86_lookup_p(target, key,value, key,value, ...)
+dnl
+dnl Look for `target' among the `key' parameters.
+dnl
+dnl x86_lookup expands to the corresponding `value', or generates an error
+dnl if `target' isn't found.
+dnl
+dnl x86_lookup_p expands to 1 if `target' is found, or 0 if not.
+
+define(x86_lookup,
+`ifelse(eval($#<3),1,
+`m4_error(`unrecognised part of x86 instruction: $1
+')',
+`ifelse(`$1',`$2', `$3',
+`x86_lookup(`$1',shift(shift(shift($@))))')')')
+
+define(x86_lookup_p,
+`ifelse(eval($#<3),1, `0',
+`ifelse(`$1',`$2', `1',
+`x86_lookup_p(`$1',shift(shift(shift($@))))')')')
+
+
+dnl Usage: x86_opcode_reg32(reg)
+dnl x86_opcode_reg32_p(reg)
+dnl
+dnl x86_opcode_reg32 expands to the standard 3 bit encoding for the given
+dnl 32-bit register, eg. `%ebp' turns into 5.
+dnl
+dnl x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0
+dnl if not.
+
+define(x86_opcode_reg32,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_p,
+m4_assert_onearg()
+`x86_lookup_p(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_list,
+``%eax',0,
+`%ecx',1,
+`%edx',2,
+`%ebx',3,
+`%esp',4,
+`%ebp',5,
+`%esi',6,
+`%edi',7')
+
+
+dnl Usage: x86_opcode_tttn(cond)
+dnl
+dnl Expand to the 4-bit "tttn" field value for the given x86 branch
+dnl condition (like `c', `ae', etc).
+
+define(x86_opcode_tttn,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_tttn_list)')
+
+define(x86_opcode_tttn_list,
+``o', 0,
+`no', 1,
+`b', 2, `c', 2, `nae',2,
+`nb', 3, `nc', 3, `ae', 3,
+`e', 4, `z', 4,
+`ne', 5, `nz', 5,
+`be', 6, `na', 6,
+`nbe', 7, `a', 7,
+`s', 8,
+`ns', 9,
+`p', 10, `pe', 10, `npo',10,
+`np', 11, `npe',11, `po', 11,
+`l', 12, `nge',12,
+`nl', 13, `ge', 13,
+`le', 14, `ng', 14,
+`nle',15, `g', 15')
+
+
+dnl Usage: cmovCC(srcreg,dstreg)
+dnl
+dnl Generate a cmov instruction if the target supports cmov, or simulate it
+dnl with a conditional jump if not (the latter being meant only for
+dnl testing). For example,
+dnl
+dnl cmovz( %eax, %ebx)
+dnl
+dnl cmov instructions are generated using .byte sequences, since only
+dnl recent versions of gas know cmov.
+dnl
+dnl The source operand can only be a plain register. (m4 code implementing
+dnl full memory addressing modes exists, believe it or not, but isn't
+dnl currently needed and isn't included.)
+dnl
+dnl All the standard conditions are defined. Attempting to use one without
+dnl the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke
+dnl an error. This ensures the necessary .byte sequences aren't
+dnl accidentally missed.
+
+dnl Called: define_cmov_many(cond,tttn,cond,tttn,...)
+define(define_cmov_many,
+`ifelse(m4_length(`$1'),0,,
+`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')')
+
+dnl Called: define_cmov(cond,tttn)
+define(define_cmov,
+m4_assert_numargs(2)
+`define(`cmov$1',
+m4_instruction_wrapper()
+m4_assert_numargs(2)
+`cmov_internal'(m4_doublequote($`'0),``$1',`$2'',dnl
+m4_doublequote($`'1),m4_doublequote($`'2)))')
+
+define_cmov_many(x86_opcode_tttn_list)
+
+
+dnl Called: cmov_internal(name,cond,tttn,src,dst)
+define(cmov_internal,
+m4_assert_numargs(5)
+`ifelse(cmov_available_p,1,
+`cmov_bytes_tttn(`$1',`$3',`$4',`$5')',
+`m4_warning(`warning, simulating cmov with jump, use for testing only
+')cmov_simulate(`$2',`$4',`$5')')')
+
+dnl Called: cmov_simulate(cond,src,dst)
+dnl If this is going to be used with memory operands for the source it will
+dnl need to be changed to do a fetch even if the condition is false, so as
+dnl to trigger exceptions the same way a real cmov does.
+define(cmov_simulate,
+m4_assert_numargs(3)
+ `j$1 1f C cmov$1 $2, $3
+ jmp 2f
+1: movl $2, $3
+2:')
+
+dnl Called: cmov_bytes_tttn(name,tttn,src,dst)
+define(cmov_bytes_tttn,
+m4_assert_numargs(4)
+`.byte dnl
+15, dnl
+eval(64+$2), dnl
+eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl
+ C `$1 $3, $4'')
+
+
+dnl Usage: loop_or_decljnz label
+dnl
+dnl Generate either a "loop" instruction or a "decl %ecx / jnz", whichever
+dnl is better. "loop" is better on K6 and probably on 386, on other chips
+dnl separate decl/jnz is better.
+dnl
+dnl This macro is just for mpn/x86/divrem_1.asm and mpn/x86/mod_1.asm where
+dnl this loop_or_decljnz variation is enough to let the code be shared by
+dnl all chips.
+
+define(loop_or_decljnz,
+`ifelse(loop_is_better_p,1,
+ `loop',
+ `decl %ecx
+ jnz')')
+
+define(loop_is_better_p,
+`m4_ifdef_anyof_p(`HAVE_TARGET_CPU_k6',
+ `HAVE_TARGET_CPU_k62',
+ `HAVE_TARGET_CPU_k63',
+ `HAVE_TARGET_CPU_i386')')
+
+
+dnl Usage: Zdisp(inst,op,op,op)
+dnl
+dnl Generate explicit .byte sequences if necessary to force a byte-sized
+dnl zero displacement on an instruction. For example,
+dnl
+dnl Zdisp( movl, 0,(%esi), %eax)
+dnl
+dnl expands to
+dnl
+dnl .byte 139,70,0 C movl 0(%esi), %eax
+dnl
+dnl If the displacement given isn't 0, then normal assembler code is
+dnl generated. For example,
+dnl
+dnl Zdisp( movl, 4,(%esi), %eax)
+dnl
+dnl expands to
+dnl
+dnl movl 4(%esi), %eax
+dnl
+dnl This means a single Zdisp() form can be used with an expression for the
+dnl displacement, and .byte will be used only if necessary. The
+dnl displacement argument is eval()ed.
+dnl
+dnl Because there aren't many places a 0(reg) form is wanted, Zdisp is
+dnl implemented with a table of instructions and encodings. A new entry is
+dnl needed for any different operation or registers.
+
+define(Zdisp,
+`define(`Zdisp_found',0)dnl
+Zdisp_match( movl, %eax, 0,(%edi), `137,71,0', $@)`'dnl
+Zdisp_match( movl, %ebx, 0,(%edi), `137,95,0', $@)`'dnl
+Zdisp_match( movl, %esi, 0,(%edi), `137,119,0', $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %eax, `139,67,0', $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %esi, `139,115,0', $@)`'dnl
+Zdisp_match( movl, 0,(%esi), %eax, `139,70,0', $@)`'dnl
+Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl
+Zdisp_match( addl, %ebx, 0,(%edi), `1,95,0', $@)`'dnl
+Zdisp_match( addl, %ecx, 0,(%edi), `1,79,0', $@)`'dnl
+Zdisp_match( addl, %esi, 0,(%edi), `1,119,0', $@)`'dnl
+Zdisp_match( subl, %ecx, 0,(%edi), `41,79,0', $@)`'dnl
+Zdisp_match( adcl, 0,(%edx), %esi, `19,114,0', $@)`'dnl
+Zdisp_match( sbbl, 0,(%edx), %esi, `27,114,0', $@)`'dnl
+Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%esi), %mm0, `15,111,70,0', $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edi), `15,127,71,0', $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl
+Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl
+ifelse(Zdisp_found,0,
+`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4
+')')')
+
+define(Zdisp_match,
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+ && m4_stringequal_p(`$2',0)
+ && m4_stringequal_p(`$3',`$8')
+ && m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$7'),0,
+` .byte $5 C `$1 0$3, $4'',
+` $6 $7$8, $9')',
+
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+ && m4_stringequal_p(`$2',`$7')
+ && m4_stringequal_p(`$3',0)
+ && m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$8'),0,
+` .byte $5 C `$1 $2, 0$4'',
+` $6 $7, $8$9')')')')
+
+
+dnl Usage: shldl(count,src,dst)
+dnl shrdl(count,src,dst)
+dnl shldw(count,src,dst)
+dnl shrdw(count,src,dst)
+dnl
+dnl Generate a double-shift instruction, possibly omitting a %cl count
+dnl parameter if that's what the assembler requires, as indicated by
+dnl WANT_SHLDL_CL in config.m4. For example,
+dnl
+dnl shldl( %cl, %eax, %ebx)
+dnl
+dnl turns into either
+dnl
+dnl shldl %cl, %eax, %ebx
+dnl or
+dnl shldl %eax, %ebx
+dnl
+dnl Immediate counts are always passed through unchanged. For example,
+dnl
+dnl shrdl( $2, %esi, %edi)
+dnl becomes
+dnl shrdl $2, %esi, %edi
+dnl
+dnl
+dnl If you forget to use the macro form "shldl( ...)" and instead write
+dnl just a plain "shldl ...", an error results. This ensures the necessary
+dnl variant treatment of %cl isn't accidentally bypassed.
+
+define(define_shd_instruction,
+`define($1,
+m4_instruction_wrapper()
+m4_assert_numargs(3)
+`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
+m4_doublequote($`'2),m4_doublequote($`'3)))')
+
+dnl Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
+define_shd_instruction(shldl)
+define_shd_instruction(shrdl)
+define_shd_instruction(shldw)
+define_shd_instruction(shrdw)
+
+dnl Called: shd_instruction(op,count,src,dst)
+define(shd_instruction,
+m4_assert_numargs(4)
+m4_assert_defined(`WANT_SHLDL_CL')
+`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
+``$1' `$3', `$4'',
+``$1' `$2', `$3', `$4'')')
+
+
+dnl Usage: ASSERT(cond, instructions)
+dnl
+dnl If WANT_ASSERT is 1, output the given instructions and expect the given
+dnl flags condition to then be satisfied. For example,
+dnl
+dnl ASSERT(ne, `cmpl %eax, %ebx')
+dnl
+dnl The instructions can be omitted to just assert a flags condition with
+dnl no extra calculation. For example,
+dnl
+dnl ASSERT(nc)
+dnl
+dnl When `instructions' is not empty, a pushf/popf is added to preserve the
+dnl flags, but the instructions themselves must preserve any registers that
+dnl matter. FRAME is adjusted for the push and pop, so the instructions
+dnl given can use defframe() stack variables.
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+`ifelse(WANT_ASSERT,1,
+ `C ASSERT
+ifelse(`$2',,,` pushf ifdef(`FRAME',`FRAME_pushl()')')
+ $2
+ j`$1' 1f
+ ud2 C assertion failed
+1:
+ifelse(`$2',,,` popf ifdef(`FRAME',`FRAME_popl()')')
+')')
+
+
+dnl Usage: movl_text_address(label,register)
+dnl
+dnl Get the address of a text segment label, using either a plain movl or a
+dnl position-independent calculation, as necessary. For example,
+dnl
+dnl movl_code_address(L(foo),%eax)
+dnl
+dnl This macro is only meant for use in ASSERT()s or when testing, since
+dnl the PIC sequence it generates will want to be done with a ret balancing
+dnl the call on CPUs with return address branch predition.
+dnl
+dnl The addl generated here has a backward reference to 1b, and so won't
+dnl suffer from the two forwards references bug in old gas (described in
+dnl mpn/x86/README.family).
+
+define(movl_text_address,
+`ifdef(`PIC',
+ `call 1f
+1: popl $2 C %eip
+ addl `$'$1-1b, $2',
+ `movl `$'$1, $2')')
+
+
+divert`'dnl
diff --git a/rts/gmp/mpn/z8000/add_n.s b/rts/gmp/mpn/z8000/add_n.s
new file mode 100644
index 0000000000..3a136107fe
--- /dev/null
+++ b/rts/gmp/mpn/z8000/add_n.s
@@ -0,0 +1,53 @@
+! Z8000 __gmpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ unseg
+ .text
+ even
+ global ___gmpn_add_n
+___gmpn_add_n:
+ pop r0,@r6
+ pop r1,@r5
+ add r0,r1
+ ld @r7,r0
+ dec r4
+ jr eq,Lend
+Loop: pop r0,@r6
+ pop r1,@r5
+ adc r0,r1
+ inc r7,#2
+ ld @r7,r0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ adc r2,r2
+ ret t
diff --git a/rts/gmp/mpn/z8000/gmp-mparam.h b/rts/gmp/mpn/z8000/gmp-mparam.h
new file mode 100644
index 0000000000..4216df673c
--- /dev/null
+++ b/rts/gmp/mpn/z8000/gmp-mparam.h
@@ -0,0 +1,27 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 16
+#define BYTES_PER_MP_LIMB 2
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 16
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
diff --git a/rts/gmp/mpn/z8000/mul_1.s b/rts/gmp/mpn/z8000/mul_1.s
new file mode 100644
index 0000000000..20fadd340a
--- /dev/null
+++ b/rts/gmp/mpn/z8000/mul_1.s
@@ -0,0 +1,68 @@
+! Z8000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+! the result in a second limb vector.
+
+! Copyright (C) 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! size r5
+! s2_limb r4
+
+ unseg
+ .text
+ even
+ global ___gmpn_mul_1
+___gmpn_mul_1:
+ sub r2,r2 ! zero carry limb
+ and r4,r4
+ jr mi,Lneg
+
+Lpos: pop r1,@r6
+ ld r9,r1
+ mult rr8,r4
+ and r1,r1 ! shift msb of loaded limb into cy
+ jr mi,Lp ! branch if loaded limb's msb is set
+ add r8,r4 ! hi_limb += sign_comp2
+Lp: add r9,r2 ! lo_limb += cy_limb
+ xor r2,r2
+ adc r2,r8
+ ld @r7,r9
+ inc r7,#2
+ dec r5
+ jr ne,Lpos
+ ret t
+
+Lneg: pop r1,@r6
+ ld r9,r1
+ mult rr8,r4
+ add r8,r1 ! hi_limb += sign_comp1
+ and r1,r1
+ jr mi,Ln
+ add r8,r4 ! hi_limb += sign_comp2
+Ln: add r9,r2 ! lo_limb += cy_limb
+ xor r2,r2
+ adc r2,r8
+ ld @r7,r9
+ inc r7,#2
+ dec r5
+ jr ne,Lneg
+ ret t
diff --git a/rts/gmp/mpn/z8000/sub_n.s b/rts/gmp/mpn/z8000/sub_n.s
new file mode 100644
index 0000000000..bd9a7ad409
--- /dev/null
+++ b/rts/gmp/mpn/z8000/sub_n.s
@@ -0,0 +1,54 @@
+! Z8000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ unseg
+ .text
+ even
+ global ___gmpn_sub_n
+___gmpn_sub_n:
+ pop r0,@r6
+ pop r1,@r5
+ sub r0,r1
+ ld @r7,r0
+ dec r4
+ jr eq,Lend
+Loop: pop r0,@r6
+ pop r1,@r5
+ sbc r0,r1
+ inc r7,#2
+ ld @r7,r0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ adc r2,r2
+ ret t
diff --git a/rts/gmp/mpn/z8000x/add_n.s b/rts/gmp/mpn/z8000x/add_n.s
new file mode 100644
index 0000000000..7f130785c5
--- /dev/null
+++ b/rts/gmp/mpn/z8000x/add_n.s
@@ -0,0 +1,56 @@
+! Z8000 (32 bit limb version) __gmpn_add_n -- Add two limb vectors of equal,
+! non-zero length.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ segm
+ .text
+ even
+ global ___gmpn_add_n
+___gmpn_add_n:
+ popl rr0,@r6
+ popl rr8,@r5
+ addl rr0,rr8
+ ldl @r7,rr0
+ dec r4
+ jr eq,Lend
+Loop: popl rr0,@r6
+ popl rr8,@r5
+ adc r1,r9
+ adc r0,r8
+ inc r7,#4
+ ldl @r7,rr0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ ld r3,r4
+ adc r2,r2
+ ret t
diff --git a/rts/gmp/mpn/z8000x/sub_n.s b/rts/gmp/mpn/z8000x/sub_n.s
new file mode 100644
index 0000000000..f416d1d6eb
--- /dev/null
+++ b/rts/gmp/mpn/z8000x/sub_n.s
@@ -0,0 +1,56 @@
+! Z8000 (32 bit limb version) __gmpn_sub_n -- Subtract two limb vectors of the
+! same length > 0 and store difference in a third limb vector.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ segm
+ .text
+ even
+ global ___gmpn_sub_n
+___gmpn_sub_n:
+ popl rr0,@r6
+ popl rr8,@r5
+ subl rr0,rr8
+ ldl @r7,rr0
+ dec r4
+ jr eq,Lend
+Loop: popl rr0,@r6
+ popl rr8,@r5
+ sbc r1,r9
+ sbc r0,r8
+ inc r7,#4
+ ldl @r7,rr0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ ld r3,r4
+ adc r2,r2
+ ret t
diff --git a/rts/gmp/mpz/Makefile.am b/rts/gmp/mpz/Makefile.am
new file mode 100644
index 0000000000..cd6fec4e21
--- /dev/null
+++ b/rts/gmp/mpz/Makefile.am
@@ -0,0 +1,58 @@
+## Process this file with automake to generate Makefile.in
+
+# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+AUTOMAKE_OPTIONS = gnu no-dependencies
+
+SUBDIRS = tests
+
+INCLUDES = -I$(top_srcdir) -DOPERATION_$*
+
+noinst_LTLIBRARIES = libmpz.la
+libmpz_la_SOURCES = \
+ abs.c add.c add_ui.c addmul_ui.c and.c array_init.c \
+ bin_ui.c bin_uiui.c cdiv_q.c \
+ cdiv_q_ui.c cdiv_qr.c cdiv_qr_ui.c cdiv_r.c cdiv_r_ui.c cdiv_ui.c \
+ clear.c clrbit.c cmp.c cmp_si.c cmp_ui.c cmpabs.c cmpabs_ui.c com.c \
+ divexact.c dump.c fac_ui.c fdiv_q.c fdiv_q_2exp.c fdiv_q_ui.c \
+ fdiv_qr.c fdiv_qr_ui.c fdiv_r.c fdiv_r_2exp.c fdiv_r_ui.c fdiv_ui.c \
+ fib_ui.c fits_sint_p.c fits_slong_p.c fits_sshort_p.c fits_uint_p.c \
+ fits_ulong_p.c fits_ushort_p.c gcd.c gcd_ui.c gcdext.c get_d.c get_si.c \
+ get_str.c get_ui.c getlimbn.c hamdist.c init.c inp_raw.c inp_str.c \
+ invert.c ior.c iset.c iset_d.c iset_si.c iset_str.c iset_ui.c \
+ jacobi.c kronsz.c kronuz.c kronzs.c kronzu.c \
+ lcm.c legendre.c mod.c mul.c mul_2exp.c neg.c nextprime.c \
+ out_raw.c out_str.c perfpow.c perfsqr.c popcount.c pow_ui.c powm.c \
+ powm_ui.c pprime_p.c random.c random2.c realloc.c remove.c root.c rrandomb.c \
+ scan0.c scan1.c set.c set_d.c set_f.c set_q.c set_si.c set_str.c \
+ set_ui.c setbit.c size.c sizeinbase.c sqrt.c sqrtrem.c sub.c \
+ sub_ui.c swap.c tdiv_ui.c tdiv_q.c tdiv_q_2exp.c tdiv_q_ui.c tdiv_qr.c \
+ tdiv_qr_ui.c tdiv_r.c tdiv_r_2exp.c tdiv_r_ui.c tstbit.c ui_pow_ui.c \
+ urandomb.c urandomm.c xor.c
+
+EXTRA_DIST = mul_siui.c
+nodist_libmpz_la_SOURCES = mul_si.c mul_ui.c
+CLEANFILES = $(nodist_libmpz_la_SOURCES)
+
+mul_si.c: $(srcdir)/mul_siui.c
+ cp $(srcdir)/mul_siui.c mul_si.c
+mul_ui.c: $(srcdir)/mul_siui.c
+ cp $(srcdir)/mul_siui.c mul_ui.c
diff --git a/rts/gmp/mpz/Makefile.in b/rts/gmp/mpz/Makefile.in
new file mode 100644
index 0000000000..e0f2cdc133
--- /dev/null
+++ b/rts/gmp/mpz/Makefile.in
@@ -0,0 +1,457 @@
+# Makefile.in generated automatically by automake 1.4a from Makefile.am
+
+# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+SHELL = @SHELL@
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+
+bindir = @bindir@
+sbindir = @sbindir@
+libexecdir = @libexecdir@
+datadir = @datadir@
+sysconfdir = @sysconfdir@
+sharedstatedir = @sharedstatedir@
+localstatedir = @localstatedir@
+libdir = @libdir@
+infodir = @infodir@
+mandir = @mandir@
+includedir = @includedir@
+oldincludedir = /usr/include
+
+DESTDIR =
+
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+
+top_builddir = ..
+
+ACLOCAL = @ACLOCAL@
+AUTOCONF = @AUTOCONF@
+AUTOMAKE = @AUTOMAKE@
+AUTOHEADER = @AUTOHEADER@
+
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_FLAG =
+transform = @program_transform_name@
+
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+
+@SET_MAKE@
+build_alias = @build_alias@
+build_triplet = @build@
+host_alias = @host_alias@
+host_triplet = @host@
+target_alias = @target_alias@
+target_triplet = @target@
+AMDEP = @AMDEP@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AWK = @AWK@
+CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@
+CC = @CC@
+CCAS = @CCAS@
+CPP = @CPP@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+EXEEXT = @EXEEXT@
+LIBTOOL = @LIBTOOL@
+LN_S = @LN_S@
+M4 = @M4@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+RANLIB = @RANLIB@
+SPEED_CYCLECOUNTER_OBJS = @SPEED_CYCLECOUNTER_OBJS@
+STRIP = @STRIP@
+U = @U@
+VERSION = @VERSION@
+gmp_srclinks = @gmp_srclinks@
+install_sh = @install_sh@
+mpn_objects = @mpn_objects@
+mpn_objs_in_libgmp = @mpn_objs_in_libgmp@
+
+# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+AUTOMAKE_OPTIONS = gnu no-dependencies
+
+SUBDIRS =
+
+INCLUDES = -I$(top_srcdir) -DOPERATION_$*
+
+noinst_LTLIBRARIES = libmpz.la
+libmpz_la_SOURCES = \
+ abs.c add.c add_ui.c addmul_ui.c and.c array_init.c \
+ bin_ui.c bin_uiui.c cdiv_q.c \
+ cdiv_q_ui.c cdiv_qr.c cdiv_qr_ui.c cdiv_r.c cdiv_r_ui.c cdiv_ui.c \
+ clear.c clrbit.c cmp.c cmp_si.c cmp_ui.c cmpabs.c cmpabs_ui.c com.c \
+ divexact.c dump.c fac_ui.c fdiv_q.c fdiv_q_2exp.c fdiv_q_ui.c \
+ fdiv_qr.c fdiv_qr_ui.c fdiv_r.c fdiv_r_2exp.c fdiv_r_ui.c fdiv_ui.c \
+ fib_ui.c fits_sint_p.c fits_slong_p.c fits_sshort_p.c fits_uint_p.c \
+ fits_ulong_p.c fits_ushort_p.c gcd.c gcd_ui.c gcdext.c get_d.c get_si.c \
+ get_str.c get_ui.c getlimbn.c hamdist.c init.c inp_raw.c inp_str.c \
+ invert.c ior.c iset.c iset_d.c iset_si.c iset_str.c iset_ui.c \
+ jacobi.c kronsz.c kronuz.c kronzs.c kronzu.c \
+ lcm.c legendre.c mod.c mul.c mul_2exp.c neg.c nextprime.c \
+ out_raw.c out_str.c perfpow.c perfsqr.c popcount.c pow_ui.c powm.c \
+ powm_ui.c pprime_p.c random.c random2.c realloc.c remove.c root.c rrandomb.c \
+ scan0.c scan1.c set.c set_d.c set_f.c set_q.c set_si.c set_str.c \
+ set_ui.c setbit.c size.c sizeinbase.c sqrt.c sqrtrem.c sub.c \
+ sub_ui.c swap.c tdiv_ui.c tdiv_q.c tdiv_q_2exp.c tdiv_q_ui.c tdiv_qr.c \
+ tdiv_qr_ui.c tdiv_r.c tdiv_r_2exp.c tdiv_r_ui.c tstbit.c ui_pow_ui.c \
+ urandomb.c urandomm.c xor.c
+
+
+EXTRA_DIST = mul_siui.c
+nodist_libmpz_la_SOURCES = mul_si.c mul_ui.c
+CLEANFILES = $(nodist_libmpz_la_SOURCES)
+subdir = mpz
+mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
+CONFIG_HEADER = ../config.h
+CONFIG_CLEAN_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+
+
+DEFS = @DEFS@ -I. -I$(srcdir) -I..
+CPPFLAGS = @CPPFLAGS@
+LDFLAGS = @LDFLAGS@
+LIBS = @LIBS@
+libmpz_la_LDFLAGS =
+libmpz_la_LIBADD =
+am_libmpz_la_OBJECTS = abs.lo add.lo add_ui.lo addmul_ui.lo and.lo \
+array_init.lo bin_ui.lo bin_uiui.lo cdiv_q.lo cdiv_q_ui.lo cdiv_qr.lo \
+cdiv_qr_ui.lo cdiv_r.lo cdiv_r_ui.lo cdiv_ui.lo clear.lo clrbit.lo \
+cmp.lo cmp_si.lo cmp_ui.lo cmpabs.lo cmpabs_ui.lo com.lo divexact.lo \
+dump.lo fac_ui.lo fdiv_q.lo fdiv_q_2exp.lo fdiv_q_ui.lo fdiv_qr.lo \
+fdiv_qr_ui.lo fdiv_r.lo fdiv_r_2exp.lo fdiv_r_ui.lo fdiv_ui.lo \
+fib_ui.lo fits_sint_p.lo fits_slong_p.lo fits_sshort_p.lo \
+fits_uint_p.lo fits_ulong_p.lo fits_ushort_p.lo gcd.lo gcd_ui.lo \
+gcdext.lo get_d.lo get_si.lo get_str.lo get_ui.lo getlimbn.lo \
+hamdist.lo init.lo inp_raw.lo inp_str.lo invert.lo ior.lo iset.lo \
+iset_d.lo iset_si.lo iset_str.lo iset_ui.lo jacobi.lo kronsz.lo \
+kronuz.lo kronzs.lo kronzu.lo lcm.lo legendre.lo mod.lo mul.lo \
+mul_2exp.lo neg.lo nextprime.lo out_raw.lo out_str.lo perfpow.lo \
+perfsqr.lo popcount.lo pow_ui.lo powm.lo powm_ui.lo pprime_p.lo \
+random.lo random2.lo realloc.lo remove.lo root.lo rrandomb.lo scan0.lo \
+scan1.lo set.lo set_d.lo set_f.lo set_q.lo set_si.lo set_str.lo \
+set_ui.lo setbit.lo size.lo sizeinbase.lo sqrt.lo sqrtrem.lo sub.lo \
+sub_ui.lo swap.lo tdiv_ui.lo tdiv_q.lo tdiv_q_2exp.lo tdiv_q_ui.lo \
+tdiv_qr.lo tdiv_qr_ui.lo tdiv_r.lo tdiv_r_2exp.lo tdiv_r_ui.lo \
+tstbit.lo ui_pow_ui.lo urandomb.lo urandomm.lo xor.lo
+nodist_libmpz_la_OBJECTS = mul_si.lo mul_ui.lo
+libmpz_la_OBJECTS = $(am_libmpz_la_OBJECTS) $(nodist_libmpz_la_OBJECTS)
+COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CFLAGS = @CFLAGS@
+CCLD = $(CC)
+LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+DIST_SOURCES = $(libmpz_la_SOURCES)
+DIST_COMMON = README Makefile.am Makefile.in
+
+
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+
+GZIP_ENV = --best
+depcomp =
+SOURCES = $(libmpz_la_SOURCES) $(nodist_libmpz_la_SOURCES)
+OBJECTS = $(am_libmpz_la_OBJECTS) $(nodist_libmpz_la_OBJECTS)
+
+all: all-redirect
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4)
+ cd $(top_srcdir) && $(AUTOMAKE) --gnu mpz/Makefile
+
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ cd $(top_builddir) \
+ && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+
+mostlyclean-noinstLTLIBRARIES:
+
+clean-noinstLTLIBRARIES:
+ -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+
+distclean-noinstLTLIBRARIES:
+
+maintainer-clean-noinstLTLIBRARIES:
+
+mostlyclean-compile:
+ -rm -f *.o core *.core
+ -rm -f *.$(OBJEXT)
+
+clean-compile:
+
+distclean-compile:
+ -rm -f *.tab.c
+
+maintainer-clean-compile:
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+distclean-libtool:
+
+maintainer-clean-libtool:
+
+libmpz.la: $(libmpz_la_OBJECTS) $(libmpz_la_DEPENDENCIES)
+ $(LINK) $(libmpz_la_LDFLAGS) $(libmpz_la_OBJECTS) $(libmpz_la_LIBADD) $(LIBS)
+.c.o:
+ $(COMPILE) -c $<
+.c.obj:
+ $(COMPILE) -c `cygpath -w $<`
+.c.lo:
+ $(LTCOMPILE) -c -o $@ $<
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+# (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+
+all-recursive install-data-recursive install-exec-recursive \
+installdirs-recursive install-recursive uninstall-recursive \
+check-recursive installcheck-recursive info-recursive dvi-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ target=`echo $@ | sed s/-recursive//`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ dot_seen=yes; \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done; \
+ if test "$$dot_seen" = "no"; then \
+ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+ fi; test -z "$$fail"
+
+mostlyclean-recursive clean-recursive distclean-recursive \
+maintainer-clean-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ rev=''; list='$(SUBDIRS)'; for subdir in $$list; do \
+ rev="$$subdir $$rev"; \
+ if test "$$subdir" = "."; then dot_seen=yes; else :; fi; \
+ done; \
+ test "$$dot_seen" = "no" && rev=". $$rev"; \
+ target=`echo $@ | sed s/-recursive//`; \
+ for subdir in $$rev; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done && test -z "$$fail"
+tags-recursive:
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+ done
+
+tags: TAGS
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ mkid -f$$here/ID $$unique $(LISP)
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ tags=; \
+ here=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ if test "$$subdir" = .; then :; else \
+ test -f $$subdir/TAGS && tags="$$tags -i $$here/$$subdir/TAGS"; \
+ fi; \
+ done; \
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \
+ || etags $(ETAGS_ARGS) $$tags $$unique $(LISP)
+
+mostlyclean-tags:
+
+clean-tags:
+
+distclean-tags:
+ -rm -f TAGS ID
+
+maintainer-clean-tags:
+
+distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir)
+
+distdir: $(DISTFILES)
+ @for file in $(DISTFILES); do \
+ d=$(srcdir); \
+ if test -d $$d/$$file; then \
+ cp -pR $$d/$$file $(distdir); \
+ else \
+ test -f $(distdir)/$$file \
+ || cp -p $$d/$$file $(distdir)/$$file || :; \
+ fi; \
+ done
+ for subdir in $(SUBDIRS); do \
+ if test "$$subdir" = .; then :; else \
+ test -d $(distdir)/$$subdir \
+ || mkdir $(distdir)/$$subdir \
+ || exit 1; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(top_distdir) distdir=../$(distdir)/$$subdir distdir) \
+ || exit 1; \
+ fi; \
+ done
+info-am:
+info: info-recursive
+dvi-am:
+dvi: dvi-recursive
+check-am: all-am
+check: check-recursive
+installcheck-am:
+installcheck: installcheck-recursive
+install-exec-am:
+install-exec: install-exec-recursive
+
+install-data-am:
+install-data: install-data-recursive
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+install: install-recursive
+uninstall-am:
+uninstall: uninstall-recursive
+all-am: Makefile $(LTLIBRARIES)
+all-redirect: all-recursive
+install-strip:
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_STRIP_FLAG=-s install
+installdirs: installdirs-recursive
+installdirs-am:
+
+
+mostlyclean-generic:
+
+clean-generic:
+ -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
+
+distclean-generic:
+ -rm -f Makefile $(CONFIG_CLEAN_FILES)
+ -rm -f config.cache config.log stamp-h stamp-h[0-9]*
+
+maintainer-clean-generic:
+ -rm -f Makefile.in
+mostlyclean-am: mostlyclean-noinstLTLIBRARIES mostlyclean-compile \
+ mostlyclean-libtool mostlyclean-tags \
+ mostlyclean-generic
+
+mostlyclean: mostlyclean-recursive
+
+clean-am: clean-noinstLTLIBRARIES clean-compile clean-libtool \
+ clean-tags clean-generic mostlyclean-am
+
+clean: clean-recursive
+
+distclean-am: distclean-noinstLTLIBRARIES distclean-compile \
+ distclean-libtool distclean-tags distclean-generic \
+ clean-am
+ -rm -f libtool
+
+distclean: distclean-recursive
+
+maintainer-clean-am: maintainer-clean-noinstLTLIBRARIES \
+ maintainer-clean-compile maintainer-clean-libtool \
+ maintainer-clean-tags maintainer-clean-generic \
+ distclean-am
+ @echo "This command is intended for maintainers to use;"
+ @echo "it deletes files that may require special tools to rebuild."
+
+maintainer-clean: maintainer-clean-recursive
+
+.PHONY: mostlyclean-noinstLTLIBRARIES distclean-noinstLTLIBRARIES \
+clean-noinstLTLIBRARIES maintainer-clean-noinstLTLIBRARIES \
+mostlyclean-compile distclean-compile clean-compile \
+maintainer-clean-compile mostlyclean-libtool distclean-libtool \
+clean-libtool maintainer-clean-libtool install-recursive \
+uninstall-recursive install-data-recursive uninstall-data-recursive \
+install-exec-recursive uninstall-exec-recursive installdirs-recursive \
+uninstalldirs-recursive all-recursive check-recursive \
+installcheck-recursive info-recursive dvi-recursive \
+mostlyclean-recursive distclean-recursive clean-recursive \
+maintainer-clean-recursive tags tags-recursive mostlyclean-tags \
+distclean-tags clean-tags maintainer-clean-tags distdir info-am info \
+dvi-am dvi check check-am installcheck-am installcheck install-exec-am \
+install-exec install-data-am install-data install-am install \
+uninstall-am uninstall all-redirect all-am all install-strip \
+installdirs-am installdirs mostlyclean-generic distclean-generic \
+clean-generic maintainer-clean-generic clean mostlyclean distclean \
+maintainer-clean
+
+
+mul_si.c: $(srcdir)/mul_siui.c
+ cp $(srcdir)/mul_siui.c mul_si.c
+mul_ui.c: $(srcdir)/mul_siui.c
+ cp $(srcdir)/mul_siui.c mul_ui.c
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/rts/gmp/mpz/README b/rts/gmp/mpz/README
new file mode 100644
index 0000000000..06b481d770
--- /dev/null
+++ b/rts/gmp/mpz/README
@@ -0,0 +1,23 @@
+This directory contains functions for GMP's integer function layer.
+
+In this version of GMP, integers are represented like in the figure below.
+(Please note that the format might change between every version, and that
+depending on the internal format in any way is a bad idea.)
+
+ most least
+significant significant
+ limb limb
+
+ _mp_d
+ /
+ /
+ \/
+ ____ ____ ____ ____ ____
+ |____|____|____|____|____|
+
+ <------- _mp_size ------->
+
+
+The most significant limb will be non-zero. The _mp_size field's sign
+reflects the sign of the number. Its absolute value is the count of limbs
+in the number.
diff --git a/rts/gmp/mpz/abs.c b/rts/gmp/mpz/abs.c
new file mode 100644
index 0000000000..0b5eab1ce6
--- /dev/null
+++ b/rts/gmp/mpz/abs.c
@@ -0,0 +1,51 @@
+/* mpz_abs(dst, src) -- Assign the absolute value of SRC to DST.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Set W to |U|.  Limb data is copied only when W and U are distinct
+ objects; when W == U only the sign of the size field is fixed up. */
+void
+#if __STDC__
+mpz_abs (mpz_ptr w, mpz_srcptr u)
+#else
+mpz_abs (w, u)
+ mpz_ptr w;
+ mpz_srcptr u;
+#endif
+{
+ mp_ptr wp, up;
+ mp_size_t size;
+
+ /* _mp_size carries the operand's sign; its absolute value is the
+ limb count, so |U| needs exactly ABS(size) limbs. */
+ size = ABS (u->_mp_size);
+
+ if (u != w)
+ {
+ if (w->_mp_alloc < size)
+ _mpz_realloc (w, size);
+
+ /* Read the limb pointers only after the possible realloc above. */
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ MPN_COPY (wp, up, size);
+ }
+
+ /* Non-negative _mp_size encodes a non-negative value. */
+ w->_mp_size = size;
+}
diff --git a/rts/gmp/mpz/add.c b/rts/gmp/mpz/add.c
new file mode 100644
index 0000000000..a22c3778fb
--- /dev/null
+++ b/rts/gmp/mpz/add.c
@@ -0,0 +1,123 @@
+/* mpz_add -- Add two integers.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+/* W = U + V.  Compiled as mpz_add, or as the Berkeley MP compatibility
+ entry point madd (same body, different name and argument order). */
+#ifndef BERKELEY_MP
+void
+#if __STDC__
+mpz_add (mpz_ptr w, mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_add (w, u, v)
+ mpz_ptr w;
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+madd (mpz_srcptr u, mpz_srcptr v, mpz_ptr w)
+#else
+madd (u, v, w)
+ mpz_srcptr u;
+ mpz_srcptr v;
+ mpz_ptr w;
+#endif
+#endif /* BERKELEY_MP */
+{
+ mp_srcptr up, vp;
+ mp_ptr wp;
+ mp_size_t usize, vsize, wsize;
+ mp_size_t abs_usize;
+ mp_size_t abs_vsize;
+
+ /* The sign of each operand lives in the sign of its _mp_size field. */
+ usize = u->_mp_size;
+ vsize = v->_mp_size;
+ abs_usize = ABS (usize);
+ abs_vsize = ABS (vsize);
+
+ if (abs_usize < abs_vsize)
+ {
+ /* Swap U and V. */
+ MPZ_SRCPTR_SWAP (u, v);
+ MP_SIZE_T_SWAP (usize, vsize);
+ MP_SIZE_T_SWAP (abs_usize, abs_vsize);
+ }
+
+ /* True: ABS_USIZE >= ABS_VSIZE. */
+
+ /* If not space for w (and possible carry), increase space. */
+ wsize = abs_usize + 1;
+ if (w->_mp_alloc < wsize)
+ _mpz_realloc (w, wsize);
+
+ /* These must be after realloc (u or v may be the same as w). */
+ up = u->_mp_d;
+ vp = v->_mp_d;
+ wp = w->_mp_d;
+
+ /* XOR of the size fields is negative exactly when the signs differ. */
+ if ((usize ^ vsize) < 0)
+ {
+ /* U and V have different sign. Need to compare them to determine
+ which operand to subtract from which. */
+
+ /* This test is right since ABS_USIZE >= ABS_VSIZE. */
+ if (abs_usize != abs_vsize)
+ {
+ mpn_sub (wp, up, abs_usize, vp, abs_vsize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize < 0)
+ wsize = -wsize;
+ }
+ else if (mpn_cmp (up, vp, abs_usize) < 0)
+ {
+ mpn_sub_n (wp, vp, up, abs_usize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize >= 0)
+ wsize = -wsize;
+ }
+ else
+ {
+ mpn_sub_n (wp, up, vp, abs_usize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize < 0)
+ wsize = -wsize;
+ }
+ }
+ else
+ {
+ /* U and V have same sign. Add them. */
+ mp_limb_t cy_limb = mpn_add (wp, up, abs_usize, vp, abs_vsize);
+ wp[abs_usize] = cy_limb;
+ /* cy_limb is 0 or 1, so this also bumps the size when a carry limb
+ was produced. */
+ wsize = abs_usize + cy_limb;
+ if (usize < 0)
+ wsize = -wsize;
+ }
+
+ w->_mp_size = wsize;
+}
diff --git a/rts/gmp/mpz/add_ui.c b/rts/gmp/mpz/add_ui.c
new file mode 100644
index 0000000000..28dbd71f45
--- /dev/null
+++ b/rts/gmp/mpz/add_ui.c
@@ -0,0 +1,84 @@
+/* mpz_add_ui -- Add an mpz_t and an unsigned one-word integer.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* W = U + V, where V is a single unsigned word. */
+void
+#if __STDC__
+mpz_add_ui (mpz_ptr w, mpz_srcptr u, unsigned long int v)
+#else
+mpz_add_ui (w, u, v)
+ mpz_ptr w;
+ mpz_srcptr u;
+ unsigned long int v;
+#endif
+{
+ mp_srcptr up;
+ mp_ptr wp;
+ mp_size_t usize, wsize;
+ mp_size_t abs_usize;
+
+ usize = u->_mp_size;
+ abs_usize = ABS (usize);
+
+ /* If not space for W (and possible carry), increase space. */
+ wsize = abs_usize + 1;
+ if (w->_mp_alloc < wsize)
+ _mpz_realloc (w, wsize);
+
+ /* These must be after realloc (U may be the same as W). */
+ up = u->_mp_d;
+ wp = w->_mp_d;
+
+ /* U == 0: the result is just V (size 0 when V is itself zero). */
+ if (abs_usize == 0)
+ {
+ wp[0] = v;
+ w->_mp_size = v != 0;
+ return;
+ }
+
+ if (usize >= 0)
+ {
+ /* U >= 0: plain limb addition; cy (0 or 1) extends the size. */
+ mp_limb_t cy;
+ cy = mpn_add_1 (wp, up, abs_usize, (mp_limb_t) v);
+ wp[abs_usize] = cy;
+ wsize = abs_usize + cy;
+ }
+ else
+ {
+ /* The signs are different. Need exact comparison to determine
+ which operand to subtract from which. */
+ if (abs_usize == 1 && up[0] < v)
+ {
+ /* |U| < V, so the result v - |U| is positive. */
+ wp[0] = v - up[0];
+ wsize = 1;
+ }
+ else
+ {
+ mpn_sub_1 (wp, up, abs_usize, (mp_limb_t) v);
+ /* Size can decrease with at most one limb. */
+ wsize = -(abs_usize - (wp[abs_usize - 1] == 0));
+ }
+ }
+
+ w->_mp_size = wsize;
+}
diff --git a/rts/gmp/mpz/addmul_ui.c b/rts/gmp/mpz/addmul_ui.c
new file mode 100644
index 0000000000..7b38d3624d
--- /dev/null
+++ b/rts/gmp/mpz/addmul_ui.c
@@ -0,0 +1,214 @@
+/* mpz_addmul_ui(prodsum, multiplier, small_multiplicand) --
+ Add MULTIPLIER times SMALL_MULTIPLICAND to PRODSUM.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+static mp_limb_t mpn_neg1 _PROTO ((mp_ptr, mp_size_t));
+
+#if 0
+#undef MPN_NORMALIZE
+#define MPN_NORMALIZE(DST, NLIMBS) \
+ do { \
+ while (--(NLIMBS) >= 0 && (DST)[NLIMBS] == 0) \
+ ; \
+ (NLIMBS)++; \
+ } while (0)
+#undef MPN_NORMALIZE_NOT_ZERO
+#define MPN_NORMALIZE_NOT_ZERO(DST, NLIMBS) \
+ do { \
+ while ((DST)[--(NLIMBS)] == 0) \
+ ; \
+ (NLIMBS)++; \
+ } while (0)
+#endif
+
+/* RZ += AZ * BU.  Four cases: RZ == 0 (plain multiply), same signs
+ (accumulate with add), opposite signs with |RZ| the longer operand
+ (subtract in place), and opposite signs with the product possibly
+ larger (compute a*b - r and negate, avoiding temporary storage). */
+void
+#if __STDC__
+mpz_addmul_ui (mpz_ptr rz, mpz_srcptr az, unsigned long int bu)
+#else
+mpz_addmul_ui (rz, az, bu)
+ mpz_ptr rz;
+ mpz_srcptr az;
+ unsigned long int bu;
+#endif
+{
+ mp_size_t rn, an;
+ mp_ptr rp, ap;
+
+ an = SIZ (az);
+
+ /* If either multiplier is zero, result is unaffected. */
+ if (bu == 0 || an == 0)
+ return;
+
+ rn = SIZ (rz);
+
+ /* RZ == 0: the sum degenerates to the product AZ * BU. */
+ if (rn == 0)
+ {
+ mp_limb_t cy;
+
+ an = ABS (an);
+ if (ALLOC (rz) <= an)
+ _mpz_realloc (rz, an + 1);
+ rp = PTR (rz);
+ ap = PTR (az);
+ cy = mpn_mul_1 (rp, ap, an, (mp_limb_t) bu);
+ rp[an] = cy;
+ an += cy != 0;
+ SIZ (rz) = SIZ (az) >= 0 ? an : -an;
+ return;
+ }
+
+ /* Non-negative XOR of the size fields means equal signs. */
+ if ((an ^ rn) >= 0)
+ {
+ /* Sign of operands are the same--really add. */
+ an = ABS (an);
+ rn = ABS (rn);
+ if (rn > an)
+ {
+ mp_limb_t cy;
+ if (ALLOC (rz) <= rn)
+ _mpz_realloc (rz, rn + 1);
+ rp = PTR (rz);
+ ap = PTR (az);
+ cy = mpn_addmul_1 (rp, ap, an, (mp_limb_t) bu);
+ /* Propagate the multiply carry through the high part of RZ. */
+ cy = mpn_add_1 (rp + an, rp + an, rn - an, cy);
+ rp[rn] = cy;
+ rn += cy != 0;
+ SIZ (rz) = SIZ (rz) >= 0 ? rn : -rn;
+ return;
+ }
+ else
+ {
+ mp_limb_t cy;
+ if (ALLOC (rz) <= an)
+ _mpz_realloc (rz, an + 1);
+ rp = PTR (rz);
+ ap = PTR (az);
+ cy = mpn_addmul_1 (rp, ap, rn, (mp_limb_t) bu);
+ if (an != rn)
+ {
+ /* The product has limbs above RZ's top: multiply them in
+ and fold the running carry. */
+ mp_limb_t cy2;
+ cy2 = mpn_mul_1 (rp + rn, ap + rn, an - rn, (mp_limb_t) bu);
+ cy = cy2 + mpn_add_1 (rp + rn, rp + rn, an - rn, cy);
+ }
+ rn = an;
+ rp[rn] = cy;
+ rn += cy != 0;
+ SIZ (rz) = SIZ (rz) >= 0 ? rn : -rn;
+ return;
+ }
+ }
+ else
+ {
+ /* Sign of operands are different--actually subtract. */
+ an = ABS (an);
+ rn = ABS (rn);
+ if (rn > an)
+ {
+ mp_limb_t cy;
+ rp = PTR (rz);
+ ap = PTR (az);
+ cy = mpn_submul_1 (rp, ap, an, (mp_limb_t) bu);
+ cy = mpn_sub_1 (rp + an, rp + an, rn - an, cy);
+ if (cy != 0)
+ {
+ /* Borrow out of the top: the difference went negative,
+ so two's-complement negate the limb vector. */
+ mpn_neg1 (rp, rn);
+ MPN_NORMALIZE_NOT_ZERO (rp, rn);
+ }
+ else
+ {
+ MPN_NORMALIZE (rp, rn);
+ rn = -rn;
+ }
+
+ SIZ (rz) = SIZ (rz) >= 0 ? -rn : rn;
+ return;
+ }
+ else
+ {
+ /* Tricky case. We need to subtract an operand that might be larger
+ than the minuend. To avoid allocating temporary space, we compute
+ a*b-r instead of r-a*b and then negate. */
+ mp_limb_t cy;
+ if (ALLOC (rz) <= an)
+ _mpz_realloc (rz, an + 1);
+ rp = PTR (rz);
+ ap = PTR (az);
+ cy = mpn_submul_1 (rp, ap, rn, (mp_limb_t) bu);
+ if (an != rn)
+ {
+ mp_limb_t cy2;
+ cy -= mpn_neg1 (rp, rn);
+ cy2 = mpn_mul_1 (rp + rn, ap + rn, an - rn, (mp_limb_t) bu);
+ /* cy == ~0 means a borrow must still be propagated upward;
+ otherwise the remaining carry is added in. */
+ if (cy == ~(mp_limb_t) 0)
+ cy = cy2 - mpn_sub_1 (rp + rn, rp + rn, an - rn, (mp_limb_t) 1);
+ else
+ cy = cy2 + mpn_add_1 (rp + rn, rp + rn, an - rn, cy);
+ rp[an] = cy;
+ rn = an + (cy != 0);
+ rn -= rp[rn - 1] == 0;
+ }
+ else if (cy != 0)
+ {
+ cy -= mpn_neg1 (rp, rn);
+ rp[an] = cy;
+ rn = an + 1;
+ MPN_NORMALIZE_NOT_ZERO (rp, rn);
+ }
+ else
+ {
+ rn = an;
+ MPN_NORMALIZE (rp, rn);
+ rn = -rn;
+ }
+
+ /* We computed the negated difference, so flip the stored sign. */
+ SIZ (rz) = SIZ (rz) >= 0 ? -rn : rn;
+ return;
+ }
+ }
+}
+
+/* Two's-complement negate the RN-limb vector at RP in place.
+ Returns 1 if the vector was non-zero (a borrow for the caller),
+ 0 if it was all zero (negation is a no-op). */
+static mp_limb_t
+#if __STDC__
+mpn_neg1 (mp_ptr rp, mp_size_t rn)
+#else
+mpn_neg1 (rp, rn)
+ mp_ptr rp;
+ mp_size_t rn;
+#endif
+{
+ mp_size_t i;
+
+ /* Skip low zero limbs: -0 limbs stay 0 under two's complement. */
+ while (rn != 0 && rp[0] == 0)
+ rp++, rn--;
+
+ if (rn != 0)
+ {
+ /* Negate the lowest non-zero limb, one's-complement the rest. */
+ rp[0] = -rp[0];
+ for (i = 1; i < rn; i++)
+ rp[i] = ~rp[i];
+ return 1;
+ }
+ return 0;
+}
diff --git a/rts/gmp/mpz/and.c b/rts/gmp/mpz/and.c
new file mode 100644
index 0000000000..354e9455bf
--- /dev/null
+++ b/rts/gmp/mpz/and.c
@@ -0,0 +1,278 @@
+/* mpz_and -- Logical and.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* RES = OP1 & OP2, with negative operands treated as infinite-precision
+ two's complement.  The sign combinations are handled separately:
+ both positive (direct limb AND), both negative (identity
+ ((OP1-1)|(OP2-1))+1), and mixed signs (OP1 & ~(OP2-1)). */
+void
+#if __STDC__
+mpz_and (mpz_ptr res, mpz_srcptr op1, mpz_srcptr op2)
+#else
+mpz_and (res, op1, op2)
+ mpz_ptr res;
+ mpz_srcptr op1;
+ mpz_srcptr op2;
+#endif
+{
+ mp_srcptr op1_ptr, op2_ptr;
+ mp_size_t op1_size, op2_size;
+ mp_ptr res_ptr;
+ mp_size_t res_size;
+ mp_size_t i;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ op1_size = op1->_mp_size;
+ op2_size = op2->_mp_size;
+
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+
+ if (op1_size >= 0)
+ {
+ if (op2_size >= 0)
+ {
+ /* Both operands non-negative: the AND can only clear bits, so
+ the result fits in MIN(op1_size, op2_size) limbs. */
+ res_size = MIN (op1_size, op2_size);
+ /* First loop finds the size of the result. */
+ for (i = res_size - 1; i >= 0; i--)
+ if ((op1_ptr[i] & op2_ptr[i]) != 0)
+ break;
+ res_size = i + 1;
+
+ /* Handle allocation, now then we know exactly how much space is
+ needed for the result. */
+ if (res->_mp_alloc < res_size)
+ {
+ _mpz_realloc (res, res_size);
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+ }
+
+ /* Second loop computes the real result. */
+ for (i = res_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] & op2_ptr[i];
+
+ res->_mp_size = res_size;
+ return;
+ }
+ else /* op2_size < 0 */
+ {
+ /* Fall through to the code at the end of the function. */
+ }
+ }
+ else
+ {
+ if (op2_size < 0)
+ {
+ mp_ptr opx;
+ mp_limb_t cy;
+ mp_size_t res_alloc;
+
+ /* Both operands are negative, so will be the result.
+ -((-OP1) & (-OP2)) = -(~(OP1 - 1) & ~(OP2 - 1)) =
+ = ~(~(OP1 - 1) & ~(OP2 - 1)) + 1 =
+ = ((OP1 - 1) | (OP2 - 1)) + 1 */
+
+ /* It might seem as we could end up with an (invalid) result with
+ a leading zero-limb here when one of the operands is of the
+ type 1,,0,,..,,.0. But some analysis shows that we surely
+ would get carry into the zero-limb in this situation... */
+
+ op1_size = -op1_size;
+ op2_size = -op2_size;
+
+ res_alloc = 1 + MAX (op1_size, op2_size);
+
+ /* Work on copies: OP1-1 and OP2-1 in temporary space. */
+ opx = (mp_ptr) TMP_ALLOC (op1_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op1_ptr, op1_size, (mp_limb_t) 1);
+ op1_ptr = opx;
+
+ opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+
+ if (res->_mp_alloc < res_alloc)
+ {
+ _mpz_realloc (res, res_alloc);
+ res_ptr = res->_mp_d;
+ /* Don't re-read OP1_PTR and OP2_PTR. They point to
+ temporary space--never to the space RES->_mp_d used
+ to point to before reallocation. */
+ }
+
+ /* OR the common low part; copy the longer operand's high part. */
+ if (op1_size >= op2_size)
+ {
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size,
+ op1_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] | op2_ptr[i];
+ res_size = op1_size;
+ }
+ else
+ {
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size,
+ op2_size - op1_size);
+ for (i = op1_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] | op2_ptr[i];
+ res_size = op2_size;
+ }
+
+ /* The final "+ 1" of the identity above. */
+ cy = mpn_add_1 (res_ptr, res_ptr, res_size, (mp_limb_t) 1);
+ if (cy)
+ {
+ res_ptr[res_size] = cy;
+ res_size++;
+ }
+
+ res->_mp_size = -res_size;
+ TMP_FREE (marker);
+ return;
+ }
+ else
+ {
+ /* We should compute -OP1 & OP2. Swap OP1 and OP2 and fall
+ through to the code that handles OP1 & -OP2. */
+ MPZ_SRCPTR_SWAP (op1, op2);
+ MPN_SRCPTR_SWAP (op1_ptr,op1_size, op2_ptr,op2_size);
+ }
+
+ }
+
+ {
+#if ANDNEW
+ mp_size_t op2_lim;
+ mp_size_t count;
+
+ /* OP2 must be negated as with infinite precision.
+
+ Scan from the low end for a non-zero limb. The first non-zero
+ limb is simply negated (two's complement). Any subsequent
+ limbs are one's complemented. Of course, we don't need to
+ handle more limbs than there are limbs in the other, positive
+ operand as the result for those limbs is going to become zero
+ anyway. */
+
+ /* Scan for the least significant non-zero OP2 limb, and zero the
+ result meanwhile for those limb positions. (We will surely
+ find a non-zero limb, so we can write the loop with one
+ termination condition only.) */
+ for (i = 0; op2_ptr[i] == 0; i++)
+ res_ptr[i] = 0;
+ op2_lim = i;
+
+ op2_size = -op2_size;
+
+ if (op1_size <= op2_size)
+ {
+ /* The ones-extended OP2 is >= than the zero-extended OP1.
+ RES_SIZE <= OP1_SIZE. Find the exact size. */
+ for (i = op1_size - 1; i > op2_lim; i--)
+ if ((op1_ptr[i] & ~op2_ptr[i]) != 0)
+ break;
+ res_size = i + 1;
+ for (i = res_size - 1; i > op2_lim; i--)
+ res_ptr[i] = op1_ptr[i] & ~op2_ptr[i];
+ res_ptr[op2_lim] = op1_ptr[op2_lim] & -op2_ptr[op2_lim];
+ /* Yes, this *can* happen! */
+ MPN_NORMALIZE (res_ptr, res_size);
+ }
+ else
+ {
+ /* The ones-extended OP2 is < than the zero-extended OP1.
+ RES_SIZE == OP1_SIZE, since OP1 is normalized. */
+ res_size = op1_size;
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size, op1_size - op2_size);
+ for (i = op2_size - 1; i > op2_lim; i--)
+ res_ptr[i] = op1_ptr[i] & ~op2_ptr[i];
+ res_ptr[op2_lim] = op1_ptr[op2_lim] & -op2_ptr[op2_lim];
+ }
+
+ res->_mp_size = res_size;
+#else
+
+ /* OP1 is positive and zero-extended,
+ OP2 is negative and ones-extended.
+ The result will be positive.
+ OP1 & -OP2 = OP1 & ~(OP2 - 1). */
+
+ mp_ptr opx;
+
+ op2_size = -op2_size;
+ /* Compute OP2 - 1 into temporary space. */
+ opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+
+ if (op1_size > op2_size)
+ {
+ /* The result has the same size as OP1, since OP1 is normalized
+ and longer than the ones-extended OP2. */
+ res_size = op1_size;
+
+ /* Handle allocation, now then we know exactly how much space is
+ needed for the result. */
+ if (res->_mp_alloc < res_size)
+ {
+ _mpz_realloc (res, res_size);
+ res_ptr = res->_mp_d;
+ op1_ptr = op1->_mp_d;
+ /* Don't re-read OP2_PTR. It points to temporary space--never
+ to the space RES->_mp_d used to point to before reallocation. */
+ }
+
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size,
+ res_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] & ~op2_ptr[i];
+
+ res->_mp_size = res_size;
+ }
+ else
+ {
+ /* Find out the exact result size. Ignore the high limbs of OP2,
+ OP1 is zero-extended and would make the result zero. */
+ for (i = op1_size - 1; i >= 0; i--)
+ if ((op1_ptr[i] & ~op2_ptr[i]) != 0)
+ break;
+ res_size = i + 1;
+
+ /* Handle allocation, now then we know exactly how much space is
+ needed for the result. */
+ if (res->_mp_alloc < res_size)
+ {
+ _mpz_realloc (res, res_size);
+ res_ptr = res->_mp_d;
+ op1_ptr = op1->_mp_d;
+ /* Don't re-read OP2_PTR. It points to temporary space--never
+ to the space RES->_mp_d used to point to before reallocation. */
+ }
+
+ for (i = res_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] & ~op2_ptr[i];
+
+ res->_mp_size = res_size;
+ }
+#endif
+ }
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/array_init.c b/rts/gmp/mpz/array_init.c
new file mode 100644
index 0000000000..1c22046986
--- /dev/null
+++ b/rts/gmp/mpz/array_init.c
@@ -0,0 +1,48 @@
+/* mpz_array_init (array, array_size, size_per_elem) --
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Initialize the ARR_SIZE mpz_t elements of ARR, giving each room for
+ NBITS bits.  One single allocation backs all elements; each element's
+ _mp_d points into that shared buffer, so the elements must never be
+ individually freed or grown beyond the shared block. */
+void
+#if __STDC__
+mpz_array_init (mpz_ptr arr, mp_size_t arr_size, mp_size_t nbits)
+#else
+mpz_array_init (arr, arr_size, nbits)
+ mpz_ptr arr;
+ mp_size_t arr_size;
+ mp_size_t nbits;
+#endif
+{
+ register mp_ptr p;
+ register size_t i;
+ mp_size_t nlimbs;
+
+ /* Round the bit count up to whole limbs. */
+ nlimbs = (nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+ p = (mp_ptr) (*_mp_allocate_func) (arr_size * nlimbs * BYTES_PER_MP_LIMB);
+
+ for (i = 0; i < arr_size; i++)
+ {
+ arr[i]._mp_alloc = nlimbs + 1; /* Yes, lie a little... */
+ arr[i]._mp_size = 0;
+ arr[i]._mp_d = p + i * nlimbs;
+ }
+}
diff --git a/rts/gmp/mpz/bin_ui.c b/rts/gmp/mpz/bin_ui.c
new file mode 100644
index 0000000000..a7a6c98218
--- /dev/null
+++ b/rts/gmp/mpz/bin_ui.c
@@ -0,0 +1,141 @@
+/* mpz_bin_ui - compute n over k.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* This is a poor implementation. Look at bin_uiui.c for improvement ideas.
+ In fact consider calling mpz_bin_uiui() when the arguments fit, leaving
+ the code here only for big n.
+
+ The identity bin(n,k) = (-1)^k * bin(-n+k-1,k) can be found in Knuth vol
+ 1 section 1.2.6 part G. */
+
+
+/* Enhancement: use mpn_divexact_1 when it exists */
+#define DIVIDE() \
+ ASSERT (SIZ(r) > 0); \
+ ASSERT_NOCARRY (mpn_divrem_1 (PTR(r), (mp_size_t) 0, \
+ PTR(r), SIZ(r), kacc)); \
+ SIZ(r) -= (PTR(r)[SIZ(r)-1] == 0);
+
+/* R = binomial(N, K) for arbitrary-precision N and single-word K.
+ Negative N is reduced via bin(n,k) = (-1)^k * bin(-n+k-1,k).
+ Numerator and denominator factors are batched in single-limb
+ accumulators (nacc/kacc) and only folded into R via a bignum
+ multiply-and-exact-divide step when the denominator accumulator
+ would overflow a limb. */
+void
+#if __STDC__
+mpz_bin_ui (mpz_ptr r, mpz_srcptr n, unsigned long int k)
+#else
+mpz_bin_ui (r, n, k)
+ mpz_ptr r;
+ mpz_srcptr n;
+ unsigned long int k;
+#endif
+{
+ mpz_t ni;
+ mp_limb_t i;
+ mpz_t nacc;
+ mp_limb_t kacc;
+ mp_size_t negate;
+
+ if (mpz_sgn (n) < 0)
+ {
+ /* bin(n,k) = (-1)^k * bin(-n+k-1,k), and set ni = -n+k-1 - k = -n-1 */
+ mpz_init (ni);
+ mpz_neg (ni, n);
+ mpz_sub_ui (ni, ni, 1L);
+ negate = (k & 1); /* (-1)^k */
+ }
+ else
+ {
+ /* bin(n,k) == 0 if k>n
+ (no test for this under the n<0 case, since -n+k-1 >= k there) */
+ if (mpz_cmp_ui (n, k) < 0)
+ {
+ mpz_set_ui (r, 0L);
+ return;
+ }
+
+ /* set ni = n-k */
+ mpz_init (ni);
+ mpz_sub_ui (ni, n, k);
+ negate = 0;
+ }
+
+ /* Now wanting bin(ni+k,k), with ni positive, and "negate" is the sign (0
+ for positive, 1 for negative). */
+ mpz_set_ui (r, 1L);
+
+ /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. In this case it's
+ whether ni+k-k < k meaning ni<k, and if so change to denominator ni+k-k
+ = ni, and new ni of ni+k-ni = k. */
+ if (mpz_cmp_ui (ni, k) < 0)
+ {
+ unsigned long tmp;
+ tmp = k;
+ k = mpz_get_ui (ni);
+ mpz_set_ui (ni, tmp);
+ }
+
+ kacc = 1;
+ mpz_init_set_ui (nacc, 1);
+
+ for (i = 1; i <= k; i++)
+ {
+ mp_limb_t k1, k0;
+
+#if 0
+ mp_limb_t nacclow;
+ int c;
+
+ nacclow = PTR(nacc)[0];
+ for (c = 0; (((kacc | nacclow) & 1) == 0); c++)
+ {
+ kacc >>= 1;
+ nacclow >>= 1;
+ }
+ mpz_div_2exp (nacc, nacc, c);
+#endif
+
+ /* Multiply the next numerator factor (ni+i) into the bignum
+ accumulator and the next denominator factor (i) into kacc. */
+ mpz_add_ui (ni, ni, 1);
+ mpz_mul (nacc, nacc, ni);
+ umul_ppmm (k1, k0, kacc, i);
+ if (k1 != 0)
+ {
+ /* Accumulator overflow. Perform bignum step. */
+ mpz_mul (r, r, nacc);
+ mpz_set_ui (nacc, 1);
+ DIVIDE ();
+ kacc = i;
+ }
+ else
+ {
+ /* Save new products in accumulators to keep accumulating. */
+ kacc = k0;
+ }
+ }
+
+ mpz_mul (r, r, nacc);
+ DIVIDE ();
+ /* Conditionally negate R: (x ^ -1) + 1 == -x, (x ^ 0) + 0 == x. */
+ SIZ(r) = (SIZ(r) ^ -negate) + negate;
+
+ mpz_clear (nacc);
+ mpz_clear (ni);
+}
diff --git a/rts/gmp/mpz/bin_uiui.c b/rts/gmp/mpz/bin_uiui.c
new file mode 100644
index 0000000000..b37541ba54
--- /dev/null
+++ b/rts/gmp/mpz/bin_uiui.c
@@ -0,0 +1,120 @@
+/* mpz_bin_uiui - compute n over k.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Avoid reallocs by rounding up any new size */
+#define ROUNDUP_MASK 15
+
+/* Enhancement: use mpn_divexact_1 when it exists */
+#define MULDIV() \
+ MPZ_REALLOC (r, (SIZ(r)+1)|ROUNDUP_MASK); \
+ PTR(r)[SIZ(r)] = mpn_mul_1 (PTR(r), PTR(r), SIZ(r), nacc); \
+ ASSERT_NOCARRY (mpn_divrem_1 (PTR(r), (mp_size_t) 0, \
+ PTR(r), SIZ(r)+1, kacc)); \
+ SIZ(r) += (PTR(r)[SIZ(r)] != 0);
+
+/* R = binomial(N, K) for single-word N and K.  Builds the product of
+ successive numerator/denominator factors in single-limb accumulators
+ and performs a bignum multiply-and-exact-divide (MULDIV) only when
+ the numerator accumulator would overflow. */
+void
+#if __STDC__
+mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
+#else
+mpz_bin_uiui (r, n, k)
+ mpz_ptr r;
+ unsigned long int n;
+ unsigned long int k;
+#endif
+{
+ unsigned long int i, j;
+ mp_limb_t nacc, kacc;
+ unsigned long int cnt;
+
+ /* bin(n,k) = 0 if k>n. */
+ if (n < k)
+ {
+ mpz_set_ui (r, 0);
+ return;
+ }
+
+ /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */
+ k = MIN (k, n-k);
+
+ /* bin(n,0) = 1 */
+ if (k == 0)
+ {
+ mpz_set_ui (r, 1);
+ return;
+ }
+
+ /* Seed R with the first numerator factor, n-k+1. */
+ j = n - k + 1;
+ mpz_set_ui (r, j);
+
+ /* Initialize accumulators. */
+ nacc = 1;
+ kacc = 1;
+
+ cnt = 0;
+ for (i = 2; i <= k; i++)
+ {
+ mp_limb_t n1, n0, k1, k0;
+
+ j++;
+#if 0
+ /* Remove common multiples of 2. This will allow us to accumulate
+ more in nacc and kacc before we need a bignum step. It would make
+ sense to cancel factors of 3, 5, etc too, but this would be best
+ handled by sieving out factors. Alternatively, we could perform a
+ gcd of the accumulators just as they have overflown, and keep
+ accumulating until the gcd doesn't remove a significant factor. */
+ while (((nacc | kacc) & 1) == 0)
+ {
+ nacc >>= 1;
+ kacc >>= 1;
+ }
+#else
+ /* Branch-free single step of the factor-of-2 cancellation above. */
+ cnt = ((nacc | kacc) & 1) ^ 1;
+ nacc >>= cnt;
+ kacc >>= cnt;
+#endif
+ /* Accumulate next multiples. */
+ umul_ppmm (n1, n0, nacc, j);
+ umul_ppmm (k1, k0, kacc, i);
+ if (n1 != 0)
+ {
+ /* Accumulator overflow. Perform bignum step. */
+ MULDIV ();
+ nacc = j;
+ kacc = i;
+ }
+ else
+ {
+ /* Since j >= i always, kacc cannot overflow before nacc. */
+ if (k1 != 0) abort ();
+ /* Save new products in accumulators to keep accumulating. */
+ nacc = n0;
+ kacc = k0;
+ }
+ }
+
+ /* Take care of whatever is left in accumulators. */
+ MULDIV ();
+}
diff --git a/rts/gmp/mpz/cdiv_q.c b/rts/gmp/mpz/cdiv_q.c
new file mode 100644
index 0000000000..b15ba8aaa9
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_q.c
@@ -0,0 +1,51 @@
+/* mpz_cdiv_q -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* QUOT = ceiling(DIVIDEND / DIVISOR): truncating division, then bump
+ the quotient by one when the operands have the same sign and the
+ remainder is non-zero. */
+void
+#if __STDC__
+mpz_cdiv_q (mpz_ptr quot, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_cdiv_q (quot, dividend, divisor)
+ mpz_ptr quot;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ /* Sizes are captured up front: QUOT may alias an operand and
+ mpz_tdiv_qr below would clobber the originals. */
+ mp_size_t dividend_size = dividend->_mp_size;
+ mp_size_t divisor_size = divisor->_mp_size;
+ mpz_t rem;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* The remainder never needs more limbs than the divisor. */
+ MPZ_TMP_INIT (rem, ABS (divisor_size));
+
+ mpz_tdiv_qr (quot, rem, dividend, divisor);
+
+ /* Same signs (non-negative XOR) and a non-zero remainder: the true
+ quotient is positive and was truncated down, so round it up. */
+ if ((divisor_size ^ dividend_size) >= 0 && rem->_mp_size != 0)
+ mpz_add_ui (quot, quot, 1L);
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/cdiv_q_ui.c b/rts/gmp/mpz/cdiv_q_ui.c
new file mode 100644
index 0000000000..74f3a90b83
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_q_ui.c
@@ -0,0 +1,67 @@
+/* mpz_cdiv_q_ui -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator. In order to make it
+ always fit into the return type, the negative of the true remainder is
+ returned.
+
+Copyright (C) 1994, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* QUOT = ceiling(DIVIDEND / DIVISOR) for a single-word divisor.
+ Returns the negated remainder as an unsigned word (see the file
+ header comment above). */
+unsigned long int
+#if __STDC__
+mpz_cdiv_q_ui (mpz_ptr quot, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_cdiv_q_ui (quot, dividend, divisor)
+ mpz_ptr quot;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size,
+ (mp_limb_t) divisor);
+
+ /* Non-negative dividend with non-zero remainder: the quotient was
+ truncated down, so round it up and return the complement of the
+ remainder. */
+ if (remainder_limb != 0 && dividend_size >= 0)
+ {
+ mpn_incr_u (quot_ptr, (mp_limb_t) 1);
+ remainder_limb = divisor - remainder_limb;
+ }
+
+ /* The quotient can be one limb shorter than the dividend. */
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/cdiv_qr.c b/rts/gmp/mpz/cdiv_qr.c
new file mode 100644
index 0000000000..29c7c41a4e
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_qr.c
@@ -0,0 +1,64 @@
+/* mpz_cdiv_qr -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_cdiv_qr (mpz_ptr quot, mpz_ptr rem, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_cdiv_qr (quot, rem, dividend, divisor)
+     mpz_ptr quot;
+     mpz_ptr rem;
+     mpz_srcptr dividend;
+     mpz_srcptr divisor;
+#endif
+{
+  mp_size_t divisor_size = divisor->_mp_size;
+  mp_size_t xsize;
+  mpz_t temp_divisor;		/* N.B.: lives until function returns! */
+  TMP_DECL (marker);
+
+  TMP_MARK (marker);
+
+  /* We need the original value of the divisor after the quotient and
+     remainder have been preliminary calculated.  We have to copy it to
+     temporary space if it's the same variable as either QUOT or REM.  */
+  if (quot == divisor || rem == divisor)
+    {
+      MPZ_TMP_INIT (temp_divisor, ABS (divisor_size));
+      mpz_set (temp_divisor, divisor);
+      divisor = temp_divisor;
+    }
+
+  xsize = dividend->_mp_size ^ divisor_size;
+  mpz_tdiv_qr (quot, rem, dividend, divisor);
+
+  if (xsize >= 0 && rem->_mp_size != 0)
+    {
+      mpz_add_ui (quot, quot, 1L);
+      mpz_sub (rem, rem, divisor);
+    }
+
+  TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/cdiv_qr_ui.c b/rts/gmp/mpz/cdiv_qr_ui.c
new file mode 100644
index 0000000000..a7873c6e20
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_qr_ui.c
@@ -0,0 +1,71 @@
+/* mpz_cdiv_qr_ui -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator. In order to make it
+ always fit into the return type, the negative of the true remainder is
+ returned.
+
+Copyright (C) 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_cdiv_qr_ui (mpz_ptr quot, mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_cdiv_qr_ui (quot, rem, dividend, divisor)
+ mpz_ptr quot;
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size,
+ (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size >= 0)
+ {
+ mpn_incr_u (quot_ptr, (mp_limb_t) 1);
+ remainder_limb = divisor - remainder_limb;
+ }
+
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ rem->_mp_d[0] = remainder_limb;
+ rem->_mp_size = -(remainder_limb != 0);
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/cdiv_r.c b/rts/gmp/mpz/cdiv_r.c
new file mode 100644
index 0000000000..e96ce7e677
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_r.c
@@ -0,0 +1,59 @@
+/* mpz_cdiv_r -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_cdiv_r (mpz_ptr rem, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_cdiv_r (rem, dividend, divisor)
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ mp_size_t divisor_size = divisor->_mp_size;
+ mpz_t temp_divisor; /* N.B.: lives until function returns! */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* We need the original value of the divisor after the remainder has been
+ preliminary calculated. We have to copy it to temporary space if it's
+ the same variable as REM. */
+ if (rem == divisor)
+ {
+
+ MPZ_TMP_INIT (temp_divisor, ABS (divisor_size));
+ mpz_set (temp_divisor, divisor);
+ divisor = temp_divisor;
+ }
+
+ mpz_tdiv_r (rem, dividend, divisor);
+
+ if ((divisor_size ^ dividend->_mp_size) >= 0 && rem->_mp_size != 0)
+ mpz_sub (rem, rem, divisor);
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/cdiv_r_ui.c b/rts/gmp/mpz/cdiv_r_ui.c
new file mode 100644
index 0000000000..e17e2381c0
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_r_ui.c
@@ -0,0 +1,57 @@
+/* mpz_cdiv_r_ui -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator. In order to make it
+ always fit into the return type, the negative of the true remainder is
+ returned.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_cdiv_r_ui (mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_cdiv_r_ui (rem, dividend, divisor)
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size >= 0)
+ remainder_limb = divisor - remainder_limb;
+
+ rem->_mp_d[0] = remainder_limb;
+ rem->_mp_size = -(remainder_limb != 0);
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/cdiv_ui.c b/rts/gmp/mpz/cdiv_ui.c
new file mode 100644
index 0000000000..63547a78c0
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_ui.c
@@ -0,0 +1,50 @@
+/* mpz_cdiv_ui -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator. In order to make it
+ always fit into the return type, the negative of the true remainder is
+ returned.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_cdiv_ui (mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_cdiv_ui (dividend, divisor)
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size >= 0)
+ remainder_limb = divisor - remainder_limb;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/clear.c b/rts/gmp/mpz/clear.c
new file mode 100644
index 0000000000..5224553f9e
--- /dev/null
+++ b/rts/gmp/mpz/clear.c
@@ -0,0 +1,35 @@
+/* mpz_clear -- de-allocate the space occupied by the dynamic digit space of
+ an integer.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_clear (mpz_ptr m)
+#else
+mpz_clear (m)
+ mpz_ptr m;
+#endif
+{
+ (*_mp_free_func) (m->_mp_d, m->_mp_alloc * BYTES_PER_MP_LIMB);
+}
diff --git a/rts/gmp/mpz/clrbit.c b/rts/gmp/mpz/clrbit.c
new file mode 100644
index 0000000000..865d84902f
--- /dev/null
+++ b/rts/gmp/mpz/clrbit.c
@@ -0,0 +1,114 @@
+/* mpz_clrbit -- clear a specified bit.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_clrbit (mpz_ptr d, unsigned long int bit_index)
+#else
+mpz_clrbit (d, bit_index)
+ mpz_ptr d;
+ unsigned long int bit_index;
+#endif
+{
+ mp_size_t dsize = d->_mp_size;
+ mp_ptr dp = d->_mp_d;
+ mp_size_t limb_index;
+
+ limb_index = bit_index / BITS_PER_MP_LIMB;
+ if (dsize >= 0)
+ {
+ if (limb_index < dsize)
+ {
+ dp[limb_index] &= ~((mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB));
+ MPN_NORMALIZE (dp, dsize);
+ d->_mp_size = dsize;
+ }
+ else
+ ;
+ }
+ else
+ {
+ mp_size_t zero_bound;
+
+ /* Simulate two's complement arithmetic, i.e. simulate
+ 1. Set OP = ~(OP - 1) [with infinitely many leading ones].
+ 2. clear the bit.
+ 3. Set OP = ~OP + 1. */
+
+ dsize = -dsize;
+
+ /* No upper bound on this loop, we're sure there's a non-zero limb
+     sooner or later. */
+ for (zero_bound = 0; ; zero_bound++)
+ if (dp[zero_bound] != 0)
+ break;
+
+ if (limb_index > zero_bound)
+ {
+ if (limb_index < dsize)
+ dp[limb_index] |= (mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB);
+ else
+ {
+ /* Ugh. The bit should be cleared outside of the end of the
+ number. We have to increase the size of the number. */
+ if (d->_mp_alloc < limb_index + 1)
+ {
+ _mpz_realloc (d, limb_index + 1);
+ dp = d->_mp_d;
+ }
+ MPN_ZERO (dp + dsize, limb_index - dsize);
+ dp[limb_index] = (mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB);
+ d->_mp_size = -(limb_index + 1);
+ }
+ }
+ else if (limb_index == zero_bound)
+ {
+ dp[limb_index] = ((dp[limb_index] - 1)
+ | ((mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB))) + 1;
+ if (dp[limb_index] == 0)
+ {
+ mp_size_t i;
+ for (i = limb_index + 1; i < dsize; i++)
+ {
+ dp[i] += 1;
+ if (dp[i] != 0)
+ goto fin;
+ }
+ /* We got carry all way out beyond the end of D. Increase
+ its size (and allocation if necessary). */
+ dsize++;
+ if (d->_mp_alloc < dsize)
+ {
+ _mpz_realloc (d, dsize);
+ dp = d->_mp_d;
+ }
+ dp[i] = 1;
+ d->_mp_size = -dsize;
+ fin:;
+ }
+ }
+ else
+ ;
+ }
+}
diff --git a/rts/gmp/mpz/cmp.c b/rts/gmp/mpz/cmp.c
new file mode 100644
index 0000000000..60628348e5
--- /dev/null
+++ b/rts/gmp/mpz/cmp.c
@@ -0,0 +1,75 @@
+/* mpz_cmp(u,v) -- Compare U, V.  Return positive, zero, or negative
+ based on if U > V, U == V, or U < V.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifndef BERKELEY_MP
+int
+#if __STDC__
+mpz_cmp (mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_cmp (u, v)
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#else /* BERKELEY_MP */
+int
+#if __STDC__
+mcmp (mpz_srcptr u, mpz_srcptr v)
+#else
+mcmp (u, v)
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#endif /* BERKELEY_MP */
+{
+ mp_size_t usize = u->_mp_size;
+ mp_size_t vsize = v->_mp_size;
+ mp_size_t size;
+ mp_srcptr up, vp;
+ int cmp;
+
+ if (usize != vsize)
+ return usize - vsize;
+
+ if (usize == 0)
+ return 0;
+
+ size = ABS (usize);
+
+ up = u->_mp_d;
+ vp = v->_mp_d;
+
+ cmp = mpn_cmp (up, vp, size);
+
+ if (cmp == 0)
+ return 0;
+
+ if ((cmp < 0) == (usize < 0))
+ return 1;
+ else
+ return -1;
+}
diff --git a/rts/gmp/mpz/cmp_si.c b/rts/gmp/mpz/cmp_si.c
new file mode 100644
index 0000000000..0c2212fbe9
--- /dev/null
+++ b/rts/gmp/mpz/cmp_si.c
@@ -0,0 +1,64 @@
+/* mpz_cmp_si(u,v) -- Compare an integer U with a single-word int V.
+ Return positive, zero, or negative based on if U > V, U == V, or U < V.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+_mpz_cmp_si (mpz_srcptr u, signed long int v_digit)
+#else
+_mpz_cmp_si (u, v_digit)
+ mpz_srcptr u;
+ signed long int v_digit;
+#endif
+{
+ mp_size_t usize = u->_mp_size;
+ mp_size_t vsize;
+ mp_limb_t u_digit;
+
+ vsize = 0;
+ if (v_digit > 0)
+ vsize = 1;
+ else if (v_digit < 0)
+ {
+ vsize = -1;
+ v_digit = -v_digit;
+ }
+
+ if (usize != vsize)
+ return usize - vsize;
+
+ if (usize == 0)
+ return 0;
+
+ u_digit = u->_mp_d[0];
+
+ if (u_digit == (mp_limb_t) (unsigned long) v_digit)
+ return 0;
+
+ if (u_digit > (mp_limb_t) (unsigned long) v_digit)
+ return usize;
+ else
+ return -usize;
+}
diff --git a/rts/gmp/mpz/cmp_ui.c b/rts/gmp/mpz/cmp_ui.c
new file mode 100644
index 0000000000..fd84f301c1
--- /dev/null
+++ b/rts/gmp/mpz/cmp_ui.c
@@ -0,0 +1,53 @@
+/* mpz_cmp_ui.c -- Compare a mpz_t a with an mp_limb_t b. Return positive,
+ zero, or negative based on if a > b, a == b, or a < b.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+_mpz_cmp_ui (mpz_srcptr u, unsigned long int v_digit)
+#else
+_mpz_cmp_ui (u, v_digit)
+ mpz_srcptr u;
+ unsigned long int v_digit;
+#endif
+{
+ mp_size_t usize = u->_mp_size;
+
+ if (usize == 0)
+ return -(v_digit != 0);
+
+ if (usize == 1)
+ {
+ mp_limb_t u_digit;
+
+ u_digit = u->_mp_d[0];
+ if (u_digit > v_digit)
+ return 1;
+ if (u_digit < v_digit)
+ return -1;
+ return 0;
+ }
+
+ return (usize > 0) ? 1 : -1;
+}
diff --git a/rts/gmp/mpz/cmpabs.c b/rts/gmp/mpz/cmpabs.c
new file mode 100644
index 0000000000..037d7a9145
--- /dev/null
+++ b/rts/gmp/mpz/cmpabs.c
@@ -0,0 +1,57 @@
+/* mpz_cmpabs(u,v) -- Compare U, V.  Return positive, zero, or negative
+ based on if U > V, U == V, or U < V.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_cmpabs (mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_cmpabs (u, v)
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+{
+ mp_size_t usize = u->_mp_size;
+ mp_size_t vsize = v->_mp_size;
+ mp_size_t size;
+ mp_srcptr up, vp;
+ int cmp;
+
+ usize = ABS (usize);
+ vsize = ABS (vsize);
+
+ if (usize != vsize)
+ return usize - vsize;
+
+ if (usize == 0)
+ return 0;
+
+ up = u->_mp_d;
+ vp = v->_mp_d;
+
+ cmp = mpn_cmp (up, vp, usize);
+
+ return cmp;
+}
diff --git a/rts/gmp/mpz/cmpabs_ui.c b/rts/gmp/mpz/cmpabs_ui.c
new file mode 100644
index 0000000000..db816b5820
--- /dev/null
+++ b/rts/gmp/mpz/cmpabs_ui.c
@@ -0,0 +1,56 @@
+/* mpz_cmpabs_ui.c -- Compare a mpz_t a with an mp_limb_t b. Return positive,
+ zero, or negative based on if a > b, a == b, or a < b.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_cmpabs_ui (mpz_srcptr u, unsigned long int v_digit)
+#else
+mpz_cmpabs_ui (u, v_digit)
+ mpz_srcptr u;
+ unsigned long int v_digit;
+#endif
+{
+ mp_size_t usize = u->_mp_size;
+
+ if (usize == 0)
+ return -(v_digit != 0);
+
+ usize = ABS (usize);
+
+ if (usize == 1)
+ {
+ mp_limb_t u_digit;
+
+ u_digit = u->_mp_d[0];
+ if (u_digit > v_digit)
+ return 1;
+ if (u_digit < v_digit)
+ return -1;
+ return 0;
+ }
+
+ return 1;
+}
diff --git a/rts/gmp/mpz/com.c b/rts/gmp/mpz/com.c
new file mode 100644
index 0000000000..18d6427779
--- /dev/null
+++ b/rts/gmp/mpz/com.c
@@ -0,0 +1,93 @@
+/* mpz_com(mpz_ptr dst, mpz_ptr src) -- Assign the bit-complemented value of
+ SRC to DST.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_com (mpz_ptr dst, mpz_srcptr src)
+#else
+mpz_com (dst, src)
+ mpz_ptr dst;
+ mpz_srcptr src;
+#endif
+{
+ mp_size_t size = src->_mp_size;
+ mp_srcptr src_ptr;
+ mp_ptr dst_ptr;
+
+ if (size >= 0)
+ {
+ /* As with infinite precision: one's complement, two's complement.
+ But this can be simplified using the identity -x = ~x + 1.
+ So we're going to compute (~~x) + 1 = x + 1! */
+
+ if (dst->_mp_alloc < size + 1)
+ _mpz_realloc (dst, size + 1);
+
+ src_ptr = src->_mp_d;
+ dst_ptr = dst->_mp_d;
+
+ if (size == 0)
+ {
+ /* Special case, as mpn_add wants the first arg's size >= the
+ second arg's size. */
+ dst_ptr[0] = 1;
+ dst->_mp_size = -1;
+ return;
+ }
+
+ {
+ mp_limb_t cy;
+
+ cy = mpn_add_1 (dst_ptr, src_ptr, size, (mp_limb_t) 1);
+ if (cy)
+ {
+ dst_ptr[size] = cy;
+ size++;
+ }
+ }
+
+ /* Store a negative size, to indicate ones-extension. */
+ dst->_mp_size = -size;
+ }
+ else
+ {
+ /* As with infinite precision: two's complement, then one's complement.
+ But that can be simplified using the identity -x = ~(x - 1).
+ So we're going to compute ~~(x - 1) = x - 1! */
+ size = -size;
+
+ if (dst->_mp_alloc < size)
+ _mpz_realloc (dst, size);
+
+ src_ptr = src->_mp_d;
+ dst_ptr = dst->_mp_d;
+
+ mpn_sub_1 (dst_ptr, src_ptr, size, (mp_limb_t) 1);
+ size -= dst_ptr[size - 1] == 0;
+
+ /* Store a positive size, to indicate zero-extension. */
+ dst->_mp_size = size;
+ }
+}
diff --git a/rts/gmp/mpz/divexact.c b/rts/gmp/mpz/divexact.c
new file mode 100644
index 0000000000..c2970454fd
--- /dev/null
+++ b/rts/gmp/mpz/divexact.c
@@ -0,0 +1,125 @@
+/* mpz_divexact -- finds quotient when known that quot * den == num && den != 0.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
+
+ Funding for this work has been partially provided by Conselho Nacional
+ de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
+   301314194-2, and was done while I was a visiting researcher in the Instituto
+ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
+
+ References:
+ T. Jebelean, An algorithm for exact division, Journal of Symbolic
+ Computation, v. 15, 1993, pp. 169-180. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpz_divexact (mpz_ptr quot, mpz_srcptr num, mpz_srcptr den)
+#else
+mpz_divexact (quot, num, den)
+ mpz_ptr quot;
+ mpz_srcptr num;
+ mpz_srcptr den;
+#endif
+{
+ mp_ptr qp, tp;
+ mp_size_t qsize, tsize;
+ mp_srcptr np, dp;
+ mp_size_t nsize, dsize;
+ TMP_DECL (marker);
+
+ nsize = ABS (num->_mp_size);
+ dsize = ABS (den->_mp_size);
+
+ qsize = nsize - dsize + 1;
+ if (quot->_mp_alloc < qsize)
+ _mpz_realloc (quot, qsize);
+
+ np = num->_mp_d;
+ dp = den->_mp_d;
+ qp = quot->_mp_d;
+
+ if (nsize == 0)
+ {
+ if (dsize == 0)
+ DIVIDE_BY_ZERO;
+ quot->_mp_size = 0;
+ return;
+ }
+
+ if (dsize <= 1)
+ {
+ if (dsize == 1)
+ {
+ mpn_divmod_1 (qp, np, nsize, dp[0]);
+ qsize -= qp[qsize - 1] == 0;
+ quot->_mp_size = (num->_mp_size ^ den->_mp_size) >= 0 ? qsize : -qsize;
+ return;
+ }
+
+ /* Generate divide-by-zero error since dsize == 0. */
+ DIVIDE_BY_ZERO;
+ }
+
+ TMP_MARK (marker);
+
+  /* QUOT <-- NUM/2^r, T <-- DEN/2^r where r = number of twos in DEN.  */
+ while (dp[0] == 0)
+ np += 1, nsize -= 1, dp += 1, dsize -= 1;
+ tsize = MIN (qsize, dsize);
+ if ((dp[0] & 1) != 0)
+ {
+ if (quot == den) /* QUOT and DEN overlap. */
+ {
+ tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, dp, tsize);
+ }
+ else
+ tp = (mp_ptr) dp;
+ if (qp != np)
+ MPN_COPY_INCR (qp, np, qsize);
+ }
+ else
+ {
+ unsigned int r;
+ tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+ count_trailing_zeros (r, dp[0]);
+ mpn_rshift (tp, dp, tsize, r);
+ if (dsize > tsize)
+ tp[tsize - 1] |= dp[tsize] << (BITS_PER_MP_LIMB - r);
+ mpn_rshift (qp, np, qsize, r);
+ if (nsize > qsize)
+ qp[qsize - 1] |= np[qsize] << (BITS_PER_MP_LIMB - r);
+ }
+
+ /* Now QUOT <-- QUOT/T. */
+ mpn_bdivmod (qp, qp, qsize, tp, tsize, qsize * BITS_PER_MP_LIMB);
+ MPN_NORMALIZE (qp, qsize);
+
+ quot->_mp_size = (num->_mp_size ^ den->_mp_size) >= 0 ? qsize : -qsize;
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/dump.c b/rts/gmp/mpz/dump.c
new file mode 100644
index 0000000000..dc318ac8cf
--- /dev/null
+++ b/rts/gmp/mpz/dump.c
@@ -0,0 +1,44 @@
+/* mpz_dump - Dump an integer to stdout.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO
+ CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS
+ FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_dump (mpz_srcptr u)
+#else
+mpz_dump (u)
+ mpz_srcptr u;
+#endif
+{
+ char *str;
+
+ str = mpz_get_str (0, 10, u);
+ printf ("%s\n", str);
+ (*_mp_free_func) (str, 0);/* ??? broken alloc interface, pass what size ??? */
+}
diff --git a/rts/gmp/mpz/fac_ui.c b/rts/gmp/mpz/fac_ui.c
new file mode 100644
index 0000000000..85f40f271c
--- /dev/null
+++ b/rts/gmp/mpz/fac_ui.c
@@ -0,0 +1,157 @@
+/* mpz_fac_ui(result, n) -- Set RESULT to N!.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#ifdef DBG
+#include <stdio.h>
+#endif
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpz_fac_ui (mpz_ptr result, unsigned long int n)
+#else
+mpz_fac_ui (result, n)
+ mpz_ptr result;
+ unsigned long int n;
+#endif
+{
+#if SIMPLE_FAC
+
+ /* Be silly. Just multiply the numbers in ascending order. O(n**2). */
+
+ unsigned long int k;
+
+ mpz_set_ui (result, 1L);
+
+ for (k = 2; k <= n; k++)
+ mpz_mul_ui (result, result, k);
+#else
+
+ /* Be smarter. Multiply groups of numbers in ascending order until the
+     product doesn't fit in a limb.  Multiply these partial products in a
+     balanced binary tree fashion, to make the operands have as equal sizes
+ as possible. When the operands have about the same size, mpn_mul
+ becomes faster. */
+
+ unsigned long int p, k;
+ mp_limb_t p1, p0;
+
+ /* Stack of partial products, used to make the computation balanced
+ (i.e. make the sizes of the multiplication operands equal). The
+ topmost position of MP_STACK will contain a one-limb partial product,
+ the second topmost will contain a two-limb partial product, and so
+ on. MP_STACK[0] will contain a partial product with 2**t limbs.
+ To compute n! MP_STACK needs to be less than
+ log(n)**2/log(BITS_PER_MP_LIMB), so 30 is surely enough. */
+#define MP_STACK_SIZE 30
+ mpz_t mp_stack[MP_STACK_SIZE];
+
+ /* TOP is an index into MP_STACK, giving the topmost element.
+     TOP_LIMIT_SO_FAR is the largest value it has taken so far.  */
+ int top, top_limit_so_far;
+
+ /* Count of the total number of limbs put on MP_STACK so far. This
+     variable plays an essential role in making the computation balanced.
+ See below. */
+ unsigned int tree_cnt;
+
+ top = top_limit_so_far = -1;
+ tree_cnt = 0;
+ p = 1;
+ for (k = 2; k <= n; k++)
+ {
+ /* Multiply the partial product in P with K. */
+ umul_ppmm (p1, p0, (mp_limb_t) p, (mp_limb_t) k);
+
+ /* Did we get overflow into the high limb, i.e. is the partial
+ product now more than one limb? */
+ if (p1 != 0)
+ {
+ tree_cnt++;
+
+ if (tree_cnt % 2 == 0)
+ {
+ mp_size_t i;
+
+ /* TREE_CNT is even (i.e. we have generated an even number of
+ one-limb partial products), which means that we have a
+ single-limb product on the top of MP_STACK. */
+
+ mpz_mul_ui (mp_stack[top], mp_stack[top], p);
+
+ /* If TREE_CNT is divisable by 4, 8,..., we have two
+ similar-sized partial products with 2, 4,... limbs at
+ the topmost two positions of MP_STACK. Multiply them
+ to form a new partial product with 4, 8,... limbs. */
+ for (i = 4; (tree_cnt & (i - 1)) == 0; i <<= 1)
+ {
+ mpz_mul (mp_stack[top - 1],
+ mp_stack[top], mp_stack[top - 1]);
+ top--;
+ }
+ }
+ else
+ {
+ /* Put the single-limb partial product in P on the stack.
+ (The next time we get a single-limb product, we will
+ multiply the two together.) */
+ top++;
+ if (top > top_limit_so_far)
+ {
+ if (top > MP_STACK_SIZE)
+ abort();
+ /* The stack is now bigger than ever, initialize the top
+ element. */
+ mpz_init_set_ui (mp_stack[top], p);
+ top_limit_so_far++;
+ }
+ else
+ mpz_set_ui (mp_stack[top], p);
+ }
+
+ /* We ignored the last result from umul_ppmm. Put K in P as the
+ first component of the next single-limb partial product. */
+ p = k;
+ }
+ else
+ /* We didn't get overflow in umul_ppmm. Put p0 in P and try
+ with one more value of K. */
+ p = p0; /* bogus if long != mp_limb_t */
+ }
+
+ /* We have partial products in mp_stack[0..top], in descending order.
+ We also have a small partial product in p.
+ Their product is the final result. */
+ if (top < 0)
+ mpz_set_ui (result, p);
+ else
+ mpz_mul_ui (result, mp_stack[top--], p);
+ while (top >= 0)
+ mpz_mul (result, result, mp_stack[top--]);
+
+ /* Free the storage allocated for MP_STACK. */
+ for (top = top_limit_so_far; top >= 0; top--)
+ mpz_clear (mp_stack[top]);
+#endif
+}
diff --git a/rts/gmp/mpz/fdiv_q.c b/rts/gmp/mpz/fdiv_q.c
new file mode 100644
index 0000000000..9d75ca33d2
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_q.c
@@ -0,0 +1,51 @@
+/* mpz_fdiv_q -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_fdiv_q (mpz_ptr quot, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_fdiv_q (quot, dividend, divisor)
+ mpz_ptr quot;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ mp_size_t dividend_size = dividend->_mp_size;
+ mp_size_t divisor_size = divisor->_mp_size;
+ mpz_t rem;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ MPZ_TMP_INIT (rem, ABS (divisor_size));
+
+ mpz_tdiv_qr (quot, rem, dividend, divisor);
+
+ if ((divisor_size ^ dividend_size) < 0 && rem->_mp_size != 0)
+ mpz_sub_ui (quot, quot, 1L);
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/fdiv_q_2exp.c b/rts/gmp/mpz/fdiv_q_2exp.c
new file mode 100644
index 0000000000..8e02180ecc
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_q_2exp.c
@@ -0,0 +1,104 @@
+/* mpz_fdiv_q_2exp -- Divide an integer by 2**CNT. Round the quotient
+ towards -infinity.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998, 1999 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_fdiv_q_2exp (mpz_ptr w, mpz_srcptr u, unsigned long int cnt)
+#else
+mpz_fdiv_q_2exp (w, u, cnt)
+ mpz_ptr w;
+ mpz_srcptr u;
+ unsigned long int cnt;
+#endif
+{
+ mp_size_t usize = u->_mp_size;
+ mp_size_t wsize;
+ mp_size_t abs_usize = ABS (usize);
+ mp_size_t limb_cnt;
+ mp_ptr wp;
+ mp_limb_t round = 0;
+
+ limb_cnt = cnt / BITS_PER_MP_LIMB;
+ wsize = abs_usize - limb_cnt;
+ if (wsize <= 0)
+ {
+ wp = w->_mp_d;
+ wsize = 0;
+ /* Set ROUND since we know we skip some non-zero words in this case.
+ Well, if U is zero, we don't, but then this will be taken care of
+ below, since rounding only really takes place for negative U. */
+ round = 1;
+ wp[0] = 1;
+ w->_mp_size = -(usize < 0);
+ return;
+ }
+ else
+ {
+ mp_size_t i;
+ mp_ptr up;
+
+ /* Make sure there is enough space. We make an extra limb
+ here to account for possible rounding at the end. */
+ if (w->_mp_alloc < wsize + 1)
+ _mpz_realloc (w, wsize + 1);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+      /* Set ROUND if we are about to skip some non-zero limbs.  */
+ for (i = 0; i < limb_cnt && round == 0; i++)
+ round = up[i];
+
+ cnt %= BITS_PER_MP_LIMB;
+ if (cnt != 0)
+ {
+ round |= mpn_rshift (wp, up + limb_cnt, wsize, cnt);
+ wsize -= wp[wsize - 1] == 0;
+ }
+ else
+ {
+ MPN_COPY_INCR (wp, up + limb_cnt, wsize);
+ }
+ }
+
+ if (usize < 0 && round != 0)
+ {
+ mp_limb_t cy;
+ if (wsize != 0)
+ {
+ cy = mpn_add_1 (wp, wp, wsize, (mp_limb_t) 1);
+ wp[wsize] = cy;
+ wsize += cy;
+ }
+ else
+ {
+ /* We shifted something negative to zero. The result is -1. */
+ wp[0] = 1;
+ wsize = 1;
+ }
+ }
+ w->_mp_size = usize >= 0 ? wsize : -wsize;
+}
diff --git a/rts/gmp/mpz/fdiv_q_ui.c b/rts/gmp/mpz/fdiv_q_ui.c
new file mode 100644
index 0000000000..55d2498693
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_q_ui.c
@@ -0,0 +1,65 @@
+/* mpz_fdiv_q_ui -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_fdiv_q_ui (mpz_ptr quot, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_fdiv_q_ui (quot, dividend, divisor)
+ mpz_ptr quot;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size,
+ (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size < 0)
+ {
+ mpn_incr_u (quot_ptr, (mp_limb_t) 1);
+ remainder_limb = divisor - remainder_limb;
+ }
+
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/fdiv_qr.c b/rts/gmp/mpz/fdiv_qr.c
new file mode 100644
index 0000000000..06ce50607b
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_qr.c
@@ -0,0 +1,64 @@
+/* mpz_fdiv_qr -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_fdiv_qr (mpz_ptr quot, mpz_ptr rem, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_fdiv_qr (quot, rem, dividend, divisor)
+ mpz_ptr quot;
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ mp_size_t divisor_size = divisor->_mp_size;
+ mp_size_t xsize;
+ mpz_t temp_divisor; /* N.B.: lives until function returns! */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* We need the original value of the divisor after the quotient and
+     remainder have been preliminarily calculated.  We have to copy it to
+ temporary space if it's the same variable as either QUOT or REM. */
+ if (quot == divisor || rem == divisor)
+ {
+ MPZ_TMP_INIT (temp_divisor, ABS (divisor_size));
+ mpz_set (temp_divisor, divisor);
+ divisor = temp_divisor;
+ }
+
+ xsize = dividend->_mp_size ^ divisor_size;;
+ mpz_tdiv_qr (quot, rem, dividend, divisor);
+
+ if (xsize < 0 && rem->_mp_size != 0)
+ {
+ mpz_sub_ui (quot, quot, 1L);
+ mpz_add (rem, rem, divisor);
+ }
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/fdiv_qr_ui.c b/rts/gmp/mpz/fdiv_qr_ui.c
new file mode 100644
index 0000000000..600c0dacfc
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_qr_ui.c
@@ -0,0 +1,69 @@
+/* mpz_fdiv_qr_ui -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_fdiv_qr_ui (mpz_ptr quot, mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_fdiv_qr_ui (quot, rem, dividend, divisor)
+ mpz_ptr quot;
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size,
+ (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size < 0)
+ {
+ mpn_incr_u (quot_ptr, (mp_limb_t) 1);
+ remainder_limb = divisor - remainder_limb;
+ }
+
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ rem->_mp_d[0] = remainder_limb;
+ rem->_mp_size = remainder_limb != 0;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/fdiv_r.c b/rts/gmp/mpz/fdiv_r.c
new file mode 100644
index 0000000000..a3652838d2
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_r.c
@@ -0,0 +1,58 @@
+/* mpz_fdiv_r -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_fdiv_r (mpz_ptr rem, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_fdiv_r (rem, dividend, divisor)
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ mp_size_t divisor_size = divisor->_mp_size;
+ mpz_t temp_divisor; /* N.B.: lives until function returns! */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* We need the original value of the divisor after the remainder has been
+     preliminarily calculated.  We have to copy it to temporary space if it's
+ the same variable as REM. */
+ if (rem == divisor)
+ {
+ MPZ_TMP_INIT (temp_divisor, ABS (divisor_size));
+ mpz_set (temp_divisor, divisor);
+ divisor = temp_divisor;
+ }
+
+ mpz_tdiv_r (rem, dividend, divisor);
+
+ if ((divisor_size ^ dividend->_mp_size) < 0 && rem->_mp_size != 0)
+ mpz_add (rem, rem, divisor);
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/fdiv_r_2exp.c b/rts/gmp/mpz/fdiv_r_2exp.c
new file mode 100644
index 0000000000..081ce19203
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_r_2exp.c
@@ -0,0 +1,156 @@
+/* mpz_fdiv_r_2exp -- Divide an integer by 2**CNT and produce a remainder.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_fdiv_r_2exp (mpz_ptr res, mpz_srcptr in, unsigned long int cnt)
+#else
+mpz_fdiv_r_2exp (res, in, cnt)
+ mpz_ptr res;
+ mpz_srcptr in;
+ unsigned long int cnt;
+#endif
+{
+ mp_size_t in_size = ABS (in->_mp_size);
+ mp_size_t res_size;
+ mp_size_t limb_cnt = cnt / BITS_PER_MP_LIMB;
+ mp_srcptr in_ptr = in->_mp_d;
+
+ if (in_size > limb_cnt)
+ {
+ /* The input operand is (probably) greater than 2**CNT. */
+ mp_limb_t x;
+
+ x = in_ptr[limb_cnt] & (((mp_limb_t) 1 << cnt % BITS_PER_MP_LIMB) - 1);
+ if (x != 0)
+ {
+ res_size = limb_cnt + 1;
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ res->_mp_d[limb_cnt] = x;
+ }
+ else
+ {
+ res_size = limb_cnt;
+ MPN_NORMALIZE (in_ptr, res_size);
+
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ limb_cnt = res_size;
+ }
+ }
+ else
+ {
+ /* The input operand is smaller than 2**CNT. We perform a no-op,
+ apart from that we might need to copy IN to RES, and may need
+ to round the result. */
+ res_size = in_size;
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ limb_cnt = res_size;
+ }
+
+ if (res != in)
+ MPN_COPY (res->_mp_d, in->_mp_d, limb_cnt);
+ in_size = in->_mp_size;
+ res->_mp_size = res_size;
+ if (in_size < 0 && res_size != 0)
+ {
+ /* Result should be 2^CNT - RES */
+ mpz_t tmp;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ MPZ_TMP_INIT (tmp, cnt/BITS_PER_MP_LIMB + 2);
+ mpz_set_ui (tmp, 1L);
+ mpz_mul_2exp (tmp, tmp, cnt);
+ mpz_sub (res, tmp, res);
+ TMP_FREE (marker);
+ }
+}
+
+/* This is an alternative ending of the above function using just low-level
+ functions. Tested, but perhaps excessive? */
+#if 0
+ if (in->_mp_size < 0 && res_size != 0)
+ {
+ /* Result should be 2^CNT - RES */
+
+ mp_ptr rp;
+
+ limb_cnt = cnt / BITS_PER_MP_LIMB;
+
+ if (res->_mp_alloc <= limb_cnt)
+ _mpz_realloc (res, limb_cnt + 1);
+ rp = PTR(res);
+ if (res_size > limb_cnt)
+ {
+ mpn_nz_neg (rp, rp, res_size);
+ rp[limb_cnt] &= ~(~(mp_limb_t) 0 << cnt % BITS_PER_MP_LIMB);
+ MPN_NORMALIZE_NOT_ZERO (rp, res_size);
+ }
+ else
+ {
+ mp_size_t i;
+ mpn_nz_neg (rp, rp, res_size);
+ for (i = res_size; i < limb_cnt; i++)
+ rp[i] = ~ (mp_limb_t) 0;
+ res_size = limb_cnt;
+ if (cnt % BITS_PER_MP_LIMB != 0)
+ {
+ rp[res_size] = ((mp_limb_t) 1 << (cnt % BITS_PER_MP_LIMB)) - 1;
+ res_size++;
+ }
+ else
+ MPN_NORMALIZE_NOT_ZERO (rp, res_size);
+ }
+ }
+ SIZ(res) = res_size;
+}
+
+static void
+mpn_nz_neg (rp, sp, n)
+ mp_ptr rp, sp;
+ mp_size_t n;
+{
+ mp_size_t i;
+ mp_limb_t x;
+
+ x = sp[0];
+ rp[0] = -x;
+ for (i = 1; x == 0; i++)
+ {
+ x = sp[i];
+ rp[i] = -x;
+ }
+
+ for (; i < n; i++)
+ {
+ rp[i] = ~sp[i];
+ }
+}
+#endif
diff --git a/rts/gmp/mpz/fdiv_r_ui.c b/rts/gmp/mpz/fdiv_r_ui.c
new file mode 100644
index 0000000000..dd5c743d27
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_r_ui.c
@@ -0,0 +1,55 @@
+/* mpz_fdiv_r_ui -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_fdiv_r_ui (mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_fdiv_r_ui (rem, dividend, divisor)
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size < 0)
+ remainder_limb = divisor - remainder_limb;
+
+ rem->_mp_d[0] = remainder_limb;
+ rem->_mp_size = remainder_limb != 0;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/fdiv_ui.c b/rts/gmp/mpz/fdiv_ui.c
new file mode 100644
index 0000000000..f937b5f6d0
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_ui.c
@@ -0,0 +1,48 @@
+/* mpz_fdiv_ui -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_fdiv_ui (mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_fdiv_ui (dividend, divisor)
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size < 0)
+ remainder_limb = divisor - remainder_limb;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/fib_ui.c b/rts/gmp/mpz/fib_ui.c
new file mode 100644
index 0000000000..4bebb80d94
--- /dev/null
+++ b/rts/gmp/mpz/fib_ui.c
@@ -0,0 +1,165 @@
+/* mpz_fib_ui(result, n) -- Set RESULT to the Nth Fibonacci number.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* This is fast, but could be made somewhat faster and neater.
+ The timing is somewhat fluctuating for even/odd sizes because
+ of the extra hair used to save variables and operations. Here
+ are a few things one might want to address:
+ 1. Avoid using 4 intermediate variables in mpz_fib_bigcase.
+ 2. Call mpn functions directly. Straightforward for these functions.
+ 3. Merge the three functions into one.
+
+Said by Kevin:
+ Consider using the Lucas numbers L[n] as an auxiliary sequence, making
+ it possible to do the "doubling" operation in mpz_fib_bigcase with two
+ squares rather than two multiplies. The formulas are a little more
+ complicated, something like the following (untested).
+
+ F[2n] = ((F[n]+L[n])^2 - 6*F[n]^2 - 4*(-1)^n) / 2
+ L[2n] = 5*F[n]^2 + 2*(-1)^n
+
+ F[2n+1] = (F[2n] + L[2n]) / 2
+ L[2n+1] = (5*F[2n] + L[2n]) / 2
+
+ The Lucas number that comes for free here could even be returned.
+
+ Maybe there's formulas with two squares using just F[n], but I don't
+ know of any.
+*/
+
+/* Determine the needed storage for Fib(n). */
+#define FIB_SIZE(n) (((mp_size_t) ((n)*0.695)) / BITS_PER_MP_LIMB + 2)
+
+static void mpz_fib_bigcase _PROTO ((mpz_t, mpz_t, unsigned long int));
+static void mpz_fib_basecase _PROTO ((mpz_t, mpz_t, unsigned long int));
+
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 60
+#endif
+
+void
+#if __STDC__
+mpz_fib_ui (mpz_t r, unsigned long int n)
+#else
+mpz_fib_ui (r, n)
+ mpz_t r;
+ unsigned long int n;
+#endif
+{
+ if (n == 0)
+ mpz_set_ui (r, 0);
+ else
+ {
+ mpz_t t1;
+ mpz_init (t1);
+ if (n < FIB_THRESHOLD)
+ mpz_fib_basecase (t1, r, n);
+ else
+ mpz_fib_bigcase (t1, r, n);
+ mpz_clear (t1);
+ }
+}
+
+static void
+#if __STDC__
+mpz_fib_basecase (mpz_t t1, mpz_t t2, unsigned long int n)
+#else
+mpz_fib_basecase (t1, t2, n)
+ mpz_t t1;
+ mpz_t t2;
+ unsigned long int n;
+#endif
+{
+ unsigned long int m, i;
+
+ mpz_set_ui (t1, 0);
+ mpz_set_ui (t2, 1);
+ m = n/2;
+ for (i = 0; i < m; i++)
+ {
+ mpz_add (t1, t1, t2);
+ mpz_add (t2, t1, t2);
+ }
+ if ((n & 1) == 0)
+ {
+ mpz_sub (t1, t2, t1);
+ mpz_sub (t2, t2, t1); /* trick: recover t1 value just overwritten */
+ }
+}
+
+static void
+#if __STDC__
+mpz_fib_bigcase (mpz_t t1, mpz_t t2, unsigned long int n)
+#else
+mpz_fib_bigcase (t1, t2, n)
+ mpz_t t1;
+ mpz_t t2;
+ unsigned long int n;
+#endif
+{
+ unsigned long int n2;
+ int ni, i;
+ mpz_t x1, x2, u1, u2;
+
+ ni = 0;
+ for (n2 = n; n2 >= FIB_THRESHOLD; n2 /= 2)
+ ni++;
+
+ mpz_fib_basecase (t1, t2, n2);
+
+ mpz_init (x1);
+ mpz_init (x2);
+ mpz_init (u1);
+ mpz_init (u2);
+
+ for (i = ni - 1; i >= 0; i--)
+ {
+ mpz_mul_2exp (x1, t1, 1);
+ mpz_mul_2exp (x2, t2, 1);
+
+ mpz_add (x1, x1, t2);
+ mpz_sub (x2, x2, t1);
+
+ mpz_mul (u1, t2, x1);
+ mpz_mul (u2, t1, x2);
+
+ if (((n >> i) & 1) == 0)
+ {
+ mpz_sub (t1, u1, u2);
+ mpz_set (t2, u1);
+ }
+ else
+ {
+ mpz_set (t1, u1);
+ mpz_mul_2exp (t2, u1, 1);
+ mpz_sub (t2, t2, u2);
+ }
+ }
+
+ mpz_clear (x1);
+ mpz_clear (x2);
+ mpz_clear (u1);
+ mpz_clear (u2);
+}
diff --git a/rts/gmp/mpz/fits_sint_p.c b/rts/gmp/mpz/fits_sint_p.c
new file mode 100644
index 0000000000..82e32a24d5
--- /dev/null
+++ b/rts/gmp/mpz/fits_sint_p.c
@@ -0,0 +1,50 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_sint_p (mpz_srcptr src)
+#else
+mpz_fits_sint_p (src)
+ mpz_srcptr src;
+#endif
+{
+ mp_size_t size;
+ mp_limb_t mpl;
+
+ mpl = PTR(src)[0];
+ size = SIZ(src);
+ if (size > 0)
+ {
+ if (size > 1)
+ return 0;
+ return mpl < ~((~(unsigned int) 0) >> 1);
+ }
+ else
+ {
+ if (size < -1)
+ return 0;
+ return mpl <= ~((~(unsigned int) 0) >> 1);
+ }
+}
diff --git a/rts/gmp/mpz/fits_slong_p.c b/rts/gmp/mpz/fits_slong_p.c
new file mode 100644
index 0000000000..e0669b5aaa
--- /dev/null
+++ b/rts/gmp/mpz/fits_slong_p.c
@@ -0,0 +1,50 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_slong_p (mpz_srcptr src)
+#else
+mpz_fits_slong_p (src)
+     mpz_srcptr src;
+#endif
+{
+  /* Return non-zero iff SRC fits a C "long int".  */
+  mp_size_t size = SIZ (src);
+  mp_limb_t mpl = PTR (src)[0];
+
+  /* Anything needing more than one limb can never fit.  */
+  if (size > 1 || size < -1)
+    return 0;
+
+  /* Positive values must stay strictly below 2^(bits-1); zero and
+     negative values may have magnitude up to 2^(bits-1).  */
+  return (size > 0
+          ? mpl <  ~((~(unsigned long int) 0) >> 1)
+          : mpl <= ~((~(unsigned long int) 0) >> 1));
+}
diff --git a/rts/gmp/mpz/fits_sshort_p.c b/rts/gmp/mpz/fits_sshort_p.c
new file mode 100644
index 0000000000..5b8e31afae
--- /dev/null
+++ b/rts/gmp/mpz/fits_sshort_p.c
@@ -0,0 +1,50 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_sshort_p (mpz_srcptr src)
+#else
+mpz_fits_sshort_p (src)
+     mpz_srcptr src;
+#endif
+{
+  /* Return non-zero iff SRC fits a C "short int".  */
+  mp_size_t size = SIZ (src);
+  mp_limb_t mpl = PTR (src)[0];
+
+  /* Anything needing more than one limb can never fit.  */
+  if (size > 1 || size < -1)
+    return 0;
+
+  /* Positive values may reach SHRT_MAX; zero and negative values may
+     have magnitude up to SHRT_MAX + 1.  */
+  return (size > 0
+          ? mpl <= (((unsigned short int) ~(unsigned int) 0) >> 1)
+          : mpl <= (((unsigned short int) ~(unsigned int) 0) >> 1) + 1);
+}
diff --git a/rts/gmp/mpz/fits_uint_p.c b/rts/gmp/mpz/fits_uint_p.c
new file mode 100644
index 0000000000..72f62fa723
--- /dev/null
+++ b/rts/gmp/mpz/fits_uint_p.c
@@ -0,0 +1,41 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_uint_p (mpz_srcptr src)
+#else
+mpz_fits_uint_p (src)
+     mpz_srcptr src;
+#endif
+{
+  /* Return non-zero iff SRC fits a C "unsigned int": it must be
+     non-negative, at most one limb, and no bigger than UINT_MAX.  */
+  mp_size_t size = SIZ (src);
+
+  if (size != 0 && size != 1)
+    return 0;
+  return PTR (src)[0] <= (~(unsigned int) 0);
+}
diff --git a/rts/gmp/mpz/fits_ulong_p.c b/rts/gmp/mpz/fits_ulong_p.c
new file mode 100644
index 0000000000..92eb42e86e
--- /dev/null
+++ b/rts/gmp/mpz/fits_ulong_p.c
@@ -0,0 +1,41 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_ulong_p (mpz_srcptr src)
+#else
+mpz_fits_ulong_p (src)
+     mpz_srcptr src;
+#endif
+{
+  /* Return non-zero iff SRC fits a C "unsigned long int": it must be
+     non-negative, at most one limb, and no bigger than ULONG_MAX.  */
+  mp_size_t size = SIZ (src);
+
+  if (size != 0 && size != 1)
+    return 0;
+  return PTR (src)[0] <= (~(unsigned long int) 0);
+}
diff --git a/rts/gmp/mpz/fits_ushort_p.c b/rts/gmp/mpz/fits_ushort_p.c
new file mode 100644
index 0000000000..bde0edae6e
--- /dev/null
+++ b/rts/gmp/mpz/fits_ushort_p.c
@@ -0,0 +1,41 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_ushort_p (mpz_srcptr src)
+#else
+mpz_fits_ushort_p (src)
+     mpz_srcptr src;
+#endif
+{
+  /* Return non-zero iff SRC fits a C "unsigned short int": it must be
+     non-negative, at most one limb, and no bigger than USHRT_MAX.  */
+  mp_size_t size = SIZ (src);
+
+  if (size != 0 && size != 1)
+    return 0;
+  return PTR (src)[0] <= ((unsigned short int) ~(unsigned int) 0);
+}
diff --git a/rts/gmp/mpz/gcd.c b/rts/gmp/mpz/gcd.c
new file mode 100644
index 0000000000..0d950dd609
--- /dev/null
+++ b/rts/gmp/mpz/gcd.c
@@ -0,0 +1,180 @@
+/* mpz/gcd.c: Calculate the greatest common divisor of two integers.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+
+/* Set G to gcd(|U|, |V|).  The stored result is always non-negative.
+   Under BERKELEY_MP the same body is compiled with the Berkeley MP
+   argument order, gcd(u, v, g).  */
+#ifndef BERKELEY_MP
+void
+#if __STDC__
+mpz_gcd (mpz_ptr g, mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_gcd (g, u, v)
+ mpz_ptr g;
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+gcd (mpz_srcptr u, mpz_srcptr v, mpz_ptr g)
+#else
+gcd (u, v, g)
+ mpz_ptr g;
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#endif /* BERKELEY_MP */
+
+{
+ unsigned long int g_zero_bits, u_zero_bits, v_zero_bits;
+ mp_size_t g_zero_limbs, u_zero_limbs, v_zero_limbs;
+ mp_ptr tp;
+ mp_ptr up = u->_mp_d;
+ mp_size_t usize = ABS (u->_mp_size);
+ mp_ptr vp = v->_mp_d;
+ mp_size_t vsize = ABS (v->_mp_size);
+ mp_size_t gsize;
+ TMP_DECL (marker);
+
+ /* GCD(0, V) == V. */
+ if (usize == 0)
+ {
+ g->_mp_size = vsize;
+ if (g == v)
+ return;
+ if (g->_mp_alloc < vsize)
+ _mpz_realloc (g, vsize);
+ MPN_COPY (g->_mp_d, vp, vsize);
+ return;
+ }
+
+ /* GCD(U, 0) == U. */
+ if (vsize == 0)
+ {
+ g->_mp_size = usize;
+ if (g == u)
+ return;
+ if (g->_mp_alloc < usize)
+ _mpz_realloc (g, usize);
+ MPN_COPY (g->_mp_d, up, usize);
+ return;
+ }
+
+ /* A one-limb operand lets us use the fast single-limb code.  */
+ if (usize == 1)
+ {
+ g->_mp_size = 1;
+ g->_mp_d[0] = mpn_gcd_1 (vp, vsize, up[0]);
+ return;
+ }
+
+ if (vsize == 1)
+ {
+ g->_mp_size = 1;
+ g->_mp_d[0] = mpn_gcd_1 (up, usize, vp[0]);
+ return;
+ }
+
+ TMP_MARK (marker);
+
+ /* Eliminate low zero bits from U and V and move to temporary storage. */
+ while (*up == 0)
+ up++;
+ u_zero_limbs = up - u->_mp_d;
+ usize -= u_zero_limbs;
+ count_trailing_zeros (u_zero_bits, *up);
+ tp = up;
+ up = (mp_ptr) TMP_ALLOC (usize * BYTES_PER_MP_LIMB);
+ if (u_zero_bits != 0)
+ {
+ mpn_rshift (up, tp, usize, u_zero_bits);
+ /* The right shift may have emptied the top limb.  */
+ usize -= up[usize - 1] == 0;
+ }
+ else
+ MPN_COPY (up, tp, usize);
+
+ while (*vp == 0)
+ vp++;
+ v_zero_limbs = vp - v->_mp_d;
+ vsize -= v_zero_limbs;
+ count_trailing_zeros (v_zero_bits, *vp);
+ tp = vp;
+ vp = (mp_ptr) TMP_ALLOC (vsize * BYTES_PER_MP_LIMB);
+ if (v_zero_bits != 0)
+ {
+ mpn_rshift (vp, tp, vsize, v_zero_bits);
+ vsize -= vp[vsize - 1] == 0;
+ }
+ else
+ MPN_COPY (vp, tp, vsize);
+
+ /* The result must carry the smaller of the two stripped powers of two;
+ record it as a limb count plus a bit count for the final shift back.  */
+ if (u_zero_limbs > v_zero_limbs)
+ {
+ g_zero_limbs = v_zero_limbs;
+ g_zero_bits = v_zero_bits;
+ }
+ else if (u_zero_limbs < v_zero_limbs)
+ {
+ g_zero_limbs = u_zero_limbs;
+ g_zero_bits = u_zero_bits;
+ }
+ else /* Equal. */
+ {
+ g_zero_limbs = u_zero_limbs;
+ g_zero_bits = MIN (u_zero_bits, v_zero_bits);
+ }
+
+ /* Call mpn_gcd. The 2nd argument must not have more bits than the 1st. */
+ vsize = (usize < vsize || (usize == vsize && up[usize-1] < vp[vsize-1]))
+ ? mpn_gcd (vp, vp, vsize, up, usize)
+ : mpn_gcd (vp, up, usize, vp, vsize);
+
+ /* Here G <-- V << (g_zero_limbs*BITS_PER_MP_LIMB + g_zero_bits). */
+ gsize = vsize + g_zero_limbs;
+ if (g_zero_bits != 0)
+ {
+ mp_limb_t cy_limb;
+ /* Account for a possible extra limb produced by the left shift.  */
+ gsize += (vp[vsize - 1] >> (BITS_PER_MP_LIMB - g_zero_bits)) != 0;
+ if (g->_mp_alloc < gsize)
+ _mpz_realloc (g, gsize);
+ MPN_ZERO (g->_mp_d, g_zero_limbs);
+
+ tp = g->_mp_d + g_zero_limbs;
+ cy_limb = mpn_lshift (tp, vp, vsize, g_zero_bits);
+ if (cy_limb != 0)
+ tp[vsize] = cy_limb;
+ }
+ else
+ {
+ if (g->_mp_alloc < gsize)
+ _mpz_realloc (g, gsize);
+ MPN_ZERO (g->_mp_d, g_zero_limbs);
+ MPN_COPY (g->_mp_d + g_zero_limbs, vp, vsize);
+ }
+
+ /* gsize is positive, so the stored gcd is non-negative.  */
+ g->_mp_size = gsize;
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/gcd_ui.c b/rts/gmp/mpz/gcd_ui.c
new file mode 100644
index 0000000000..f3bec58829
--- /dev/null
+++ b/rts/gmp/mpz/gcd_ui.c
@@ -0,0 +1,65 @@
+/* mpz_gcd_ui -- Calculate the greatest common divisor of two integers.
+
+Copyright (C) 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* mpz_gcd_ui (w, u, v) -- Store gcd(|U|, V) in W (when W is non-NULL) and
+   return it as an unsigned long.  When the gcd does not fit in a limb
+   (only possible for gcd(big, 0)), 0 is returned.  */
+unsigned long int
+#if __STDC__
+mpz_gcd_ui (mpz_ptr w, mpz_srcptr u, unsigned long int v)
+#else
+mpz_gcd_ui (w, u, v)
+     mpz_ptr w;
+     mpz_srcptr u;
+     unsigned long int v;
+#endif
+{
+  mp_size_t size;
+  mp_limb_t res;
+
+  size = ABS (u->_mp_size);
+
+  if (size == 0)
+    /* gcd(0, V) == V.  */
+    res = v;
+  else if (v == 0)
+    {
+      /* gcd(U, 0) == |U|.  Only touch W if the caller supplied one; the
+	 original code wrote through W unconditionally here, crashing for
+	 W == NULL even though NULL is accepted everywhere else.  */
+      if (w != NULL)
+	{
+	  if (u != w)
+	    {
+	      if (w->_mp_alloc < size)
+		_mpz_realloc (w, size);
+
+	      MPN_COPY (w->_mp_d, u->_mp_d, size);
+	    }
+	  w->_mp_size = size;
+	}
+      /* We can't return any useful result for gcd(big,0). */
+      return size > 1 ? 0 : u->_mp_d[0];
+    }
+  else
+    res = mpn_gcd_1 (u->_mp_d, size, (mp_limb_t) v);
+
+  if (w != NULL)
+    {
+      w->_mp_d[0] = res;
+      /* res == 0 happens only for gcd(0,0); store a canonical zero
+	 (size 0) instead of a size-1 number with a zero high limb.  */
+      w->_mp_size = res != 0;
+    }
+  return res;
+}
diff --git a/rts/gmp/mpz/gcdext.c b/rts/gmp/mpz/gcdext.c
new file mode 100644
index 0000000000..3ba04c84ff
--- /dev/null
+++ b/rts/gmp/mpz/gcdext.c
@@ -0,0 +1,137 @@
+/* mpz_gcdext(g, s, t, a, b) -- Set G to gcd(a, b), and S and T such that
+ g = as + bt.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Set G to gcd(A, B) and S, T (either may be NULL) to cofactors with
+   G = A*S + B*T.  */
+void
+#if __STDC__
+mpz_gcdext (mpz_ptr g, mpz_ptr s, mpz_ptr t, mpz_srcptr a, mpz_srcptr b)
+#else
+mpz_gcdext (g, s, t, a, b)
+ mpz_ptr g;
+ mpz_ptr s;
+ mpz_ptr t;
+ mpz_srcptr a;
+ mpz_srcptr b;
+#endif
+{
+ mp_size_t asize, bsize, usize, vsize;
+ mp_srcptr ap, bp;
+ mp_ptr up, vp;
+ mp_size_t gsize, ssize, tmp_ssize;
+ mp_ptr gp, sp, tmp_gp, tmp_sp;
+ mpz_srcptr u, v;
+ mpz_ptr ss, tt;
+ __mpz_struct stmp, gtmp;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* mpn_gcdext requires that U >= V. Therefore, we often have to swap U and
+ V. This in turn leads to a lot of complications. The computed cofactor
+ will be the wrong one, so we have to fix that up at the end. */
+
+ asize = ABS (SIZ (a));
+ bsize = ABS (SIZ (b));
+ ap = PTR (a);
+ bp = PTR (b);
+ /* U gets the operand with the larger magnitude, V the smaller; SS/TT
+ are the cofactor destinations matched to that (possibly swapped) order. */
+ if (asize > bsize || (asize == bsize && mpn_cmp (ap, bp, asize) > 0))
+ {
+ usize = asize;
+ vsize = bsize;
+ up = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB);
+ vp = (mp_ptr) TMP_ALLOC ((vsize + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (up, ap, usize);
+ MPN_COPY (vp, bp, vsize);
+ u = a;
+ v = b;
+ ss = s;
+ tt = t;
+ }
+ else
+ {
+ usize = bsize;
+ vsize = asize;
+ up = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB);
+ vp = (mp_ptr) TMP_ALLOC ((vsize + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (up, bp, usize);
+ MPN_COPY (vp, ap, vsize);
+ u = b;
+ v = a;
+ ss = t;
+ tt = s;
+ }
+
+ tmp_gp = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB);
+ tmp_sp = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB);
+
+ /* gcd(U, 0) = U with cofactor 1; mpn_gcdext handles the rest. */
+ if (vsize == 0)
+ {
+ tmp_sp[0] = 1;
+ tmp_ssize = 1;
+ MPN_COPY (tmp_gp, up, usize);
+ gsize = usize;
+ }
+ else
+ gsize = mpn_gcdext (tmp_gp, tmp_sp, &tmp_ssize, up, usize, vp, vsize);
+ ssize = ABS (tmp_ssize);
+
+ /* Wrap the raw limb results in stack mpz structs so mpz arithmetic
+ can be used on them below. */
+ PTR (&gtmp) = tmp_gp;
+ SIZ (&gtmp) = gsize;
+
+ PTR (&stmp) = tmp_sp;
+ /* The cofactor's sign follows the sign of U (tmp_ssize carries the
+ raw sign from mpn_gcdext). */
+ SIZ (&stmp) = (tmp_ssize ^ SIZ (u)) >= 0 ? ssize : -ssize;
+
+ if (tt != NULL)
+ {
+ if (SIZ (v) == 0)
+ SIZ (tt) = 0;
+ else
+ {
+ /* Derive the second cofactor as T = (G - S*U) / V (exact). */
+ mpz_t x;
+ MPZ_TMP_INIT (x, ssize + usize + 1);
+ mpz_mul (x, &stmp, u);
+ mpz_sub (x, &gtmp, x);
+ mpz_tdiv_q (tt, x, v);
+ }
+ }
+
+ if (ss != NULL)
+ {
+ if (ALLOC (ss) < ssize)
+ _mpz_realloc (ss, ssize);
+ sp = PTR (ss);
+ MPN_COPY (sp, tmp_sp, ssize);
+ SIZ (ss) = SIZ (&stmp);
+ }
+
+ if (ALLOC (g) < gsize)
+ _mpz_realloc (g, gsize);
+ gp = PTR (g);
+ MPN_COPY (gp, tmp_gp, gsize);
+ SIZ (g) = gsize;
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/get_d.c b/rts/gmp/mpz/get_d.c
new file mode 100644
index 0000000000..6a7c5856bb
--- /dev/null
+++ b/rts/gmp/mpz/get_d.c
@@ -0,0 +1,128 @@
+/* double mpz_get_d (mpz_t src) -- Return the double approximation to SRC.
+
+Copyright (C) 1996, 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Return non-zero iff the N least significant limbs at P are all zero.  */
+static int
+#if __STDC__
+mpn_zero_p (mp_ptr p, mp_size_t n)
+#else
+mpn_zero_p (p, n)
+     mp_ptr p;
+     mp_size_t n;
+#endif
+{
+  while (n-- > 0)
+    if (p[n] != 0)
+      return 0;
+
+  return 1;
+}
+
+
+/* Return SRC converted to a double.  Compiled in one of two variants
+   depending on whether a limb holds 32 or 64 bits.  */
+double
+#if __STDC__
+mpz_get_d (mpz_srcptr src)
+#else
+mpz_get_d (src)
+ mpz_srcptr src;
+#endif
+{
+ double res;
+ mp_size_t size;
+ int negative;
+ mp_ptr qp;
+ mp_limb_t hz, lz;
+ int cnt;
+
+ size = SIZ(src);
+ if (size == 0)
+ return 0.0;
+
+ /* Work with the magnitude; reapply the sign at the end. */
+ negative = size < 0;
+ size = ABS (size);
+ qp = PTR(src);
+
+ if (size == 1)
+ {
+ res = qp[size - 1];
+ }
+ else if (size == 2)
+ {
+ res = MP_BASE_AS_DOUBLE * qp[size - 1] + qp[size - 2];
+ }
+ else
+ {
+ /* Three or more limbs: normalize the top bits into hz (and lz for
+ 32-bit limbs), then scale by the discarded bit count. */
+ count_leading_zeros (cnt, qp[size - 1]);
+
+#if BITS_PER_MP_LIMB == 32
+ if (cnt == 0)
+ {
+ hz = qp[size - 1];
+ lz = qp[size - 2];
+ }
+ else
+ {
+ hz = (qp[size - 1] << cnt) | (qp[size - 2] >> BITS_PER_MP_LIMB - cnt);
+ lz = (qp[size - 2] << cnt) | (qp[size - 3] >> BITS_PER_MP_LIMB - cnt);
+ }
+#if _GMP_IEEE_FLOATS
+ /* Take bits from less significant limbs, but only if they may affect
+ the result. */
+ if ((lz & 0x7ff) == 0x400)
+ {
+ if (cnt != 0)
+ lz += ((qp[size - 3] << cnt) != 0 || ! mpn_zero_p (qp, size - 3));
+ else
+ lz += (! mpn_zero_p (qp, size - 2));
+ }
+#endif
+ res = MP_BASE_AS_DOUBLE * hz + lz;
+ res = __gmp_scale2 (res, (size - 2) * BITS_PER_MP_LIMB - cnt);
+#endif
+#if BITS_PER_MP_LIMB == 64
+ if (cnt == 0)
+ hz = qp[size - 1];
+ else
+ hz = (qp[size - 1] << cnt) | (qp[size - 2] >> BITS_PER_MP_LIMB - cnt);
+#if _GMP_IEEE_FLOATS
+ /* Nudge a halfway bit pattern so later rounding sees the sticky bits
+ from the limbs being dropped. */
+ if ((hz & 0x7ff) == 0x400)
+ {
+ if (cnt != 0)
+ hz += ((qp[size - 2] << cnt) != 0 || ! mpn_zero_p (qp, size - 2));
+ else
+ hz += (! mpn_zero_p (qp, size - 1));
+ }
+#endif
+ res = hz;
+ res = __gmp_scale2 (res, (size - 1) * BITS_PER_MP_LIMB - cnt);
+#endif
+ }
+
+ return negative ? -res : res;
+}
diff --git a/rts/gmp/mpz/get_si.c b/rts/gmp/mpz/get_si.c
new file mode 100644
index 0000000000..8a5d0e4803
--- /dev/null
+++ b/rts/gmp/mpz/get_si.c
@@ -0,0 +1,43 @@
+/* mpz_get_si(integer) -- Return the least significant digit from INTEGER.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Return the value of OP truncated to a signed long (the low
+   BITS_PER_MP_LIMB - 1 magnitude bits, with OP's sign).  */
+signed long int
+#if __STDC__
+mpz_get_si (mpz_srcptr op)
+#else
+mpz_get_si (op)
+ mpz_srcptr op;
+#endif
+{
+ mp_size_t size = op->_mp_size;
+ mp_limb_t low_limb = op->_mp_d[0];
+
+ if (size > 0)
+ /* Positive: keep the low BITS_PER_MP_LIMB - 1 bits of the low limb. */
+ return low_limb % ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
+ else if (size < 0)
+ /* This convoluted expression is necessary to properly handle 0x80000000 */
+ return ~((low_limb - 1) % ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1)));
+ else
+ return 0;
+}
diff --git a/rts/gmp/mpz/get_str.c b/rts/gmp/mpz/get_str.c
new file mode 100644
index 0000000000..c7278afb52
--- /dev/null
+++ b/rts/gmp/mpz/get_str.c
@@ -0,0 +1,118 @@
+/* mpz_get_str (string, base, mp_src) -- Convert the multiple precision
+ number MP_SRC to a string STRING of base BASE. If STRING is NULL
+ allocate space for the result. In any case, return a pointer to the
+ result. If STRING is not NULL, the caller must ensure enough space is
+ available to store the result.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Convert X to a string in the given base (2..36; a negative base selects
+   upper-case digits).  RES_STR may be NULL, in which case the result is
+   allocated with _mp_allocate_func.  Returns a pointer to the string.  */
+char *
+#if __STDC__
+mpz_get_str (char *res_str, int base, mpz_srcptr x)
+#else
+mpz_get_str (res_str, base, x)
+ char *res_str;
+ int base;
+ mpz_srcptr x;
+#endif
+{
+ mp_ptr xp;
+ mp_size_t x_size = x->_mp_size;
+ unsigned char *str;
+ char *return_str;
+ size_t str_size;
+ char *num_to_text;
+ int i;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ /* Base 0 means decimal; a negative base selects the upper-case
+ digit alphabet. */
+ if (base >= 0)
+ {
+ if (base == 0)
+ base = 10;
+ num_to_text = "0123456789abcdefghijklmnopqrstuvwxyz";
+ }
+ else
+ {
+ base = -base;
+ num_to_text = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ }
+
+ /* We always allocate space for the string. If the caller passed a
+ NULL pointer for RES_STR, we allocate permanent space and return
+ a pointer to that to the caller. */
+ str_size = ((size_t) (ABS (x_size) * BITS_PER_MP_LIMB
+ * __mp_bases[base].chars_per_bit_exactly)) + 3;
+ if (res_str == 0)
+ {
+ /* We didn't get a string from the user. Allocate one (and return
+ a pointer to it). */
+ res_str = (char *) (*_mp_allocate_func) (str_size);
+ /* Make str, the variable used for raw result from mpn_get_str,
+ point to the same string, but just after a possible minus sign. */
+ str = (unsigned char *) res_str + 1;
+ }
+ else
+ {
+ /* Use TMP_ALLOC to get temporary space, since we need a few extra bytes
+ that we can't expect the caller to supply us with. */
+ str = (unsigned char *) TMP_ALLOC (str_size);
+ }
+
+ return_str = res_str;
+
+ if (x_size == 0)
+ {
+ res_str[0] = '0';
+ res_str[1] = 0;
+ TMP_FREE (marker);
+ return res_str;
+ }
+ /* Emit a leading minus sign and continue with the magnitude. */
+ if (x_size < 0)
+ {
+ *res_str++ = '-';
+ x_size = -x_size;
+ }
+
+ /* Move the number to convert into temporary space, since mpn_get_str
+ clobbers its argument + needs one extra high limb.... */
+ xp = (mp_ptr) TMP_ALLOC ((x_size + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (xp, x->_mp_d, x_size);
+
+ str_size = mpn_get_str (str, base, xp, x_size);
+
+ /* mpn_get_str might make some leading zeros. Skip them. */
+ while (*str == 0)
+ {
+ str_size--;
+ str++;
+ }
+
+ /* Translate result to printable chars and move result to RES_STR. */
+ for (i = 0; i < str_size; i++)
+ res_str[i] = num_to_text[str[i]];
+ res_str[str_size] = 0;
+
+ TMP_FREE (marker);
+ return return_str;
+}
diff --git a/rts/gmp/mpz/get_ui.c b/rts/gmp/mpz/get_ui.c
new file mode 100644
index 0000000000..a8ec9e01a4
--- /dev/null
+++ b/rts/gmp/mpz/get_ui.c
@@ -0,0 +1,37 @@
+/* mpz_get_ui(integer) -- Return the least significant digit from INTEGER.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Return the least significant limb of INTEGER's absolute value,
+   or 0 when INTEGER is zero.  */
+unsigned long int
+#if __STDC__
+mpz_get_ui (mpz_srcptr integer)
+#else
+mpz_get_ui (integer)
+     mpz_srcptr integer;
+#endif
+{
+  return integer->_mp_size != 0 ? integer->_mp_d[0] : 0;
+}
diff --git a/rts/gmp/mpz/getlimbn.c b/rts/gmp/mpz/getlimbn.c
new file mode 100644
index 0000000000..b772ed05c4
--- /dev/null
+++ b/rts/gmp/mpz/getlimbn.c
@@ -0,0 +1,38 @@
+/* mpz_getlimbn(integer,n) -- Return the N:th limb from INTEGER.
+
+Copyright (C) 1993, 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Return limb number N of INTEGER's absolute value; indices outside
+   the stored limbs (including negative N) read as zero.  */
+mp_limb_t
+#if __STDC__
+mpz_getlimbn (mpz_srcptr integer, mp_size_t n)
+#else
+mpz_getlimbn (integer, n)
+     mpz_srcptr integer;
+     mp_size_t n;
+#endif
+{
+  if (n >= 0 && n < ABS (integer->_mp_size))
+    return integer->_mp_d[n];
+
+  return 0;
+}
diff --git a/rts/gmp/mpz/hamdist.c b/rts/gmp/mpz/hamdist.c
new file mode 100644
index 0000000000..b039a653d2
--- /dev/null
+++ b/rts/gmp/mpz/hamdist.c
@@ -0,0 +1,62 @@
+/* mpz_hamdist(mpz_ptr op1, mpz_ptr op2) -- Compute the hamming distance
+ between OP1 and OP2. If one of the operands is negative, return ~0. (We
+ could make the function well-defined when both operands are negative, but
+ that would probably not be worth the trouble.)
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Return the Hamming distance between U and V, or ~0 when either
+   operand is negative.  */
+unsigned long int
+#if __STDC__
+mpz_hamdist (mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_hamdist (u, v)
+     mpz_srcptr u;
+     mpz_srcptr v;
+#endif
+{
+  mp_srcptr up, vp;
+  mp_size_t usize, vsize, common;
+  unsigned long int count;
+
+  usize = u->_mp_size;
+  vsize = v->_mp_size;
+
+  /* The distance is only defined here for non-negative operands.  */
+  if (usize < 0 || vsize < 0)
+    return ~ (unsigned long int) 0;
+
+  up = u->_mp_d;
+  vp = v->_mp_d;
+
+  /* Limbs of the longer operand above the shorter one are compared
+     against zero, i.e. their set bits are simply counted.  */
+  if (usize <= vsize)
+    {
+      count = mpn_popcount (vp + usize, vsize - usize);
+      common = usize;
+    }
+  else
+    {
+      count = mpn_popcount (up + vsize, usize - vsize);
+      common = vsize;
+    }
+
+  return count + mpn_hamdist (up, vp, common);
+}
diff --git a/rts/gmp/mpz/init.c b/rts/gmp/mpz/init.c
new file mode 100644
index 0000000000..2e8e4d2cbd
--- /dev/null
+++ b/rts/gmp/mpz/init.c
@@ -0,0 +1,36 @@
+/* mpz_init() -- Make a new multiple precision number with value 0.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Initialize X to the value 0, giving it one limb of storage.  */
+void
+#if __STDC__
+mpz_init (mpz_ptr x)
+#else
+mpz_init (x)
+     mpz_ptr x;
+#endif
+{
+  x->_mp_d = (mp_ptr) (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
+  x->_mp_alloc = 1;
+  x->_mp_size = 0;
+}
diff --git a/rts/gmp/mpz/inp_raw.c b/rts/gmp/mpz/inp_raw.c
new file mode 100644
index 0000000000..15e601229d
--- /dev/null
+++ b/rts/gmp/mpz/inp_raw.c
@@ -0,0 +1,101 @@
+/* mpz_inp_raw -- Input a mpz_t in raw, but endianess, and wordsize
+ independent format (as output by mpz_out_raw).
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+size_t
+#if __STDC__
+mpz_inp_raw (mpz_ptr x, FILE *stream)
+#else
+mpz_inp_raw (x, stream)
+ mpz_ptr x;
+ FILE *stream;
+#endif
+{
+ int i;
+ mp_size_t s;
+ mp_size_t xsize;
+ mp_ptr xp;
+ unsigned int c;
+ mp_limb_t x_limb;
+ mp_size_t in_bytesize;
+ int neg_flag;
+
+ if (stream == 0)
+ stream = stdin;
+
+ /* Read 4-byte size */
+ in_bytesize = 0;
+ for (i = 4 - 1; i >= 0; i--)
+ {
+ c = fgetc (stream);
+ in_bytesize = (in_bytesize << BITS_PER_CHAR) | c;
+ }
+
+ /* Size is stored as a 32 bit word; sign extend in_bytesize for non-32 bit
+ machines. */
+ if (sizeof (mp_size_t) > 4)
+ in_bytesize |= (-(in_bytesize < 0)) << 31;
+
+ neg_flag = in_bytesize < 0;
+ in_bytesize = ABS (in_bytesize);
+ xsize = (in_bytesize + BYTES_PER_MP_LIMB - 1) / BYTES_PER_MP_LIMB;
+
+ if (xsize == 0)
+ {
+ x->_mp_size = 0;
+ return 4; /* we've read 4 bytes */
+ }
+
+ if (x->_mp_alloc < xsize)
+ _mpz_realloc (x, xsize);
+ xp = x->_mp_d;
+
+ x_limb = 0;
+ for (i = (in_bytesize - 1) % BYTES_PER_MP_LIMB; i >= 0; i--)
+ {
+ c = fgetc (stream);
+ x_limb = (x_limb << BITS_PER_CHAR) | c;
+ }
+ xp[xsize - 1] = x_limb;
+
+ for (s = xsize - 2; s >= 0; s--)
+ {
+ x_limb = 0;
+ for (i = BYTES_PER_MP_LIMB - 1; i >= 0; i--)
+ {
+ c = fgetc (stream);
+ x_limb = (x_limb << BITS_PER_CHAR) | c;
+ }
+ xp[s] = x_limb;
+ }
+
+ if (c == EOF)
+ return 0; /* error */
+
+ MPN_NORMALIZE (xp, xsize);
+ x->_mp_size = neg_flag ? -xsize : xsize;
+ return in_bytesize + 4;
+}
diff --git a/rts/gmp/mpz/inp_str.c b/rts/gmp/mpz/inp_str.c
new file mode 100644
index 0000000000..7aa5e1fc30
--- /dev/null
+++ b/rts/gmp/mpz/inp_str.c
@@ -0,0 +1,167 @@
+/* mpz_inp_str(dest_integer, stream, base) -- Input a number in base
+ BASE from stdio stream STREAM and store the result in DEST_INTEGER.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+#include <ctype.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+static int
+#if __STDC__
+digit_value_in_base (int c, int base)
+#else
+digit_value_in_base (c, base)
+ int c;
+ int base;
+#endif
+{
+ int digit;
+
+ if (isdigit (c))
+ digit = c - '0';
+ else if (islower (c))
+ digit = c - 'a' + 10;
+ else if (isupper (c))
+ digit = c - 'A' + 10;
+ else
+ return -1;
+
+ if (digit < base)
+ return digit;
+ return -1;
+}
+
+size_t
+#if __STDC__
+mpz_inp_str (mpz_ptr x, FILE *stream, int base)
+#else
+mpz_inp_str (x, stream, base)
+ mpz_ptr x;
+ FILE *stream;
+ int base;
+#endif
+{
+ char *str;
+ size_t alloc_size, str_size;
+ int c;
+ int negative;
+ mp_size_t xsize;
+ size_t nread;
+
+ if (stream == 0)
+ stream = stdin;
+
+ nread = 0;
+
+ /* Skip whitespace. */
+ do
+ {
+ c = getc (stream);
+ nread++;
+ }
+ while (isspace (c));
+
+ negative = 0;
+ if (c == '-')
+ {
+ negative = 1;
+ c = getc (stream);
+ nread++;
+ }
+
+ if (digit_value_in_base (c, base == 0 ? 10 : base) < 0)
+ return 0; /* error if no digits */
+
+ /* If BASE is 0, try to find out the base by looking at the initial
+ characters. */
+ if (base == 0)
+ {
+ base = 10;
+ if (c == '0')
+ {
+ base = 8;
+ c = getc (stream);
+ nread++;
+ if (c == 'x' || c == 'X')
+ {
+ base = 16;
+ c = getc (stream);
+ nread++;
+ }
+ else if (c == 'b' || c == 'B')
+ {
+ base = 2;
+ c = getc (stream);
+ nread++;
+ }
+ }
+ }
+
+ /* Skip leading zeros. */
+ while (c == '0')
+ {
+ c = getc (stream);
+ nread++;
+ }
+
+ alloc_size = 100;
+ str = (char *) (*_mp_allocate_func) (alloc_size);
+ str_size = 0;
+
+ for (;;)
+ {
+ int dig;
+ if (str_size >= alloc_size)
+ {
+ size_t old_alloc_size = alloc_size;
+ alloc_size = alloc_size * 3 / 2;
+ str = (char *) (*_mp_reallocate_func) (str, old_alloc_size, alloc_size);
+ }
+ dig = digit_value_in_base (c, base);
+ if (dig < 0)
+ break;
+ str[str_size++] = dig;
+ c = getc (stream);
+ }
+
+ ungetc (c, stream);
+
+ /* Make sure the string is not empty, mpn_set_str would fail. */
+ if (str_size == 0)
+ {
+ x->_mp_size = 0;
+ (*_mp_free_func) (str, alloc_size);
+ return nread;
+ }
+
+ xsize = (((mp_size_t) (str_size / __mp_bases[base].chars_per_bit_exactly))
+ / BITS_PER_MP_LIMB + 2);
+ if (x->_mp_alloc < xsize)
+ _mpz_realloc (x, xsize);
+
+ /* Convert the byte array in base BASE to our bignum format. */
+ xsize = mpn_set_str (x->_mp_d, (unsigned char *) str, str_size, base);
+ x->_mp_size = negative ? -xsize : xsize;
+
+ (*_mp_free_func) (str, alloc_size);
+ return str_size + nread;
+}
diff --git a/rts/gmp/mpz/invert.c b/rts/gmp/mpz/invert.c
new file mode 100644
index 0000000000..749a0969fc
--- /dev/null
+++ b/rts/gmp/mpz/invert.c
@@ -0,0 +1,77 @@
+/* mpz_invert (inv, x, n). Find multiplicative inverse of X in Z(N).
+ If X has an inverse, return non-zero and store inverse in INVERSE,
+ otherwise, return 0 and put garbage in INVERSE.
+
+Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_invert (mpz_ptr inverse, mpz_srcptr x, mpz_srcptr n)
+#else
+mpz_invert (inverse, x, n)
+ mpz_ptr inverse;
+ mpz_srcptr x, n;
+#endif
+{
+ mpz_t gcd, tmp;
+ mp_size_t xsize, nsize, size;
+ TMP_DECL (marker);
+
+ xsize = SIZ (x);
+ nsize = SIZ (n);
+ xsize = ABS (xsize);
+ nsize = ABS (nsize);
+ size = MAX (xsize, nsize) + 1;
+
+ /* No inverse exists if the leftside operand is 0. Likewise, no
+ inverse exists if the mod operand is 1. */
+ if (xsize == 0 || (nsize == 1 && (PTR (n))[0] == 1))
+ return 0;
+
+ TMP_MARK (marker);
+
+ MPZ_TMP_INIT (gcd, size);
+ MPZ_TMP_INIT (tmp, size);
+ mpz_gcdext (gcd, tmp, (mpz_ptr) 0, x, n);
+
+ /* If no inverse existed, return with an indication of that. */
+ if (gcd->_mp_size != 1 || (gcd->_mp_d)[0] != 1)
+ {
+ TMP_FREE (marker);
+ return 0;
+ }
+
+ /* Make sure we return a positive inverse. */
+ if (SIZ (tmp) < 0)
+ {
+ if (SIZ (n) < 0)
+ mpz_sub (inverse, tmp, n);
+ else
+ mpz_add (inverse, tmp, n);
+ }
+ else
+ mpz_set (inverse, tmp);
+
+ TMP_FREE (marker);
+ return 1;
+}
diff --git a/rts/gmp/mpz/ior.c b/rts/gmp/mpz/ior.c
new file mode 100644
index 0000000000..0bb5a806dc
--- /dev/null
+++ b/rts/gmp/mpz/ior.c
@@ -0,0 +1,244 @@
+/* mpz_ior -- Logical inclusive or.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_ior (mpz_ptr res, mpz_srcptr op1, mpz_srcptr op2)
+#else
+mpz_ior (res, op1, op2)
+ mpz_ptr res;
+ mpz_srcptr op1;
+ mpz_srcptr op2;
+#endif
+{
+ mp_srcptr op1_ptr, op2_ptr;
+ mp_size_t op1_size, op2_size;
+ mp_ptr res_ptr;
+ mp_size_t res_size;
+ mp_size_t i;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ op1_size = op1->_mp_size;
+ op2_size = op2->_mp_size;
+
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+
+ if (op1_size >= 0)
+ {
+ if (op2_size >= 0)
+ {
+ if (op1_size >= op2_size)
+ {
+ if (res->_mp_alloc < op1_size)
+ {
+ _mpz_realloc (res, op1_size);
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+ }
+
+ if (res_ptr != op1_ptr)
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size,
+ op1_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] | op2_ptr[i];
+ res_size = op1_size;
+ }
+ else
+ {
+ if (res->_mp_alloc < op2_size)
+ {
+ _mpz_realloc (res, op2_size);
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+ }
+
+ if (res_ptr != op2_ptr)
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size,
+ op2_size - op1_size);
+ for (i = op1_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] | op2_ptr[i];
+ res_size = op2_size;
+ }
+
+ res->_mp_size = res_size;
+ return;
+ }
+ else /* op2_size < 0 */
+ {
+ /* Fall through to the code at the end of the function. */
+ }
+ }
+ else
+ {
+ if (op2_size < 0)
+ {
+ mp_ptr opx;
+ mp_limb_t cy;
+
+ /* Both operands are negative, so will be the result.
+ -((-OP1) | (-OP2)) = -(~(OP1 - 1) | ~(OP2 - 1)) =
+ = ~(~(OP1 - 1) | ~(OP2 - 1)) + 1 =
+ = ((OP1 - 1) & (OP2 - 1)) + 1 */
+
+ op1_size = -op1_size;
+ op2_size = -op2_size;
+
+ res_size = MIN (op1_size, op2_size);
+
+ /* Possible optimization: Decrease mpn_sub precision,
+ as we won't use the entire res of both. */
+ opx = (mp_ptr) TMP_ALLOC (res_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op1_ptr, res_size, (mp_limb_t) 1);
+ op1_ptr = opx;
+
+ opx = (mp_ptr) TMP_ALLOC (res_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, res_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+
+ if (res->_mp_alloc < res_size)
+ {
+ _mpz_realloc (res, res_size);
+ res_ptr = res->_mp_d;
+ /* Don't re-read OP1_PTR and OP2_PTR. They point to
+ temporary space--never to the space RES->_mp_d used
+ to point to before reallocation. */
+ }
+
+ /* First loop finds the size of the result. */
+ for (i = res_size - 1; i >= 0; i--)
+ if ((op1_ptr[i] & op2_ptr[i]) != 0)
+ break;
+ res_size = i + 1;
+
+ if (res_size != 0)
+ {
+ /* Second loop computes the real result. */
+ for (i = res_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] & op2_ptr[i];
+
+ cy = mpn_add_1 (res_ptr, res_ptr, res_size, (mp_limb_t) 1);
+ if (cy)
+ {
+ res_ptr[res_size] = cy;
+ res_size++;
+ }
+ }
+ else
+ {
+ res_ptr[0] = 1;
+ res_size = 1;
+ }
+
+ res->_mp_size = -res_size;
+ TMP_FREE (marker);
+ return;
+ }
+ else
+ {
+ /* We should compute -OP1 | OP2. Swap OP1 and OP2 and fall
+ through to the code that handles OP1 | -OP2. */
+ MPZ_SRCPTR_SWAP (op1, op2);
+ MPN_SRCPTR_SWAP (op1_ptr,op1_size, op2_ptr,op2_size);
+ }
+ }
+
+ {
+ mp_ptr opx;
+ mp_limb_t cy;
+ mp_size_t res_alloc;
+ mp_size_t count;
+
+ /* Operand 2 negative, so will be the result.
+ -(OP1 | (-OP2)) = -(OP1 | ~(OP2 - 1)) =
+ = ~(OP1 | ~(OP2 - 1)) + 1 =
+ = (~OP1 & (OP2 - 1)) + 1 */
+
+ op2_size = -op2_size;
+
+ res_alloc = op2_size;
+
+ opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+ op2_size -= op2_ptr[op2_size - 1] == 0;
+
+ if (res->_mp_alloc < res_alloc)
+ {
+ _mpz_realloc (res, res_alloc);
+ op1_ptr = op1->_mp_d;
+ res_ptr = res->_mp_d;
+ /* Don't re-read OP2_PTR. It points to temporary space--never
+ to the space RES->_mp_d used to point to before reallocation. */
+ }
+
+ if (op1_size >= op2_size)
+ {
+ /* We can just ignore the part of OP1 that stretches above OP2,
+ because the result limbs are zero there. */
+
+ /* First loop finds the size of the result. */
+ for (i = op2_size - 1; i >= 0; i--)
+ if ((~op1_ptr[i] & op2_ptr[i]) != 0)
+ break;
+ res_size = i + 1;
+ count = res_size;
+ }
+ else
+ {
+ res_size = op2_size;
+
+ /* Copy the part of OP2 that stretches above OP1, to RES. */
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size, op2_size - op1_size);
+ count = op1_size;
+ }
+
+ if (res_size != 0)
+ {
+ /* Second loop computes the real result. */
+ for (i = count - 1; i >= 0; i--)
+ res_ptr[i] = ~op1_ptr[i] & op2_ptr[i];
+
+ cy = mpn_add_1 (res_ptr, res_ptr, res_size, (mp_limb_t) 1);
+ if (cy)
+ {
+ res_ptr[res_size] = cy;
+ res_size++;
+ }
+ }
+ else
+ {
+ res_ptr[0] = 1;
+ res_size = 1;
+ }
+
+ res->_mp_size = -res_size;
+ }
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/iset.c b/rts/gmp/mpz/iset.c
new file mode 100644
index 0000000000..114bc2d542
--- /dev/null
+++ b/rts/gmp/mpz/iset.c
@@ -0,0 +1,49 @@
+/* mpz_init_set (src_integer) -- Make a new multiple precision number with
+ a value copied from SRC_INTEGER.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_init_set (mpz_ptr w, mpz_srcptr u)
+#else
+mpz_init_set (w, u)
+ mpz_ptr w;
+ mpz_srcptr u;
+#endif
+{
+ mp_ptr wp, up;
+ mp_size_t usize, size;
+
+ usize = u->_mp_size;
+ size = ABS (usize);
+
+ w->_mp_alloc = MAX (size, 1);
+ w->_mp_d = (mp_ptr) (*_mp_allocate_func) (w->_mp_alloc * BYTES_PER_MP_LIMB);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ MPN_COPY (wp, up, size);
+ w->_mp_size = usize;
+}
diff --git a/rts/gmp/mpz/iset_d.c b/rts/gmp/mpz/iset_d.c
new file mode 100644
index 0000000000..502a8933e2
--- /dev/null
+++ b/rts/gmp/mpz/iset_d.c
@@ -0,0 +1,39 @@
+/* mpz_init_set_d(integer, val) -- Initialize and assign INTEGER with a double
+ value VAL.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_init_set_d (mpz_ptr dest, double val)
+#else
+mpz_init_set_d (dest, val)
+ mpz_ptr dest;
+ double val;
+#endif
+{
+ dest->_mp_alloc = 1;
+ dest->_mp_d = (mp_ptr) (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
+ dest->_mp_size = 0;
+ mpz_set_d (dest, val);
+}
diff --git a/rts/gmp/mpz/iset_si.c b/rts/gmp/mpz/iset_si.c
new file mode 100644
index 0000000000..842db140ef
--- /dev/null
+++ b/rts/gmp/mpz/iset_si.c
@@ -0,0 +1,49 @@
+/* mpz_init_set_si(val) -- Make a new multiple precision number with
+ value val.
+
+Copyright (C) 1991, 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_init_set_si (mpz_ptr x, signed long int val)
+#else
+mpz_init_set_si (x, val)
+ mpz_ptr x;
+ signed long int val;
+#endif
+{
+ x->_mp_alloc = 1;
+ x->_mp_d = (mp_ptr) (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
+ if (val > 0)
+ {
+ x->_mp_d[0] = val;
+ x->_mp_size = 1;
+ }
+ else if (val < 0)
+ {
+ x->_mp_d[0] = (unsigned long) -val;
+ x->_mp_size = -1;
+ }
+ else
+ x->_mp_size = 0;
+}
diff --git a/rts/gmp/mpz/iset_str.c b/rts/gmp/mpz/iset_str.c
new file mode 100644
index 0000000000..dfb8c6b230
--- /dev/null
+++ b/rts/gmp/mpz/iset_str.c
@@ -0,0 +1,47 @@
+/* mpz_init_set_str(string, base) -- Convert the \0-terminated string
+ STRING in base BASE to a multiple precision integer. Return a MP_INT
+ structure representing the integer. Allow white space in the
+ string. If BASE == 0 determine the base in the C standard way,
+ i.e. 0xhh...h means base 16, 0oo...o means base 8, otherwise
+ assume base 10.
+
+Copyright (C) 1991, 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_init_set_str (mpz_ptr x, const char *str, int base)
+#else
+mpz_init_set_str (x, str, base)
+ mpz_ptr x;
+ const char *str;
+ int base;
+#endif
+{
+ x->_mp_alloc = 1;
+ x->_mp_d = (mp_ptr) (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
+
+ /* if str has no digits mpz_set_str leaves x->_mp_size unset */
+ x->_mp_size = 0;
+
+ return mpz_set_str (x, str, base);
+}
diff --git a/rts/gmp/mpz/iset_ui.c b/rts/gmp/mpz/iset_ui.c
new file mode 100644
index 0000000000..759182c556
--- /dev/null
+++ b/rts/gmp/mpz/iset_ui.c
@@ -0,0 +1,39 @@
+/* mpz_init_set_ui(val) -- Make a new multiple precision number with
+ value val.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_init_set_ui (mpz_ptr x, unsigned long int val)
+#else
+mpz_init_set_ui (x, val)
+ mpz_ptr x;
+ unsigned long int val;
+#endif
+{
+ x->_mp_alloc = 1;
+ x->_mp_d = (mp_ptr) (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
+ x->_mp_d[0] = val;
+ x->_mp_size = val != 0;
+}
diff --git a/rts/gmp/mpz/jacobi.c b/rts/gmp/mpz/jacobi.c
new file mode 100644
index 0000000000..9d49e1d0c6
--- /dev/null
+++ b/rts/gmp/mpz/jacobi.c
@@ -0,0 +1,53 @@
+/* mpz_jacobi (op1, op2).
+ Contributed by Bennet Yee (bsy) at Carnegie-Mellon University
+
+Copyright (C) 1991, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+
+/* Precondition: both p and q are positive */
+
+int
+#if __STDC__
+mpz_jacobi (mpz_srcptr pi, mpz_srcptr qi)
+#else
+mpz_jacobi (pi, qi)
+ mpz_srcptr pi, qi;
+#endif
+{
+#if GCDCHECK
+ int retval;
+ mpz_t gcdval;
+
+ mpz_init (gcdval);
+ mpz_gcd (gcdval, pi, qi);
+ if (!mpz_cmp_ui (gcdval, 1L))
+ {
+ /* J(ab,cb) = J(ab,c)J(ab,b) = J(ab,c)J(0,b) = J(ab,c)*0 */
+ retval = 0;
+ }
+ else
+ retval = mpz_legendre (pi, qi);
+ mpz_clear (gcdval);
+ return retval;
+#else
+ return mpz_legendre (pi, qi);
+#endif
+}
diff --git a/rts/gmp/mpz/kronsz.c b/rts/gmp/mpz/kronsz.c
new file mode 100644
index 0000000000..c8c6752224
--- /dev/null
+++ b/rts/gmp/mpz/kronsz.c
@@ -0,0 +1,126 @@
+/* mpz_si_kronecker -- Kronecker/Jacobi symbol. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+int
+#if __STDC__
+mpz_si_kronecker (long a, mpz_srcptr b)
+#else
+mpz_si_kronecker (a, b)
+ long a;
+ mpz_srcptr b;
+#endif
+{
+ int b_abs_size;
+ mp_srcptr b_ptr;
+ mp_limb_t b_low;
+ int twos;
+ int result_bit1;
+
+ b_abs_size = ABSIZ (b);
+ if (b_abs_size == 0)
+ return JACOBI_S0 (a); /* (a/0) */
+
+ b_ptr = PTR(b);
+ b_low = b_ptr[0];
+
+ /* (0/b) = 1 if b=+/-1, 0 otherwise */
+ if (a == 0)
+ return (b_abs_size == 1) & (b_low == 1);
+
+ /* account for the effect of the sign of b, so can then ignore it */
+ result_bit1 = JACOBI_BSGN_SZ_BIT1 (a, b);
+
+ if ((b_low & 1) == 0)
+ {
+ /* b even */
+
+ if ((a & 1) == 0)
+ return 0; /* (a/b)=0 if both a,b even */
+
+ /* Require MP_BITS_PER_LIMB even, so that (a/2)^MP_BITS_PER_LIMB = 1,
+ and so that therefore there's no need to account for how many zero
+ limbs are stripped. */
+ ASSERT ((BITS_PER_MP_LIMB & 1) == 0);
+
+ MPN_STRIP_LOW_ZEROS_NOT_ZERO (b_ptr, b_abs_size);
+ b_low = b_ptr[0];
+
+ if ((b_low & 1) == 0)
+ {
+ /* odd a, even b */
+
+ mp_limb_t b_shl_bit1;
+
+ count_trailing_zeros (twos, b_low);
+
+ /* b_shl_bit1 is b>>twos, but with only bit 1 guaranteed */
+ if (twos == BITS_PER_MP_LIMB-1)
+ b_shl_bit1 = (b_abs_size == 1) ? 0 : (b_ptr[1] << 1);
+ else
+ b_shl_bit1 = (b_low >> twos);
+
+ result_bit1 ^= JACOBI_ASGN_SU_BIT1 (a, b_shl_bit1);
+ a = ABS(a);
+
+ if (a == 1)
+ return JACOBI_BIT1_TO_PN (result_bit1); /* (1/b)=1 */
+
+ /* twos (a/2), reciprocity to (b/a), and (b/a) = (b mod a / b) */
+ return mpn_jacobi_base (mpn_mod_1_rshift (b_ptr, b_abs_size,
+ twos, a),
+ a,
+ result_bit1
+ ^ JACOBI_TWOS_U_BIT1 (twos, a)
+ ^ JACOBI_RECIP_UU_BIT1 (a, b_shl_bit1));
+ }
+ }
+
+ /* b odd */
+
+ result_bit1 ^= JACOBI_ASGN_SU_BIT1 (a, b_low);
+ a = ABS(a);
+
+ /* (a/1) = 1 for any a */
+ if (b_abs_size == 1 && b_low == 1)
+ return JACOBI_BIT1_TO_PN (result_bit1);
+
+ /* Note a is cast to unsigned because 0x80..00 doesn't fit in a signed. */
+ if ((a & 1) == 0)
+ {
+ count_trailing_zeros (twos, a);
+ a = ((unsigned long) a) >> twos;
+ result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b_low);
+ }
+
+ if (a == 1)
+ return JACOBI_BIT1_TO_PN (result_bit1); /* (1/b)=1 */
+
+ /* reciprocity to (b/a), and (b/a) == (b mod a / a) */
+ return mpn_jacobi_base (mpn_mod_1 (b_ptr, b_abs_size, a), a,
+ result_bit1 ^ JACOBI_RECIP_UU_BIT1 (a, b_low));
+}
diff --git a/rts/gmp/mpz/kronuz.c b/rts/gmp/mpz/kronuz.c
new file mode 100644
index 0000000000..b877e6f64c
--- /dev/null
+++ b/rts/gmp/mpz/kronuz.c
@@ -0,0 +1,115 @@
+/* mpz_ui_kronecker -- Kronecker/Jacobi symbol. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+int
+#if __STDC__
+mpz_ui_kronecker (unsigned long a, mpz_srcptr b)
+#else
+mpz_ui_kronecker (a, b)
+ unsigned long a;
+ mpz_srcptr b;
+#endif
+{
+ int b_abs_size;
+ mp_srcptr b_ptr;
+ mp_limb_t b_low;
+ int twos;
+ int result_bit1;
+
+ /* (a/0) */
+ b_abs_size = ABSIZ (b);
+ if (b_abs_size == 0)
+ return JACOBI_U0 (a);
+
+ /* (a/-1)=1 when a>=0, so the sign of b is ignored */
+ b_ptr = PTR(b);
+ b_low = b_ptr[0];
+
+ /* (0/1)=1; (0/-1)=1; (0/b)=0 for b!=+/-1
+ (1/b)=1, for any b */
+ if (a <= 1)
+ return (a == 1) | ((b_abs_size == 1) & (b_low == 1));
+
+ if (b_low & 1)
+ {
+ /* (a/1) = 1 for any a */
+ if (b_abs_size == 1 && b_low == 1)
+ return 1;
+
+ count_trailing_zeros (twos, a);
+ a >>= twos;
+ if (a == 1)
+ return JACOBI_TWOS_U (twos, b_low); /* powers of (2/b) only */
+
+ /* powers of (2/b); reciprocity to (b/a); (b/a) == (b mod a / a) */
+ return mpn_jacobi_base (mpn_mod_1 (b_ptr, b_abs_size, a),
+ a,
+ JACOBI_TWOS_U_BIT1 (twos, b_low)
+ ^ JACOBI_RECIP_UU_BIT1 (b_low, a));
+ }
+
+ /* b is even; (a/2)=0 if a is even */
+ if ((a & 1) == 0)
+ return 0;
+
+ /* Require MP_BITS_PER_LIMB even, so (a/2)^MP_BITS_PER_LIMB = 1, and so we
+ don't have to pay attention to how many trailing zero limbs are
+ stripped. */
+ ASSERT ((BITS_PER_MP_LIMB & 1) == 0);
+
+ MPN_STRIP_LOW_ZEROS_NOT_ZERO (b_ptr, b_abs_size);
+ b_low = b_ptr[0];
+
+ if (b_low & 1)
+ /* reciprocity to (b/a); (b/a) == (b mod a / a) */
+ return mpn_jacobi_base (mpn_mod_1 (b_ptr, b_abs_size, a),
+ a,
+ JACOBI_RECIP_UU_BIT1 (b_low, a));
+
+ count_trailing_zeros (twos, b_low);
+
+ /* reciprocity to get (b/a) */
+ if (twos == BITS_PER_MP_LIMB-1)
+ {
+ if (b_abs_size == 1)
+ {
+ /* b==0x800...00, one limb high bit only, so (a/2)^(BPML-1) */
+ return JACOBI_TWOS_U (BITS_PER_MP_LIMB-1, a);
+ }
+
+ /* b_abs_size > 1 */
+ result_bit1 = JACOBI_RECIP_UU_BIT1 (a, b_ptr[1] << 1);
+ }
+ else
+ result_bit1 = JACOBI_RECIP_UU_BIT1 (a, b_low >> twos);
+
+ /* powers of (a/2); reciprocity to (b/a); (b/a) == (b mod a / a) */
+ return mpn_jacobi_base (mpn_mod_1_rshift (b_ptr, b_abs_size, twos, a),
+ a,
+ JACOBI_TWOS_U_BIT1 (twos, a) ^ result_bit1);
+}
diff --git a/rts/gmp/mpz/kronzs.c b/rts/gmp/mpz/kronzs.c
new file mode 100644
index 0000000000..edfb465976
--- /dev/null
+++ b/rts/gmp/mpz/kronzs.c
@@ -0,0 +1,74 @@
+/* mpz_kronecker_si -- Kronecker/Jacobi symbol. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* This function is expected to be often used with b odd, so there's a test
+ for this before invoking count_trailing_zeros().
+
+ After the absolute value of b is established it's treated as an unsigned
+ long, because 0x80..00 doesn't fit in a signed long. */
+
+int                             /* returns the Kronecker/Jacobi symbol (a/b): -1, 0 or +1 */
+#if __STDC__
+mpz_kronecker_si (mpz_srcptr a, long b)
+#else
+mpz_kronecker_si (a, b)
+     mpz_srcptr a;
+     long b;
+#endif
+{
+  int result_bit1;              /* symbol sign carried in bit 1, per the JACOBI_*_BIT1 macros */
+  int twos;                     /* count of factors of 2 stripped from b */
+
+  if (b == 0)
+    return JACOBI_Z0 (a);       /* (a/0) special case */
+
+  result_bit1 = JACOBI_BSGN_ZS_BIT1(a, b);  /* sign contribution from a<0 and/or b<0 */
+  b = ABS (b);
+
+  if (b == 1)
+    return JACOBI_BIT1_TO_PN (result_bit1);  /* (a/1) = 1 for any a */
+
+  if (b & 1)
+    return mpn_jacobi_base (mpz_fdiv_ui (a, b), b, result_bit1);  /* (a/b) == (a mod b / b) */
+
+  /* result 0 if both a,b even */
+  if (mpz_even_p (a))
+    return 0;
+
+  /* (a/2)=(2/a) when a odd */
+  count_trailing_zeros (twos, b);
+  result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, PTR(a)[0]);
+
+  b = ((unsigned long) b) >> twos;  /* unsigned shift: b may be 0x80..00, which ABS left negative */
+  if (b == 1)
+    return JACOBI_BIT1_TO_PN (result_bit1);
+  else
+    return mpn_jacobi_base (mpz_fdiv_ui (a, b), b, result_bit1);
+}
+
+
diff --git a/rts/gmp/mpz/kronzu.c b/rts/gmp/mpz/kronzu.c
new file mode 100644
index 0000000000..749be5df07
--- /dev/null
+++ b/rts/gmp/mpz/kronzu.c
@@ -0,0 +1,66 @@
+/* mpz_kronecker_ui -- Kronecker/Jacobi symbol. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* This function is expected to be often used with b an odd prime, so the
+ code for odd b is nice and short. */
+
+int                             /* returns the Kronecker/Jacobi symbol (a/b): -1, 0 or +1 */
+#if __STDC__
+mpz_kronecker_ui (mpz_srcptr a, unsigned long b)
+#else
+mpz_kronecker_ui (a, b)
+     mpz_srcptr a;
+     unsigned long b;
+#endif
+{
+  int twos;                     /* count of factors of 2 stripped from b */
+
+  if (b & 1)                    /* odd b first: expected common case (b an odd prime) */
+    {
+      if (b != 1)
+        return mpn_jacobi_base (mpz_fdiv_ui (a, b), b, 0);  /* (a/b) == (a mod b / b) */
+      else
+        return 1;               /* (a/1)=1 for any a */
+    }
+
+  if (b == 0)
+    return JACOBI_Z0 (a);       /* (a/0) special case */
+
+  /* (a/2)=0 if a even */
+  if (mpz_even_p (a))
+    return 0;
+
+  /* (a/2)=(2/a) when a odd */
+  count_trailing_zeros (twos, b);
+  b >>= twos;
+  if (b == 1)
+    return JACOBI_TWOS_U (twos, PTR(a)[0]);  /* b was a power of 2: only (2/a)^twos remains */
+
+  return mpn_jacobi_base (mpz_fdiv_ui (a, b), b,
+                          JACOBI_TWOS_U_BIT1(twos, PTR(a)[0]));
+}
diff --git a/rts/gmp/mpz/lcm.c b/rts/gmp/mpz/lcm.c
new file mode 100644
index 0000000000..7495882ae5
--- /dev/null
+++ b/rts/gmp/mpz/lcm.c
@@ -0,0 +1,61 @@
+/* mpz/lcm.c: Calculate the least common multiple of two integers.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void *_mpz_realloc ();
+
+void                            /* r = lcm(u,v), computed as (u/gcd(u,v))*v */
+#if __STDC__
+mpz_lcm (mpz_ptr r, mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_lcm (r, u, v)
+     mpz_ptr r;
+     mpz_srcptr u;
+     mpz_srcptr v;
+#endif
+{
+  mpz_t g;                      /* temporary for gcd(u,v), then u/gcd */
+  mp_size_t usize, vsize, size;
+  TMP_DECL (marker);
+
+  TMP_MARK (marker);
+
+  usize = ABS (SIZ (u));
+  vsize = ABS (SIZ (v));
+
+  if (usize == 0 || vsize == 0)
+    {
+      SIZ (r) = 0;              /* lcm(0,x) = lcm(x,0) = 0 */
+      return;
+    }
+
+  size = MAX (usize, vsize);
+  MPZ_TMP_INIT (g, size);       /* gcd fits in max(|u|,|v|) limbs */
+
+  mpz_gcd (g, u, v);
+  mpz_divexact (g, u, g);       /* exact: g divides u by construction */
+  mpz_mul (r, g, v);            /* NOTE(review): r inherits the sign of u*v — confirm callers
+                                   expect a signed result rather than |lcm| */
+
+  TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/legendre.c b/rts/gmp/mpz/legendre.c
new file mode 100644
index 0000000000..ab665f70d0
--- /dev/null
+++ b/rts/gmp/mpz/legendre.c
@@ -0,0 +1,184 @@
+/* mpz_legendre (op1, op2).
+ Contributed by Bennet Yee (bsy) at Carnegie-Mellon University
+
+Copyright (C) 1992, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+
+#if defined (DEBUG)
+#include <stdio.h>
+#endif
+
+/* Precondition: both p and q are positive */
+
+int                             /* Legendre symbol (pi/qi); requires pi,qi > 0, qi odd prime */
+#if __STDC__
+mpz_legendre (mpz_srcptr pi, mpz_srcptr qi)
+#else
+mpz_legendre (pi, qi)
+mpz_srcptr pi, qi;
+#endif
+{
+  mpz_t p, q, qdiv2;            /* working copies; qdiv2 scratch for the p>q/2 test */
+#ifdef Q_MINUS_1
+  mpz_t q_minus_1;
+#endif
+  mpz_ptr mtmp;                 /* scratch for the p<->q pointer swap */
+  register mpz_ptr pptr, qptr;
+  register int retval = 1;      /* accumulated sign of the symbol */
+  register unsigned long int s;
+
+  pptr = p;
+  mpz_init_set (pptr, pi);
+  qptr = q;
+  mpz_init_set (qptr, qi);
+
+#ifdef Q_MINUS_1
+  mpz_init (q_minus_1);
+#endif
+  mpz_init (qdiv2);
+
+tail_recurse2:
+#ifdef DEBUG
+  printf ("tail_recurse2: p=");
+  mpz_out_str (stdout, 10, pptr);
+  printf ("\nq=");
+  mpz_out_str (stdout, 10, qptr);
+  putchar ('\n');
+#endif
+  s = mpz_scan1 (qptr, 0);
+  if (s) mpz_tdiv_q_2exp (qptr, qptr, s); /* J(a,2) = 1 */
+#ifdef DEBUG
+  printf ("2 factor decomposition: p=");
+  mpz_out_str (stdout, 10, pptr);
+  printf ("\nq=");
+  mpz_out_str (stdout, 10, qptr);
+  putchar ('\n');
+#endif
+  /* postcondition q odd */
+  if (!mpz_cmp_ui (qptr, 1L)) /* J(a,1) = 1 */
+    goto done;
+  mpz_mod (pptr, pptr, qptr); /* J(a,q) = J(b,q) when a == b mod q */
+#ifdef DEBUG
+  printf ("mod out by q: p=");
+  mpz_out_str (stdout, 10, pptr);
+  printf ("\nq=");
+  mpz_out_str (stdout, 10, qptr);
+  putchar ('\n');
+#endif
+  /* quick calculation to get approximate size first */
+  /* precondition: p < q */
+  if ((mpz_sizeinbase (pptr, 2) + 1 >= mpz_sizeinbase (qptr,2))
+      && (mpz_tdiv_q_2exp (qdiv2, qptr, 1L), mpz_cmp (pptr, qdiv2) > 0))
+    {
+      /* p > q/2 */
+      mpz_sub (pptr, qptr, pptr);
+      /* J(-1,q) = (-1)^((q-1)/2), q odd */
+      if (mpz_get_ui (qptr) & 2)
+        retval = -retval;
+    }
+  /* p < q/2 */
+#ifdef Q_MINUS_1
+  mpz_sub_ui (q_minus_1, qptr, 1L);  /* FIX: was "q_minus_q" — undeclared; typo for q_minus_1 */
+#endif
+tail_recurse: /* we use tail_recurse only if q has not changed */
+#ifdef DEBUG
+  printf ("tail_recurse1: p=");
+  mpz_out_str (stdout, 10, pptr);
+  printf ("\nq=");
+  mpz_out_str (stdout, 10, qptr);
+  putchar ('\n');
+#endif
+  /*
+   * J(0,q) = 0
+   * this occurs only if gcd(p,q) != 1 which is never true for
+   * Legendre function.
+   */
+  if (!mpz_cmp_ui (pptr, 0L))
+    {
+      retval = 0;
+      goto done;
+    }
+
+  if (!mpz_cmp_ui (pptr, 1L))
+    {
+      /* J(1,q) = 1 */
+      /* retval *= 1; */
+      goto done;
+    }
+#ifdef Q_MINUS_1
+  if (!mpz_cmp (pptr, q_minus_1))
+    {
+      /* J(-1,q) = (-1)^((q-1)/2) */
+      if (mpz_get_ui (qptr) & 2)
+        retval = -retval;
+      /* else retval *= 1; */
+      goto done;
+    }
+#endif
+  /*
+   * we do not handle J(xy,q) except for x==2
+   * since we do not want to factor
+   */
+  if ((s = mpz_scan1 (pptr, 0)) != 0)
+    {
+      /*
+       * J(2,q) = (-1)^((q^2-1)/8)
+       *
+       * Note that q odd guarantees that q^2-1 is divisible by 8:
+       * Let a: q=2a+1. q^2 = 4a^2+4a+1, (q^2-1)/8 = a(a+1)/2, qed
+       *
+       * Now, note that this means that the low two bits of _a_
+       * (or the low bits of q shifted over by 1 determines
+       * the factor).
+       */
+      mpz_tdiv_q_2exp (pptr, pptr, s);
+
+      /* even powers of 2 gives J(2,q)^{2n} = 1 */
+      if (s & 1)
+        {
+          s = mpz_get_ui (qptr) >> 1;
+          s = s * (s + 1);
+          if (s & 2)
+            retval = -retval;
+        }
+      goto tail_recurse;
+    }
+  /*
+   * we know p is odd since we have cast out 2s
+   * precondition that q is odd guarantees both odd.
+   *
+   * quadratic reciprocity
+   * J(p,q) = (-1)^((p-1)(q-1)/4) * J(q,p)
+   */
+  if ((s = mpz_scan1 (pptr, 1)) <= 2 && (s + mpz_scan1 (qptr, 1)) <= 2)
+    retval = -retval;           /* holds exactly when p==3 (mod 4) and q==3 (mod 4) */
+
+  mtmp = pptr; pptr = qptr; qptr = mtmp;
+  goto tail_recurse2;
+done:
+  mpz_clear (p);
+  mpz_clear (q);
+  mpz_clear (qdiv2);
+#ifdef Q_MINUS_1
+  mpz_clear (q_minus_1);
+#endif
+  return retval;
+}
diff --git a/rts/gmp/mpz/mod.c b/rts/gmp/mpz/mod.c
new file mode 100644
index 0000000000..87033b333b
--- /dev/null
+++ b/rts/gmp/mpz/mod.c
@@ -0,0 +1,63 @@
+/* mpz_mod -- The mathematical mod function.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void                            /* rem = dividend mod divisor; result is in [0, |divisor|) */
+#if __STDC__
+mpz_mod (mpz_ptr rem, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_mod (rem, dividend, divisor)
+     mpz_ptr rem;
+     mpz_srcptr dividend;
+     mpz_srcptr divisor;
+#endif
+{
+  mp_size_t divisor_size = divisor->_mp_size;
+  mpz_t temp_divisor;           /* N.B.: lives until function returns! */
+  TMP_DECL (marker);
+
+  TMP_MARK (marker);
+
+  /* We need the original value of the divisor after the remainder has been
+     preliminary calculated.  We have to copy it to temporary space if it's
+     the same variable as REM.  */
+  if (rem == divisor)
+    {
+      MPZ_TMP_INIT (temp_divisor, ABS (divisor_size));
+      mpz_set (temp_divisor, divisor);
+      divisor = temp_divisor;
+    }
+
+  mpz_tdiv_r (rem, dividend, divisor);  /* truncating remainder: sign follows dividend */
+
+  if (rem->_mp_size != 0)
+    {
+      if (dividend->_mp_size < 0)       /* negative remainder: shift into [0, |divisor|) */
+        if (divisor->_mp_size < 0)
+          mpz_sub (rem, rem, divisor)_;
+        else
+          mpz_add (rem, rem, divisor);
+    }
+
+  TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/mul.c b/rts/gmp/mpz/mul.c
new file mode 100644
index 0000000000..7854788e50
--- /dev/null
+++ b/rts/gmp/mpz/mul.c
@@ -0,0 +1,131 @@
+/* mpz_mul -- Multiply two integers.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+#ifndef BERKELEY_MP
+void                            /* w = u * v (also built as Berkeley MP's mult() below) */
+#if __STDC__
+mpz_mul (mpz_ptr w, mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_mul (w, u, v)
+     mpz_ptr w;
+     mpz_srcptr u;
+     mpz_srcptr v;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+mult (mpz_srcptr u, mpz_srcptr v, mpz_ptr w)
+#else
+mult (u, v, w)
+     mpz_srcptr u;
+     mpz_srcptr v;
+     mpz_ptr w;
+#endif
+#endif /* BERKELEY_MP */
+{
+  mp_size_t usize = u->_mp_size;
+  mp_size_t vsize = v->_mp_size;
+  mp_size_t wsize;
+  mp_size_t sign_product;       /* negative iff exactly one of u,v is negative */
+  mp_ptr up, vp;
+  mp_ptr wp;
+  mp_ptr free_me = NULL;        /* old w storage to release after the multiply, if w aliases u/v */
+  size_t free_me_size;          /* only valid when free_me != NULL */
+  mp_limb_t cy_limb;
+  TMP_DECL (marker);
+
+  TMP_MARK (marker);
+  sign_product = usize ^ vsize;
+  usize = ABS (usize);
+  vsize = ABS (vsize);
+
+  if (usize < vsize)            /* mpn_mul requires first operand to be the longer one */
+    {
+      /* Swap U and V. */
+      {const __mpz_struct *t = u; u = v; v = t;}
+      {mp_size_t t = usize; usize = vsize; vsize = t;}
+    }
+
+  up = u->_mp_d;
+  vp = v->_mp_d;
+  wp = w->_mp_d;
+
+  /* Ensure W has space enough to store the result. */
+  wsize = usize + vsize;
+  if (w->_mp_alloc < wsize)
+    {
+      if (wp == up || wp == vp)
+        {
+          /* can't free yet: the operand data is still needed as multiply input */
+          free_me = wp;
+          free_me_size = w->_mp_alloc;
+        }
+      else
+        (*_mp_free_func) (wp, w->_mp_alloc * BYTES_PER_MP_LIMB);
+
+      w->_mp_alloc = wsize;
+      wp = (mp_ptr) (*_mp_allocate_func) (wsize * BYTES_PER_MP_LIMB);
+      w->_mp_d = wp;
+    }
+  else
+    {
+      /* Make U and V not overlap with W. */
+      if (wp == up)
+        {
+          /* W and U are identical.  Allocate temporary space for U. */
+          up = (mp_ptr) TMP_ALLOC (usize * BYTES_PER_MP_LIMB);
+          /* Is V identical too?  Keep it identical with U. */
+          if (wp == vp)
+            vp = up;
+          /* Copy to the temporary space. */
+          MPN_COPY (up, wp, usize);
+        }
+      else if (wp == vp)
+        {
+          /* W and V are identical.  Allocate temporary space for V. */
+          vp = (mp_ptr) TMP_ALLOC (vsize * BYTES_PER_MP_LIMB);
+          /* Copy to the temporary space. */
+          MPN_COPY (vp, wp, vsize);
+        }
+    }
+
+  if (vsize == 0)               /* either operand zero => product zero */
+    {
+      wsize = 0;
+    }
+  else
+    {
+      cy_limb = mpn_mul (wp, up, usize, vp, vsize);
+      wsize = usize + vsize;
+      wsize -= cy_limb == 0;    /* drop the top limb when there was no carry out */
+    }
+
+  w->_mp_size = sign_product < 0 ? -wsize : wsize;
+  if (free_me != NULL)
+    (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB);
+  TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/mul_2exp.c b/rts/gmp/mpz/mul_2exp.c
new file mode 100644
index 0000000000..abea5fed2c
--- /dev/null
+++ b/rts/gmp/mpz/mul_2exp.c
@@ -0,0 +1,76 @@
+/* mpz_mul_2exp -- Multiply a bignum by 2**CNT
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void                            /* w = u * 2^cnt */
+#if __STDC__
+mpz_mul_2exp (mpz_ptr w, mpz_srcptr u, unsigned long int cnt)
+#else
+mpz_mul_2exp (w, u, cnt)
+     mpz_ptr w;
+     mpz_srcptr u;
+     unsigned long int cnt;
+#endif
+{
+  mp_size_t usize = u->_mp_size;
+  mp_size_t abs_usize = ABS (usize);
+  mp_size_t wsize;
+  mp_size_t limb_cnt;           /* whole-limb part of the shift */
+  mp_ptr wp;
+  mp_limb_t wlimb;              /* carry-out limb from the sub-limb shift */
+
+  if (usize == 0)
+    {
+      w->_mp_size = 0;          /* 0 * 2^cnt = 0 */
+      return;
+    }
+
+  limb_cnt = cnt / BITS_PER_MP_LIMB;
+  wsize = abs_usize + limb_cnt + 1;  /* +1 for a possible carry-out limb */
+  if (w->_mp_alloc < wsize)
+    _mpz_realloc (w, wsize);
+
+  wp = w->_mp_d;
+  wsize = abs_usize + limb_cnt;
+
+  cnt %= BITS_PER_MP_LIMB;      /* remaining sub-limb bit shift */
+  if (cnt != 0)
+    {
+      wlimb = mpn_lshift (wp + limb_cnt, u->_mp_d, abs_usize, cnt);
+      if (wlimb != 0)
+        {
+          wp[wsize] = wlimb;
+          wsize++;
+        }
+    }
+  else
+    {
+      MPN_COPY_DECR (wp + limb_cnt, u->_mp_d, abs_usize);  /* DECR copy is safe for U == W */
+    }
+
+  /* Zero all whole limbs at low end.  Do it here and not before calling
+     mpn_lshift, not to lose for U == W. */
+  MPN_ZERO (wp, limb_cnt);
+
+  w->_mp_size = usize >= 0 ? wsize : -wsize;  /* preserve the sign of u */
+}
diff --git a/rts/gmp/mpz/mul_siui.c b/rts/gmp/mpz/mul_siui.c
new file mode 100644
index 0000000000..9849cd41b0
--- /dev/null
+++ b/rts/gmp/mpz/mul_siui.c
@@ -0,0 +1,81 @@
+/* mpz_mul_ui/si (product, multiplier, small_multiplicand) -- Set PRODUCT to
+ MULTIPLICATOR times SMALL_MULTIPLICAND.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+#ifdef OPERATION_mul_ui
+#define FUNCTION mpz_mul_ui
+#define MULTIPLICAND_UNSIGNED unsigned
+#define MULTIPLICAND_ABS(x) x
+#else
+#ifdef OPERATION_mul_si
+#define FUNCTION mpz_mul_si
+#define MULTIPLICAND_UNSIGNED
+#define MULTIPLICAND_ABS(x) ABS(x)
+#else
+Error, error, unrecognised OPERATION
+#endif
+#endif
+
+
+void                            /* prod = mult * small_mult; template for mpz_mul_ui / mpz_mul_si */
+#if __STDC__
+FUNCTION (mpz_ptr prod, mpz_srcptr mult,
+          MULTIPLICAND_UNSIGNED long int small_mult)
+#else
+FUNCTION (prod, mult, small_mult)
+     mpz_ptr prod;
+     mpz_srcptr mult;
+     MULTIPLICAND_UNSIGNED long int small_mult;
+#endif
+{
+  mp_size_t size = mult->_mp_size;
+  mp_size_t sign_product = size;  /* remember mult's sign before taking ABS */
+  mp_limb_t cy;
+  mp_size_t prod_size;
+  mp_ptr prod_ptr;
+
+  if (size == 0 || small_mult == 0)
+    {
+      prod->_mp_size = 0;       /* anything times zero is zero */
+      return;
+    }
+  size = ABS (size);
+
+  prod_size = size + 1;         /* +1 for a possible carry limb */
+  if (prod->_mp_alloc < prod_size)
+    _mpz_realloc (prod, prod_size);
+
+  prod_ptr = prod->_mp_d;
+
+  cy = mpn_mul_1 (prod_ptr, mult->_mp_d, size,
+                  (mp_limb_t) MULTIPLICAND_ABS (small_mult));
+  if (cy != 0)
+    {
+      prod_ptr[size] = cy;
+      size++;
+    }
+
+  /* for mul_ui, small_mult is unsigned so (small_mult < 0) folds to 0 at compile time */
+  prod->_mp_size = ((sign_product < 0) ^ (small_mult < 0)) ? -size : size;
+}
diff --git a/rts/gmp/mpz/neg.c b/rts/gmp/mpz/neg.c
new file mode 100644
index 0000000000..566c3a95aa
--- /dev/null
+++ b/rts/gmp/mpz/neg.c
@@ -0,0 +1,53 @@
+/* mpz_neg(mpz_ptr dst, mpz_ptr src) -- Assign the negated value of SRC to DST.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void                            /* w = -u */
+#if __STDC__
+mpz_neg (mpz_ptr w, mpz_srcptr u)
+#else
+mpz_neg (w, u)
+     mpz_ptr w;
+     mpz_srcptr u;
+#endif
+{
+  mp_ptr wp, up;
+  mp_size_t usize, size;
+
+  usize = u->_mp_size;
+
+  if (u != w)                   /* in-place negation needs no limb copy at all */
+    {
+      size = ABS (usize);
+
+      if (w->_mp_alloc < size)
+        _mpz_realloc (w, size);
+
+      wp = w->_mp_d;
+      up = u->_mp_d;
+
+      MPN_COPY (wp, up, size);
+    }
+
+  w->_mp_size = -usize;         /* sign lives solely in the size field */
+}
diff --git a/rts/gmp/mpz/nextprime.c b/rts/gmp/mpz/nextprime.c
new file mode 100644
index 0000000000..f024dd1206
--- /dev/null
+++ b/rts/gmp/mpz/nextprime.c
@@ -0,0 +1,120 @@
+/* mpz_nextprime(p,t) - compute the next prime > t and store that in p.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void                            /* p = smallest probable prime > t (5 Miller-Rabin reps) */
+#if __STDC__
+mpz_nextprime (mpz_ptr p, mpz_srcptr t)
+#else
+mpz_nextprime (p, t)
+     mpz_ptr p;
+     mpz_srcptr t;
+#endif
+{
+  mpz_add_ui (p, t, 1L);
+  while (! mpz_probab_prime_p (p, 5))  /* probabilistic test: composites pass with tiny probability */
+    mpz_add_ui (p, p, 1L);             /* naive linear scan; sieved version below is disabled */
+}
+
+#if 0
+/* This code is not yet tested.  Will be enabled in 3.1. */
+
+static unsigned short primes[] =   /* FIX: was "status", a typo for "static" */
+{
+3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
+101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,
+191,193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,
+281,283,293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,
+389,397,401,409,419,421,431,433,439,443,449,457,461,463,467,479,487,
+491,499,503,509,521,523,541,547,557,563,569,571,577,587,593,599,601,
+607,613,617,619,631,641,643,647,653,659,661,673,677,683,691,701,709,
+719,727,733,739,743,751,757,761,769,773,787,797,809,811,821,823,827,
+829,839,853,857,859,863,877,881,883,887,907,911,919,929,937,941,947,
+953,967,971,977,983,991,997
+};
+
+#define NUMBER_OF_PRIMES 167
+
+void
+#if __STDC__
+mpz_nextprime (mpz_ptr p, mpz_srcptr n)
+#else
+mpz_nextprime (p, n)
+     mpz_ptr p;
+     mpz_srcptr n;
+#endif
+{
+  mpz_t tmp;                    /* NOTE(review): unused — remove before enabling */
+  unsigned short *moduli;
+  unsigned long difference;
+  int i, prime_limit;           /* FIX: prime_limit was used below but never declared */
+  int composite;
+
+  /* First handle tiny numbers */
+  if (mpz_cmp_ui (n, 2) < 0)
+    {
+      mpz_set_ui (p, 2);
+      return;
+    }
+  mpz_add_ui (p, n, 1);
+  mpz_setbit (p, 0);            /* make candidate odd */
+
+  if (mpz_cmp_ui (p, 7) <= 0)
+    return;
+
+  prime_limit = NUMBER_OF_PRIMES - 1;
+  if (mpz_cmp_ui (p, primes[prime_limit]) <= 0)
+    /* Just use first three entries (3,5,7) of table for small numbers */
+    prime_limit = 3;
+  if (prime_limit)
+    {
+      /* Compute residues modulo small odd primes */
+      /* NOTE(review): TMP_ALLOC is used with no TMP_DECL/TMP_MARK/TMP_FREE in
+         this function — must be added before this code is enabled.  */
+      moduli = (unsigned short *) TMP_ALLOC (prime_limit * sizeof moduli[0]);
+      for (i = 0; i < prime_limit; i++)
+        moduli[i] = mpz_fdiv_ui (p, primes[i]);
+    }
+  for (difference = 0; ; difference += 2)
+    {
+      composite = 0;
+
+      /* First check residues */
+      for (i = 0; i < prime_limit; i++)
+        {
+          int acc, pr;
+          composite |= (moduli[i] == 0);
+          acc = moduli[i] + 2;  /* advance residues to the next odd candidate */
+          pr = primes[i];
+          moduli[i] = acc >= pr ? acc - pr : acc;
+        }
+      if (composite)
+        continue;
+
+      mpz_add_ui (p, p, difference);
+      difference = 0;
+
+      /* Miller-Rabin test */
+      if (mpz_millerrabin (p, 2))
+        break;
+    }
+}
+#endif
diff --git a/rts/gmp/mpz/out_raw.c b/rts/gmp/mpz/out_raw.c
new file mode 100644
index 0000000000..62709479c5
--- /dev/null
+++ b/rts/gmp/mpz/out_raw.c
@@ -0,0 +1,89 @@
+/* mpz_out_raw -- Output a mpz_t in binary.  Use an endianness and word size
+   independent format.
+
+Copyright (C) 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+size_t                          /* writes x in portable raw format; returns bytes written, 0 on error */
+#if __STDC__
+mpz_out_raw (FILE *stream, mpz_srcptr x)
+#else
+mpz_out_raw (stream, x)
+     FILE *stream;
+     mpz_srcptr x;
+#endif
+{
+  int i;
+  mp_size_t s;
+  mp_size_t xsize = ABS (x->_mp_size);
+  mp_srcptr xp = x->_mp_d;
+  mp_size_t out_bytesize;       /* significant-byte count; negated to encode a negative x */
+  mp_limb_t hi_limb;
+  int n_bytes_in_hi_limb;
+
+  if (stream == 0)
+    stream = stdout;            /* NULL stream defaults to stdout */
+
+  if (xsize == 0)
+    {
+      for (i = 4 - 1; i >= 0; i--)
+        fputc (0, stream);      /* zero is just a 4-byte zero size header */
+      return ferror (stream) ? 0 : 4;
+    }
+
+  hi_limb = xp[xsize - 1];
+  for (i = BYTES_PER_MP_LIMB - 1; i > 0; i--)
+    {
+      if ((hi_limb >> i * BITS_PER_CHAR) != 0)  /* find highest nonzero byte of top limb */
+        break;
+    }
+  n_bytes_in_hi_limb = i + 1;
+  out_bytesize = BYTES_PER_MP_LIMB * (xsize - 1) + n_bytes_in_hi_limb;
+  if (x->_mp_size < 0)
+    out_bytesize = -out_bytesize;
+
+  /* Make the size 4 bytes on all machines, to make the format portable. */
+  for (i = 4 - 1; i >= 0; i--)
+    fputc ((out_bytesize >> (i * BITS_PER_CHAR)) % (1 << BITS_PER_CHAR),
+           stream);
+
+  /* Output from the most significant limb to the least significant limb,
+     with each limb also output in decreasing significance order. */
+
+  /* Output the most significant limb separately, since we will only
+     output some of its bytes. */
+  for (i = n_bytes_in_hi_limb - 1; i >= 0; i--)
+    fputc ((hi_limb >> (i * BITS_PER_CHAR)) % (1 << BITS_PER_CHAR), stream);
+
+  /* Output the remaining limbs. */
+  for (s = xsize - 2; s >= 0; s--)
+    {
+      mp_limb_t x_limb;
+
+      x_limb = xp[s];
+      for (i = BYTES_PER_MP_LIMB - 1; i >= 0; i--)
+        fputc ((x_limb >> (i * BITS_PER_CHAR)) % (1 << BITS_PER_CHAR), stream);
+    }
+  return ferror (stream) ? 0 : ABS (out_bytesize) + 4;
+}
diff --git a/rts/gmp/mpz/out_str.c b/rts/gmp/mpz/out_str.c
new file mode 100644
index 0000000000..bf971b0057
--- /dev/null
+++ b/rts/gmp/mpz/out_str.c
@@ -0,0 +1,108 @@
+/* mpz_out_str(stream, base, integer) -- Output to STREAM the multi prec.
+ integer INTEGER in base BASE.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+size_t                          /* prints x in the given base; returns chars written, 0 on error */
+#if __STDC__
+mpz_out_str (FILE *stream, int base, mpz_srcptr x)
+#else
+mpz_out_str (stream, base, x)
+     FILE *stream;
+     int base;
+     mpz_srcptr x;
+#endif
+{
+  mp_ptr xp;
+  mp_size_t x_size = x->_mp_size;
+  unsigned char *str;           /* raw digit values from mpn_get_str, then printable chars */
+  size_t str_size;
+  size_t i;
+  size_t written;
+  char *num_to_text;            /* digit alphabet: lower case for base>0, upper for base<0 */
+  TMP_DECL (marker);
+
+  if (stream == 0)
+    stream = stdout;            /* NULL stream defaults to stdout */
+
+  if (base >= 0)
+    {
+      if (base == 0)
+        base = 10;              /* base 0 means decimal */
+      num_to_text = "0123456789abcdefghijklmnopqrstuvwxyz";
+    }
+  else
+    {
+      base = -base;
+      num_to_text = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+    }
+
+  if (x_size == 0)
+    {
+      fputc ('0', stream);
+      return ferror (stream) ? 0 : 1;
+    }
+
+  written = 0;
+
+  if (x_size < 0)
+    {
+      fputc ('-', stream);
+      x_size = -x_size;
+      written = 1;
+    }
+
+  TMP_MARK (marker);
+  str_size = ((size_t) (x_size * BITS_PER_MP_LIMB
+                        * __mp_bases[base].chars_per_bit_exactly)) + 3;
+  str = (unsigned char *) TMP_ALLOC (str_size);
+
+  /* Move the number to convert into temporary space, since mpn_get_str
+     clobbers its argument + needs one extra high limb.... */
+  xp = (mp_ptr) TMP_ALLOC ((x_size + 1) * BYTES_PER_MP_LIMB);
+  MPN_COPY (xp, x->_mp_d, x_size);
+
+  str_size = mpn_get_str (str, base, xp, x_size);
+
+  /* mpn_get_str might make some leading zeros.  Skip them. */
+  while (*str == 0)
+    {
+      str_size--;
+      str++;
+    }
+
+  /* Translate to printable chars. */
+  for (i = 0; i < str_size; i++)
+    str[i] = num_to_text[str[i]];
+  str[str_size] = 0;            /* buffer was sized with +3 slack, so this write is in bounds */
+
+  {
+    size_t fwret;
+    fwret = fwrite ((char *) str, 1, str_size, stream);
+    written += fwret;
+  }
+
+  TMP_FREE (marker);
+  return ferror (stream) ? 0 : written;
+}
diff --git a/rts/gmp/mpz/perfpow.c b/rts/gmp/mpz/perfpow.c
new file mode 100644
index 0000000000..e71670a0be
--- /dev/null
+++ b/rts/gmp/mpz/perfpow.c
@@ -0,0 +1,272 @@
+/* mpz_perfect_power_p(arg) -- Return non-zero if ARG is a perfect power,
+ zero otherwise.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ We are to determine if c is a perfect power, c = a ^ b.
+ Assume c is divisible by 2^n and that codd = c/2^n is odd.
+ Assume a is divisible by 2^m and that aodd = a/2^m is odd.
+ It is always true that m divides n.
+
+ * If n is prime, either 1) a is 2*aodd and b = n
+ or 2) a = c and b = 1.
+ So for n prime, we readily have a solution.
+ * If n is factorable into the non-trivial factors p1,p2,...
+ Since m divides n, m has a subset of n's factors and b = n / m.
+
+ BUG: Should handle negative numbers, since they can be odd perfect powers.
+*/
+
+/* This is a naive approach to recognizing perfect powers.
+ Many things can be improved. In particular, we should use p-adic
+ arithmetic for computing possible roots. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+static unsigned long int gcd _PROTO ((unsigned long int a, unsigned long int b));
+static int isprime _PROTO ((unsigned long int t));
+
+/* All primes below SMALLEST_OMITTED_PRIME, zero-terminated; used for
+   trial division of the odd part of the argument.  */
+static const unsigned short primes[] =
+{ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
+  59, 61, 67, 71, 73, 79, 83, 89, 97,101,103,107,109,113,127,131,
+  137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,
+  227,229,233,239,241,251,257,263,269,271,277,281,283,293,307,311,
+  313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,409,
+  419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503,
+  509,521,523,541,547,557,563,569,571,577,587,593,599,601,607,613,
+  617,619,631,641,643,647,653,659,661,673,677,683,691,701,709,719,
+  727,733,739,743,751,757,761,769,773,787,797,809,811,821,823,827,
+  829,839,853,857,859,863,877,881,883,887,907,911,919,929,937,941,
+  947,953,967,971,977,983,991,997,0
+};
+#define SMALLEST_OMITTED_PRIME 1009
+
+
+/* Return non-zero iff U is a perfect power, U = a^b with b > 1.
+   Values <= 1 return 0 (the header notes negative perfect powers are
+   not handled -- known BUG documented above).  */
+int
+#if __STDC__
+mpz_perfect_power_p (mpz_srcptr u)
+#else
+mpz_perfect_power_p (u)
+     mpz_srcptr u;
+#endif
+{
+  unsigned long int prime;
+  unsigned long int n, n2;
+  int i;
+  unsigned long int rem;
+  mpz_t u2, q;
+  int exact;
+  mp_size_t uns;
+  TMP_DECL (marker);
+
+  if (mpz_cmp_ui (u, 1) <= 0)
+    return 0;
+
+  /* n2 = multiplicity of the factor 2 in u.  If it is exactly 1, u
+     cannot be a perfect power (any exponent b>1 would give 2^b | u).  */
+  n2 = mpz_scan1 (u, 0);
+  if (n2 == 1)
+    return 0;
+
+  TMP_MARK (marker);
+
+  uns = ABSIZ (u) - n2 / BITS_PER_MP_LIMB;
+  MPZ_TMP_INIT (q, uns);
+  MPZ_TMP_INIT (u2, uns);
+
+  /* u2 = odd part of u.  */
+  mpz_tdiv_q_2exp (u2, u, n2);
+
+  if (isprime (n2))
+    goto n2prime;
+
+  /* Trial-divide u2 by small odd primes; each prime found must occur
+     to a multiplicity that keeps gcd of all multiplicities > 1.  */
+  for (i = 1; primes[i] != 0; i++)
+    {
+      prime = primes[i];
+      rem = mpz_tdiv_ui (u2, prime);
+      if (rem == 0)		/* divisable? */
+	{
+	  /* Must be divisible by prime^2 at least, else not a power.  */
+	  rem = mpz_tdiv_q_ui (q, u2, prime * prime);
+	  if (rem != 0)
+	    {
+	      TMP_FREE (marker);
+	      return 0;
+	    }
+	  mpz_swap (q, u2);
+	  /* Count the full multiplicity n of this prime in u2.  */
+	  for (n = 2;;)
+	    {
+	      rem = mpz_tdiv_q_ui (q, u2, prime);
+	      if (rem != 0)
+		break;
+	      mpz_swap (q, u2);
+	      n++;
+	    }
+
+	  n2 = gcd (n2, n);
+	  if (n2 == 1)
+	    {
+	      TMP_FREE (marker);
+	      return 0;
+	    }
+
+	  /* As soon as n2 becomes a prime number, stop factoring.
+	     Either we have u=x^n2 or u is not a perfect power.  */
+	  if (isprime (n2))
+	    goto n2prime;
+	}
+    }
+
+  /* All factors stripped: u was a product of the small primes above
+     with exponent gcd n2 > 1, hence a perfect power.  */
+  if (mpz_cmp_ui (u2, 1) == 0)
+    {
+      TMP_FREE (marker);
+      return 1;
+    }
+
+  if (n2 == 0)
+    {
+      unsigned long int nth;
+      /* We did not find any factors above.  We have to consider all values
+	 of n.  */
+      for (nth = 2;; nth++)
+	{
+	  if (! isprime (nth))
+	    continue;
+#if 0
+	  exact = mpz_padic_root (q, u2, nth, PTH);
+	  if (exact)
+#endif
+	  exact = mpz_root (q, u2, nth);
+	  if (exact)
+	    {
+	      TMP_FREE (marker);
+	      return 1;
+	    }
+	  /* Once the nth root falls below the first untested prime, no
+	     larger exponent can succeed.  */
+	  if (mpz_cmp_ui (q, SMALLEST_OMITTED_PRIME) < 0)
+	    {
+	      TMP_FREE (marker);
+	      return 0;
+	    }
+	}
+    }
+  else
+    {
+      unsigned long int nth;
+      /* We found some factors above.  We just need to consider values of n
+	 that divides n2.  */
+      for (nth = 2; nth <= n2; nth++)
+	{
+	  if (! isprime (nth))
+	    continue;
+	  if (n2 % nth != 0)
+	    continue;
+#if 0
+	  exact = mpz_padic_root (q, u2, nth, PTH);
+	  if (exact)
+#endif
+	  exact = mpz_root (q, u2, nth);
+	  if (exact)
+	    {
+	      TMP_FREE (marker);
+	      return 1;
+	    }
+	  if (mpz_cmp_ui (q, SMALLEST_OMITTED_PRIME) < 0)
+	    {
+	      TMP_FREE (marker);
+	      return 0;
+	    }
+	}
+
+      TMP_FREE (marker);
+      return 0;
+    }
+
+n2prime:
+  /* The 2-multiplicity is prime: u is a power iff the odd part is an
+     exact n2-th power.  */
+  exact = mpz_root (NULL, u2, n2);
+  TMP_FREE (marker);
+  return exact;
+}
+
+/* Binary GCD of two unsigned longs.  gcd(a,0) = a, gcd(0,b) = b.  */
+static unsigned long int
+#if __STDC__
+gcd (unsigned long int a, unsigned long int b)
+#else
+gcd (a, b)
+     unsigned long int a, b;
+#endif
+{
+  int an2, bn2, n2;
+
+  if (a == 0)
+    return b;
+  if (b == 0)
+    return a;
+
+  /* Strip factors of two from both operands, remembering the shared
+     power of two (the minimum) to restore at the end.  */
+  count_trailing_zeros (an2, a);
+  a >>= an2;
+
+  count_trailing_zeros (bn2, b);
+  b >>= bn2;
+
+  n2 = MIN (an2, bn2);
+
+  /* Both a and b are odd here; subtracting gives an even difference,
+     whose twos can be stripped, keeping the loop making progress.  */
+  while (a != b)
+    {
+      if (a > b)
+	{
+	  a -= b;
+	  do
+	    a >>= 1;
+	  while ((a & 1) == 0);
+	}
+      else /* b > a.  */
+	{
+	  b -= a;
+	  do
+	    b >>= 1;
+	  while ((b & 1) == 0);
+	}
+    }
+
+  return a << n2;
+}
+
+/* Return non-zero iff t is prime, by trial division with odd d up to
+   sqrt(t).  t = 0 and t = 1 return 0; t = 2 returns 1.  */
+static int
+#if __STDC__
+isprime (unsigned long int t)
+#else
+isprime (t)
+     unsigned long int t;
+#endif
+{
+  unsigned long int q, r, d;
+
+  if (t < 3 || (t & 1) == 0)
+    return t == 2;
+
+  /* Loop exits with r == 0 when d divides t (composite); q < d means
+     d has passed sqrt(t) without finding a divisor (prime).  */
+  for (d = 3, r = 1; r != 0; d += 2)
+    {
+      q = t / d;
+      r = t - q * d;
+      if (q < d)
+	return 1;
+    }
+  return 0;
+}
diff --git a/rts/gmp/mpz/perfsqr.c b/rts/gmp/mpz/perfsqr.c
new file mode 100644
index 0000000000..92e8d08ea9
--- /dev/null
+++ b/rts/gmp/mpz/perfsqr.c
@@ -0,0 +1,45 @@
+/* mpz_perfect_square_p(arg) -- Return non-zero if ARG is a perfect square,
+ zero otherwise.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Return non-zero iff A is a perfect square.  Delegates the real work
+   to mpn_perfect_square_p on the limb vector.  */
+int
+#if __STDC__
+mpz_perfect_square_p (mpz_srcptr a)
+#else
+mpz_perfect_square_p (a)
+     mpz_srcptr a;
+#endif
+{
+  mp_size_t asize = a->_mp_size;
+
+  /* No negative numbers are perfect squares.  */
+  if (asize < 0)
+    return 0;
+
+  /* Zero is a perfect square.  */
+  if (asize == 0)
+    return 1;
+
+  return mpn_perfect_square_p (a->_mp_d, asize);
+}
diff --git a/rts/gmp/mpz/popcount.c b/rts/gmp/mpz/popcount.c
new file mode 100644
index 0000000000..3105258e26
--- /dev/null
+++ b/rts/gmp/mpz/popcount.c
@@ -0,0 +1,42 @@
+/* mpz_popcount(mpz_ptr op) -- Population count of OP. If the operand is
+ negative, return ~0 (a novel representation of infinity).
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Population count (number of 1 bits) of U.  For negative U the count
+   is infinite in two's-complement terms, represented as ~0.  */
+unsigned long int
+#if __STDC__
+mpz_popcount (mpz_srcptr u)
+#else
+mpz_popcount (u)
+     mpz_srcptr u;
+#endif
+{
+  mp_size_t usize;
+
+  usize = u->_mp_size;
+
+  if ((usize) < 0)
+    return ~ (unsigned long int) 0;
+
+  return mpn_popcount (u->_mp_d, usize);
+}
diff --git a/rts/gmp/mpz/pow_ui.c b/rts/gmp/mpz/pow_ui.c
new file mode 100644
index 0000000000..96ca114e4d
--- /dev/null
+++ b/rts/gmp/mpz/pow_ui.c
@@ -0,0 +1,129 @@
+/* mpz_pow_ui(res, base, exp) -- Set RES to BASE**EXP.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef BERKELEY_MP
+/* r = b^e by binary (square-and-multiply) exponentiation, scanning the
+   exponent from its most significant bit downwards.  */
+void
+#if __STDC__
+mpz_pow_ui (mpz_ptr r, mpz_srcptr b, unsigned long int e)
+#else
+mpz_pow_ui (r, b, e)
+     mpz_ptr r;
+     mpz_srcptr b;
+     unsigned long int e;
+#endif
+#else /* BERKELEY_MP */
+/* Berkeley MP compatibility entry point; negative exponents yield 0.  */
+void
+#if __STDC__
+rpow (const MINT *b, signed short int e, MINT *r)
+#else
+rpow (b, e, r)
+     const MINT *b;
+     signed short int e;
+     MINT *r;
+#endif
+#endif /* BERKELEY_MP */
+{
+  mp_ptr rp, bp, tp, xp;
+  mp_size_t ralloc, rsize, bsize;
+  int cnt, i;
+  mp_limb_t blimb;
+  TMP_DECL (marker);
+
+  bsize = ABS (b->_mp_size);
+
+  /* Single out cases that give result == 0 or 1.  These tests are here
+     to simplify the general code below, not to optimize.  */
+  if (e == 0)
+    {
+      r->_mp_d[0] = 1;
+      r->_mp_size = 1;
+      return;
+    }
+  if (bsize == 0
+#ifdef BERKELEY_MP
+      || e < 0
+#endif
+      )
+    {
+      r->_mp_size = 0;
+      return;
+    }
+
+  bp = b->_mp_d;
+
+  blimb = bp[bsize - 1];
+  if (bsize == 1 && blimb < 0x100)
+    {
+      /* Estimate space requirements accurately.  Using the code from the
+	 `else' path would over-estimate space requirements wildly.  */
+      float lb = __mp_bases[blimb].chars_per_bit_exactly;
+      ralloc = 3 + ((mp_size_t) (e / lb) / BITS_PER_MP_LIMB);
+    }
+  else
+    {
+      /* Over-estimate space requirements somewhat.  */
+      count_leading_zeros (cnt, blimb);
+      ralloc = bsize * e - cnt * e / BITS_PER_MP_LIMB + 2;
+    }
+
+  TMP_MARK (marker);
+
+  /* The two areas are used to alternatingly hold the input and recieve the
+     product for mpn_mul.  (This scheme is used to fulfill the requirements
+     of mpn_mul; that the product space may not be the same as any of the
+     input operands.)  */
+  rp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB);
+  tp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB);
+
+  MPN_COPY (rp, bp, bsize);
+  rsize = bsize;
+  count_leading_zeros (cnt, e);
+
+  /* Walk exponent bits below the (implicit) leading 1: square always,
+     multiply by b when the bit is set.  */
+  for (i = BITS_PER_MP_LIMB - cnt - 2; i >= 0; i--)
+    {
+      mpn_mul_n (tp, rp, rp, rsize);
+      rsize = 2 * rsize;
+      rsize -= tp[rsize - 1] == 0;
+      xp = tp; tp = rp; rp = xp;
+
+      if ((e & ((mp_limb_t) 1 << i)) != 0)
+	{
+	  rsize = rsize + bsize - (mpn_mul (tp, rp, rsize, bp, bsize) == 0);
+	  xp = tp; tp = rp; rp = xp;
+	}
+    }
+
+  /* Now then we know the exact space requirements, reallocate if
+     necessary.  */
+  if (r->_mp_alloc < rsize)
+    _mpz_realloc (r, rsize);
+
+  MPN_COPY (r->_mp_d, rp, rsize);
+  /* Result is negative only for negative base and odd exponent.  */
+  r->_mp_size = (e & 1) == 0 || b->_mp_size >= 0 ? rsize : -rsize;
+  TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/powm.c b/rts/gmp/mpz/powm.c
new file mode 100644
index 0000000000..e6af855a71
--- /dev/null
+++ b/rts/gmp/mpz/powm.c
@@ -0,0 +1,364 @@
+/* mpz_powm(res,base,exp,mod) -- Set RES to (base**exp) mod MOD.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, Inc.
+Contributed by Paul Zimmermann.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+
+/* set c <- (a*b)/R^n mod m c has to have at least (2n) allocated limbs */
+static void
+#if __STDC__
+mpz_redc (mpz_ptr c, mpz_srcptr a, mpz_srcptr b, mpz_srcptr m, mp_limb_t Nprim)
+#else
+mpz_redc (c, a, b, m, Nprim)
+ mpz_ptr c;
+ mpz_srcptr a;
+ mpz_srcptr b;
+ mpz_srcptr m;
+ mp_limb_t Nprim;
+#endif
+{
+ mp_ptr cp, mp = PTR (m);
+ mp_limb_t cy, cout = 0;
+ mp_limb_t q;
+ size_t j, n = ABSIZ (m);
+
+ ASSERT (ALLOC (c) >= 2 * n);
+
+ mpz_mul (c, a, b);
+ cp = PTR (c);
+ j = ABSIZ (c);
+ MPN_ZERO (cp + j, 2 * n - j);
+ for (j = 0; j < n; j++)
+ {
+ q = cp[0] * Nprim;
+ cy = mpn_addmul_1 (cp, mp, n, q);
+ cout += mpn_add_1 (cp + n, cp + n, n - j, cy);
+ cp++;
+ }
+ cp -= n;
+ if (cout)
+ {
+ cy = cout - mpn_sub_n (cp, cp + n, mp, n);
+ while (cy)
+ cy -= mpn_sub_n (cp, cp, mp, n);
+ }
+ else
+ MPN_COPY (cp, cp + n, n);
+ MPN_NORMALIZE (cp, n);
+ SIZ (c) = SIZ (c) < 0 ? -n : n;
+}
+
+/* average number of calls to redc for an exponent of n bits
+ with the sliding window algorithm of base 2^k: the optimal is
+ obtained for the value of k which minimizes 2^(k-1)+n/(k+1):
+
+ n\k 4 5 6 7 8
+ 128 156* 159 171 200 261
+ 256 309 307* 316 343 403
+ 512 617 607* 610 632 688
+ 1024 1231 1204 1195* 1207 1256
+ 2048 2461 2399 2366 2360* 2396
+ 4096 4918 4787 4707 4665* 4670
+*/
+
+#ifndef BERKELEY_MP
+/* res = base^e mod mod, using a sliding-window exponentiation of window
+   width k (chosen from the exponent size), with Montgomery (REDC)
+   reduction for small odd moduli and plain mpz_mod otherwise.  */
+void
+#if __STDC__
+mpz_powm (mpz_ptr res, mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod)
+#else
+mpz_powm (res, base, e, mod)
+     mpz_ptr res;
+     mpz_srcptr base;
+     mpz_srcptr e;
+     mpz_srcptr mod;
+#endif
+#else /* BERKELEY_MP */
+/* Berkeley MP compatibility entry point for the same computation.  */
+void
+#if __STDC__
+pow (mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod, mpz_ptr res)
+#else
+pow (base, e, mod, res)
+     mpz_srcptr base;
+     mpz_srcptr e;
+     mpz_srcptr mod;
+     mpz_ptr res;
+#endif
+#endif /* BERKELEY_MP */
+{
+  mp_limb_t invm, *ep, c, mask;
+  mpz_t xx, *g;
+  mp_size_t n, i, K, j, l, k;
+  int sh;
+  int use_redc;
+
+#ifdef POWM_DEBUG
+  mpz_t exp;
+  mpz_init (exp);
+#endif
+
+  n = ABSIZ (mod);
+
+  if (n == 0)
+    DIVIDE_BY_ZERO;
+
+  if (SIZ (e) == 0)
+    {
+      /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
+	 depending on if MOD equals 1.  */
+      SIZ(res) = (ABSIZ (mod) == 1 && (PTR(mod))[0] == 1) ? 0 : 1;
+      PTR(res)[0] = 1;
+      return;
+    }
+
+  /* Use REDC instead of usual reduction for sizes < POWM_THRESHOLD.
+     In REDC each modular multiplication costs about 2*n^2 limbs operations,
+     whereas using usual reduction it costs 3*K(n), where K(n) is the cost of a
+     multiplication using Karatsuba, and a division is assumed to cost 2*K(n),
+     for example using Burnikel-Ziegler's algorithm.  This gives a theoretical
+     threshold of a*KARATSUBA_SQR_THRESHOLD, with a=(3/2)^(1/(2-ln(3)/ln(2))) ~
+     2.66.  */
+  /* For now, also disable REDC when MOD is even, as the inverse can't
+     handle that.  */
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD  ((8 * KARATSUBA_SQR_THRESHOLD) / 3)
+#endif
+
+  use_redc = (n < POWM_THRESHOLD && PTR(mod)[0] % 2 != 0);
+  if (use_redc)
+    {
+      /* invm = -1/m mod 2^BITS_PER_MP_LIMB, must have m odd */
+      modlimb_invert (invm, PTR(mod)[0]);
+      invm = -invm;
+    }
+
+  /* determines optimal value of k */
+  l = ABSIZ (e) * BITS_PER_MP_LIMB; /* number of bits of exponent */
+  k = 1;
+  K = 2;
+  while (2 * l > K * (2 + k * (3 + k)))
+    {
+      k++;
+      K *= 2;
+    }
+
+  /* g[i] caches base^(2i+1) (times R^n under REDC) for the window.  */
+  g = (mpz_t *) (*_mp_allocate_func) (K / 2 * sizeof (mpz_t));
+  /* compute x*R^n where R=2^BITS_PER_MP_LIMB */
+  mpz_init (g[0]);
+  if (use_redc)
+    {
+      mpz_mul_2exp (g[0], base, n * BITS_PER_MP_LIMB);
+      mpz_mod (g[0], g[0], mod);
+    }
+  else
+    mpz_mod (g[0], base, mod);
+
+  /* compute xx^g for odd g < 2^k */
+  mpz_init (xx);
+  if (use_redc)
+    {
+      _mpz_realloc (xx, 2 * n);
+      mpz_redc (xx, g[0], g[0], mod, invm); /* xx = x^2*R^n */
+    }
+  else
+    {
+      mpz_mul (xx, g[0], g[0]);
+      mpz_mod (xx, xx, mod);
+    }
+  for (i = 1; i < K / 2; i++)
+    {
+      mpz_init (g[i]);
+      if (use_redc)
+	{
+	  _mpz_realloc (g[i], 2 * n);
+	  mpz_redc (g[i], g[i - 1], xx, mod, invm); /* g[i] = x^(2i+1)*R^n */
+	}
+      else
+	{
+	  mpz_mul (g[i], g[i - 1], xx);
+	  mpz_mod (g[i], g[i], mod);
+	}
+    }
+
+  /* now starts the real stuff */
+  mask = (mp_limb_t) ((1<<k) - 1);
+  ep = PTR (e);
+  i = ABSIZ (e) - 1;		/* current index */
+  c = ep[i];			/* current limb */
+  count_leading_zeros (sh, c);
+  sh = BITS_PER_MP_LIMB - sh;	/* significant bits in ep[i] */
+  sh -= k;			/* index of lower bit of ep[i] to take into account */
+  if (sh < 0)
+    {				/* k-sh extra bits are needed */
+      if (i > 0)
+	{
+	  i--;
+	  c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh));
+	  sh += BITS_PER_MP_LIMB;
+	}
+    }
+  else
+    c = c >> sh;
+#ifdef POWM_DEBUG
+  printf ("-1/m mod 2^%u = %lu\n", BITS_PER_MP_LIMB, invm);
+  mpz_set_ui (exp, c);
+#endif
+  /* Strip trailing zero bits of the first window; they become squarings
+     applied after looking up the odd window value in g[].  */
+  j=0;
+  while (c % 2 == 0)
+    {
+      j++;
+      c = (c >> 1);
+    }
+  mpz_set (xx, g[c >> 1]);
+  while (j--)
+    {
+      if (use_redc)
+	mpz_redc (xx, xx, xx, mod, invm);
+      else
+	{
+	  mpz_mul (xx, xx, xx);
+	  mpz_mod (xx, xx, mod);
+	}
+    }
+
+#ifdef POWM_DEBUG
+  printf ("x^"); mpz_out_str (0, 10, exp);
+  printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx);
+  putchar ('\n');
+#endif
+
+  /* Main scan over the remaining exponent bits, k at a time.  */
+  while (i > 0 || sh > 0)
+    {
+      c = ep[i];
+      sh -= k;
+      l = k;			/* number of bits treated */
+      if (sh < 0)
+	{
+	  if (i > 0)
+	    {
+	      i--;
+	      c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh));
+	      sh += BITS_PER_MP_LIMB;
+	    }
+	  else
+	    {
+	      l += sh;		/* may be less bits than k here */
+	      c = c & ((1<<l) - 1);
+	    }
+	}
+      else
+	c = c >> sh;
+      c = c & mask;
+
+      /* this while loop implements the sliding window improvement */
+      while ((c & (1 << (k - 1))) == 0 && (i > 0 || sh > 0))
+	{
+	  if (use_redc) mpz_redc (xx, xx, xx, mod, invm);
+	  else
+	    {
+	      mpz_mul (xx, xx, xx);
+	      mpz_mod (xx, xx, mod);
+	    }
+	  if (sh)
+	    {
+	      sh--;
+	      c = (c<<1) + ((ep[i]>>sh) & 1);
+	    }
+	  else
+	    {
+	      i--;
+	      sh = BITS_PER_MP_LIMB - 1;
+	      c = (c<<1) + (ep[i]>>sh);
+	    }
+	}
+
+#ifdef POWM_DEBUG
+      printf ("l=%u c=%lu\n", l, c);
+      mpz_mul_2exp (exp, exp, k);
+      mpz_add_ui (exp, exp, c);
+#endif
+
+      /* now replace xx by xx^(2^k)*x^c */
+      if (c != 0)
+	{
+	  j = 0;
+	  while (c % 2 == 0)
+	    {
+	      j++;
+	      c = c >> 1;
+	    }
+	  /* c0 = c * 2^j, i.e. xx^(2^k)*x^c = (A^(2^(k - j))*c)^(2^j) */
+	  l -= j;
+	  while (l--)
+	    if (use_redc) mpz_redc (xx, xx, xx, mod, invm);
+	    else
+	      {
+		mpz_mul (xx, xx, xx);
+		mpz_mod (xx, xx, mod);
+	      }
+	  if (use_redc)
+	    mpz_redc (xx, xx, g[c >> 1], mod, invm);
+	  else
+	    {
+	      mpz_mul (xx, xx, g[c >> 1]);
+	      mpz_mod (xx, xx, mod);
+	    }
+	}
+      else
+	j = l;			/* case c=0 */
+      while (j--)
+	{
+	  if (use_redc)
+	    mpz_redc (xx, xx, xx, mod, invm);
+	  else
+	    {
+	      mpz_mul (xx, xx, xx);
+	      mpz_mod (xx, xx, mod);
+	    }
+	}
+#ifdef POWM_DEBUG
+      printf ("x^"); mpz_out_str (0, 10, exp);
+      printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx);
+      putchar ('\n');
+#endif
+    }
+
+  /* now convert back xx to xx/R^n */
+  if (use_redc)
+    {
+      mpz_set_ui (g[0], 1);
+      mpz_redc (xx, xx, g[0], mod, invm);
+      if (mpz_cmp (xx, mod) >= 0)
+	mpz_sub (xx, xx, mod);
+    }
+  mpz_set (res, xx);
+
+  mpz_clear (xx);
+  for (i = 0; i < K / 2; i++)
+    mpz_clear (g[i]);
+  (*_mp_free_func) (g, K / 2 * sizeof (mpz_t));
+}
diff --git a/rts/gmp/mpz/powm_ui.c b/rts/gmp/mpz/powm_ui.c
new file mode 100644
index 0000000000..00f70bd563
--- /dev/null
+++ b/rts/gmp/mpz/powm_ui.c
@@ -0,0 +1,248 @@
+/* mpz_powm_ui(res,base,exp,mod) -- Set RES to (base**exp) mod MOD.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* res = base^exp mod mod, for an unsigned long exponent, by left-to-right
+   binary exponentiation with mpn_divmod reduction against a normalized
+   (left-shifted) copy of MOD.  */
+void
+#if __STDC__
+mpz_powm_ui (mpz_ptr res, mpz_srcptr base, unsigned long int exp, mpz_srcptr mod)
+#else
+mpz_powm_ui (res, base, exp, mod)
+     mpz_ptr res;
+     mpz_srcptr base;
+     unsigned long int exp;
+     mpz_srcptr mod;
+#endif
+{
+  mp_ptr rp, mp, bp;
+  mp_size_t msize, bsize, rsize;
+  mp_size_t size;
+  int mod_shift_cnt;
+  int negative_result;
+  mp_limb_t *free_me = NULL;
+  size_t free_me_size;
+  TMP_DECL (marker);
+
+  msize = ABS (mod->_mp_size);
+  size = 2 * msize;
+
+  rp = res->_mp_d;
+
+  if (msize == 0)
+    DIVIDE_BY_ZERO;
+
+  if (exp == 0)
+    {
+      /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
+	 depending on if MOD equals 1.  */
+      res->_mp_size = (msize == 1 && (mod->_mp_d)[0] == 1) ? 0 : 1;
+      rp[0] = 1;
+      return;
+    }
+
+  TMP_MARK (marker);
+
+  /* Normalize MOD (i.e. make its most significant bit set) as required by
+     mpn_divmod.  This will make the intermediate values in the calculation
+     slightly larger, but the correct result is obtained after a final
+     reduction using the original MOD value.  */
+
+  mp = (mp_ptr) TMP_ALLOC (msize * BYTES_PER_MP_LIMB);
+  count_leading_zeros (mod_shift_cnt, mod->_mp_d[msize - 1]);
+  if (mod_shift_cnt != 0)
+    mpn_lshift (mp, mod->_mp_d, msize, mod_shift_cnt);
+  else
+    MPN_COPY (mp, mod->_mp_d, msize);
+
+  bsize = ABS (base->_mp_size);
+  if (bsize > msize)
+    {
+      /* The base is larger than the module.  Reduce it.  */
+
+      /* Allocate (BSIZE + 1) with space for remainder and quotient.
+	 (The quotient is (bsize - msize + 1) limbs.)  */
+      bp = (mp_ptr) TMP_ALLOC ((bsize + 1) * BYTES_PER_MP_LIMB);
+      MPN_COPY (bp, base->_mp_d, bsize);
+      /* We don't care about the quotient, store it above the remainder,
+	 at BP + MSIZE.  */
+      mpn_divmod (bp + msize, bp, bsize, mp, msize);
+      bsize = msize;
+      /* Canonicalize the base, since we are going to multiply with it
+	 quite a few times.  */
+      MPN_NORMALIZE (bp, bsize);
+    }
+  else
+    bp = base->_mp_d;
+
+  if (bsize == 0)
+    {
+      res->_mp_size = 0;
+      TMP_FREE (marker);
+      return;
+    }
+
+  if (res->_mp_alloc < size)
+    {
+      /* We have to allocate more space for RES.  If any of the input
+	 parameters are identical to RES, defer deallocation of the old
+	 space.  */
+
+      if (rp == mp || rp == bp)
+	{
+	  free_me = rp;
+	  free_me_size = res->_mp_alloc;
+	}
+      else
+	(*_mp_free_func) (rp, res->_mp_alloc * BYTES_PER_MP_LIMB);
+
+      rp = (mp_ptr) (*_mp_allocate_func) (size * BYTES_PER_MP_LIMB);
+      res->_mp_alloc = size;
+      res->_mp_d = rp;
+    }
+  else
+    {
+      /* Make BASE, EXP and MOD not overlap with RES.  */
+      if (rp == bp)
+	{
+	  /* RES and BASE are identical.  Allocate temp. space for BASE.  */
+	  bp = (mp_ptr) TMP_ALLOC (bsize * BYTES_PER_MP_LIMB);
+	  MPN_COPY (bp, rp, bsize);
+	}
+      if (rp == mp)
+	{
+	  /* RES and MOD are identical.  Allocate temporary space for MOD.  */
+	  mp = (mp_ptr) TMP_ALLOC (msize * BYTES_PER_MP_LIMB);
+	  MPN_COPY (mp, rp, msize);
+	}
+    }
+
+  MPN_COPY (rp, bp, bsize);
+  rsize = bsize;
+
+  {
+    mp_ptr xp = (mp_ptr) TMP_ALLOC (2 * (msize + 1) * BYTES_PER_MP_LIMB);
+    int c;
+    mp_limb_t e;
+    mp_limb_t carry_limb;
+
+    negative_result = (exp & 1) && base->_mp_size < 0;
+
+    e = exp;
+    count_leading_zeros (c, e);
+    e = (e << c) << 1;		/* shift the exp bits to the left, lose msb */
+    c = BITS_PER_MP_LIMB - 1 - c;
+
+    /* Main loop.
+
+       Make the result be pointed to alternately by XP and RP.  This
+       helps us avoid block copying, which would otherwise be necessary
+       with the overlap restrictions of mpn_divmod.  With 50% probability
+       the result after this loop will be in the area originally pointed
+       by RP (==RES->_mp_d), and with 50% probability in the area originally
+       pointed to by XP.  */
+
+    while (c != 0)
+      {
+	mp_ptr tp;
+	mp_size_t xsize;
+
+	mpn_mul_n (xp, rp, rp, rsize);
+	xsize = 2 * rsize;
+	xsize -= xp[xsize - 1] == 0;
+	if (xsize > msize)
+	  {
+	    mpn_divmod (xp + msize, xp, xsize, mp, msize);
+	    xsize = msize;
+	  }
+
+	tp = rp; rp = xp; xp = tp;
+	rsize = xsize;
+
+	/* Top bit of E set means multiply by the base this round.  */
+	if ((mp_limb_signed_t) e < 0)
+	  {
+	    mpn_mul (xp, rp, rsize, bp, bsize);
+	    xsize = rsize + bsize;
+	    xsize -= xp[xsize - 1] == 0;
+	    if (xsize > msize)
+	      {
+		mpn_divmod (xp + msize, xp, xsize, mp, msize);
+		xsize = msize;
+	      }
+
+	    tp = rp; rp = xp; xp = tp;
+	    rsize = xsize;
+	  }
+	e <<= 1;
+	c--;
+      }
+
+    /* We shifted MOD, the modulo reduction argument, left MOD_SHIFT_CNT
+       steps.  Adjust the result by reducing it with the original MOD.
+
+       Also make sure the result is put in RES->_mp_d (where it already
+       might be, see above).  */
+
+    if (mod_shift_cnt != 0)
+      {
+	carry_limb = mpn_lshift (res->_mp_d, rp, rsize, mod_shift_cnt);
+	rp = res->_mp_d;
+	if (carry_limb != 0)
+	  {
+	    rp[rsize] = carry_limb;
+	    rsize++;
+	  }
+      }
+    else
+      {
+	MPN_COPY (res->_mp_d, rp, rsize);
+	rp = res->_mp_d;
+      }
+
+    if (rsize >= msize)
+      {
+	mpn_divmod (rp + msize, rp, rsize, mp, msize);
+	rsize = msize;
+      }
+
+    /* Remove any leading zero words from the result.  */
+    if (mod_shift_cnt != 0)
+      mpn_rshift (rp, rp, rsize, mod_shift_cnt);
+    MPN_NORMALIZE (rp, rsize);
+  }
+
+  /* Negative base with odd exponent: fold the value back into the
+     canonical non-negative residue, r = mod - |r|.  */
+  if (negative_result && rsize != 0)
+    {
+      if (mod_shift_cnt != 0)
+	mpn_rshift (mp, mp, msize, mod_shift_cnt);
+      mpn_sub (rp, mp, msize, rp, rsize);
+      rsize = msize;
+      MPN_NORMALIZE (rp, rsize);
+    }
+  res->_mp_size = rsize;
+
+  if (free_me != NULL)
+    (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB);
+  TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/pprime_p.c b/rts/gmp/mpz/pprime_p.c
new file mode 100644
index 0000000000..82eb678238
--- /dev/null
+++ b/rts/gmp/mpz/pprime_p.c
@@ -0,0 +1,242 @@
+/* mpz_probab_prime_p --
+ An implementation of the probabilistic primality test found in Knuth's
+ Seminumerical Algorithms book. If the function mpz_probab_prime_p()
+ returns 0 then n is not prime. If it returns 1, then n is 'probably'
+ prime. If it returns 2, n is surely prime. The probability of a false
+ positive is (1/4)**reps, where reps is the number of internal passes of the
+ probabilistic algorithm. Knuth indicates that 25 passes are reasonable.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
+Foundation, Inc. Miller-Rabin code contributed by John Amanatides.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+static int isprime _PROTO ((unsigned long int t));
+static int mpz_millerrabin _PROTO ((mpz_srcptr n, int reps));
+
+/* Probabilistic primality test: returns 2 (surely prime), 1 (probably
+   prime, error probability <= (1/4)^reps), or 0 (composite).  Small
+   |n| <= 1000000 is decided exactly by trial division.  */
+int
+#if __STDC__
+mpz_probab_prime_p (mpz_srcptr n, int reps)
+#else
+mpz_probab_prime_p (n, reps)
+     mpz_srcptr n;
+     int reps;
+#endif
+{
+  mp_limb_t r;
+
+  /* Handle small and negative n.  */
+  if (mpz_cmp_ui (n, 1000000L) <= 0)
+    {
+      int is_prime;
+      if (mpz_sgn (n) < 0)
+	{
+	  /* Negative number.  Negate and call ourselves.  */
+	  mpz_t n2;
+	  mpz_init (n2);
+	  mpz_neg (n2, n);
+	  is_prime = mpz_probab_prime_p (n2, reps);
+	  mpz_clear (n2);
+	  return is_prime;
+	}
+      is_prime = isprime (mpz_get_ui (n));
+      return is_prime ? 2 : 0;
+    }
+
+  /* If n is now even, it is not a prime.  */
+  if ((mpz_get_ui (n) & 1) == 0)
+    return 0;
+
+  /* Check if n has small factors.  PP is a precomputed product of small
+     primes that fits one limb; a single mod gives all residues at once.  */
+  if (UDIV_TIME > (2 * UMUL_TIME + 6))
+    r = mpn_preinv_mod_1 (PTR(n), SIZ(n), (mp_limb_t) PP, (mp_limb_t) PP_INVERTED);
+  else
+    r = mpn_mod_1 (PTR(n), SIZ(n), (mp_limb_t) PP);
+  if (r % 3 == 0 || r % 5 == 0 || r % 7 == 0 || r % 11 == 0 || r % 13 == 0
+      || r % 17 == 0 || r % 19 == 0 || r % 23 == 0 || r % 29 == 0
+#if BITS_PER_MP_LIMB == 64
+      || r % 31 == 0 || r % 37 == 0 || r % 41 == 0 || r % 43 == 0
+      || r % 47 == 0 || r % 53 == 0
+#endif
+      )
+    {
+      return 0;
+    }
+
+  /* Do more dividing.  We collect small primes, using umul_ppmm, until we
+     overflow a single limb.  We divide our number by the small primes product,
+     and look for factors in the remainder.  */
+  {
+    unsigned long int ln2;
+    unsigned long int q;
+    mp_limb_t p1, p0, p;
+    unsigned int primes[15];
+    int nprimes;
+
+    nprimes = 0;
+    p = 1;
+    ln2 = mpz_sizeinbase (n, 2) / 30; ln2 = ln2 * ln2;
+    for (q = BITS_PER_MP_LIMB == 64 ? 59 : 31; q < ln2; q += 2)
+      {
+	if (isprime (q))
+	  {
+	    umul_ppmm (p1, p0, p, q);
+	    if (p1 != 0)
+	      {
+		/* Product would overflow a limb: reduce n by the batch
+		   accumulated so far and test each collected prime.  */
+		r = mpn_mod_1 (PTR(n), SIZ(n), p);
+		while (--nprimes >= 0)
+		  if (r % primes[nprimes] == 0)
+		    {
+		      /* Cross-check the direct remainder; abort() guards
+			 against an inconsistent batch computation.  */
+		      if (mpn_mod_1 (PTR(n), SIZ(n), (mp_limb_t) primes[nprimes]) != 0)
+			abort ();
+		      return 0;
+		    }
+		p = q;
+		nprimes = 0;
+	      }
+	    else
+	      {
+		p = p0;
+	      }
+	    primes[nprimes++] = q;
+	  }
+      }
+  }
+
+  /* Perform a number of Miller-Rabin tests.  */
+  return mpz_millerrabin (n, reps);
+}
+
+/* Return non-zero iff t is prime, by trial division with odd d up to
+   sqrt(t).  t = 0 and t = 1 return 0; t = 2 returns 1.  (Duplicate of
+   the helper in perfpow.c; both are file-static.)  */
+static int
+#if __STDC__
+isprime (unsigned long int t)
+#else
+isprime (t)
+     unsigned long int t;
+#endif
+{
+  unsigned long int q, r, d;
+
+  if (t < 3 || (t & 1) == 0)
+    return t == 2;
+
+  /* r == 0 stops the loop when d divides t; q < d means d passed
+     sqrt(t) with no divisor found.  */
+  for (d = 3, r = 1; r != 0; d += 2)
+    {
+      q = t / d;
+      r = t - q * d;
+      if (q < d)
+	return 1;
+    }
+  return 0;
+}
+
+static int millerrabin _PROTO ((mpz_srcptr n, mpz_srcptr nm1,
+ mpz_ptr x, mpz_ptr y,
+ mpz_srcptr q, unsigned long int k));
+
+/* Run a Fermat test with base 210 followed by REPS Miller-Rabin rounds
+   with random bases.  Returns 1 if n passed every round (probably
+   prime), 0 if any round proved n composite.  */
+static int
+#if __STDC__
+mpz_millerrabin (mpz_srcptr n, int reps)
+#else
+mpz_millerrabin (n, reps)
+     mpz_srcptr n;
+     int reps;
+#endif
+{
+  int r;
+  mpz_t nm1, x, y, q;
+  unsigned long int k;
+  gmp_randstate_t rstate;
+  int is_prime;
+  TMP_DECL (marker);
+  TMP_MARK (marker);
+
+  MPZ_TMP_INIT (nm1, SIZ (n) + 1);
+  mpz_sub_ui (nm1, n, 1L);
+
+  MPZ_TMP_INIT (x, SIZ (n));
+  MPZ_TMP_INIT (y, 2 * SIZ (n)); /* mpz_powm_ui needs excessive memory!!! */
+
+  /* Perform a Fermat test.  */
+  mpz_set_ui (x, 210L);
+  mpz_powm (y, x, nm1, n);
+  if (mpz_cmp_ui (y, 1L) != 0)
+    {
+      TMP_FREE (marker);
+      return 0;
+    }
+
+  MPZ_TMP_INIT (q, SIZ (n));
+
+  /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
+  k = mpz_scan1 (nm1, 0L);
+  mpz_tdiv_q_2exp (q, nm1, k);
+
+  gmp_randinit (rstate, GMP_RAND_ALG_DEFAULT, 32L);
+
+  is_prime = 1;
+  for (r = 0; r < reps && is_prime; r++)
+    {
+      /* Pick a random base x in [2, n-1); reject x <= 1.  */
+      do
+	mpz_urandomb (x, rstate, mpz_sizeinbase (n, 2) - 1);
+      while (mpz_cmp_ui (x, 1L) <= 0);
+
+      is_prime = millerrabin (n, nm1, x, y, q, k);
+    }
+
+  gmp_randclear (rstate);
+
+  TMP_FREE (marker);
+  return is_prime;
+}
+
+/* One Miller-Rabin round: with n-1 = 2^k * q (q odd), n passes for
+   witness x iff x^q == 1, or x^(2^i * q) == n-1 for some 0 <= i < k.
+   Returns 1 if n passes (may still be composite), 0 if proved composite.
+   X and Y are caller-supplied scratch; Y is clobbered.  */
+static int
+#if __STDC__
+millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
+             mpz_srcptr q, unsigned long int k)
+#else
+millerrabin (n, nm1, x, y, q, k)
+     mpz_srcptr n;
+     mpz_srcptr nm1;
+     mpz_ptr x;
+     mpz_ptr y;
+     mpz_srcptr q;
+     unsigned long int k;
+#endif
+{
+  unsigned long int i;
+
+  mpz_powm (y, x, q, n);
+
+  if (mpz_cmp_ui (y, 1L) == 0 || mpz_cmp (y, nm1) == 0)
+    return 1;
+
+  /* Square repeatedly; reaching n-1 passes, reaching 1 without having
+     seen n-1 first proves n composite (non-trivial sqrt of 1).  */
+  for (i = 1; i < k; i++)
+    {
+      mpz_powm_ui (y, y, 2L, n);
+      if (mpz_cmp (y, nm1) == 0)
+	return 1;
+      if (mpz_cmp_ui (y, 1L) == 0)
+	return 0;
+    }
+  return 0;
+}
diff --git a/rts/gmp/mpz/random.c b/rts/gmp/mpz/random.c
new file mode 100644
index 0000000000..60d9113991
--- /dev/null
+++ b/rts/gmp/mpz/random.c
@@ -0,0 +1,56 @@
+/* mpz_random -- Generate a random mpz_t of specified size.
+ This function is non-portable and generates poor random numbers.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "urandom.h"
+
+void /* Fill X with ABS(SIZE) limbs drawn from urandom(); the sign of the result follows SIZE. */
+#if __STDC__
+mpz_random (mpz_ptr x, mp_size_t size)
+#else
+mpz_random (x, size)
+ mpz_ptr x;
+ mp_size_t size;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t ran;
+ mp_ptr xp;
+ mp_size_t abs_size;
+
+ abs_size = ABS (size);
+
+ if (x->_mp_alloc < abs_size)
+ _mpz_realloc (x, abs_size);
+
+ xp = x->_mp_d;
+
+ for (i = 0; i < abs_size; i++)
+ {
+ ran = urandom (); /* one limb of (poor) randomness per iteration */
+ xp[i] = ran;
+ }
+
+ MPN_NORMALIZE (xp, abs_size); /* strip high zero limbs before setting the size */
+ x->_mp_size = size < 0 ? -abs_size : abs_size;
+}
diff --git a/rts/gmp/mpz/random2.c b/rts/gmp/mpz/random2.c
new file mode 100644
index 0000000000..a90af115e9
--- /dev/null
+++ b/rts/gmp/mpz/random2.c
@@ -0,0 +1,48 @@
+/* mpz_random2 -- Generate a positive random mpz_t of specified size, with
+ long runs of consecutive ones and zeros in the binary representation.
+ Meant for testing of other MP routines.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void /* Fill X with ABS(SIZE) limbs of long-run random data (for testing); sign follows SIZE. */
+#if __STDC__
+mpz_random2 (mpz_ptr x, mp_size_t size)
+#else
+mpz_random2 (x, size)
+ mpz_ptr x;
+ mp_size_t size;
+#endif
+{
+ mp_size_t abs_size;
+
+ abs_size = ABS (size);
+ if (abs_size != 0)
+ {
+ if (x->_mp_alloc < abs_size)
+ _mpz_realloc (x, abs_size);
+
+ mpn_random2 (x->_mp_d, abs_size);
+ }
+
+ x->_mp_size = size; /* no normalization: assumes mpn_random2 makes the top limb non-zero -- TODO confirm */
+}
diff --git a/rts/gmp/mpz/realloc.c b/rts/gmp/mpz/realloc.c
new file mode 100644
index 0000000000..0b9e447ec3
--- /dev/null
+++ b/rts/gmp/mpz/realloc.c
@@ -0,0 +1,52 @@
+/* _mpz_realloc -- make the mpz_t have NEW_SIZE digits allocated.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void * /* Resize M to hold NEW_SIZE limbs; returns the (possibly moved) limb pointer. */
+#if __STDC__
+_mpz_realloc (mpz_ptr m, mp_size_t new_size)
+#else
+_mpz_realloc (m, new_size)
+ mpz_ptr m;
+ mp_size_t new_size;
+#endif
+{
+ /* Never allocate zero space. */
+ if (new_size == 0)
+ new_size = 1;
+
+ m->_mp_d = (mp_ptr) (*_mp_reallocate_func) (m->_mp_d,
+ m->_mp_alloc * BYTES_PER_MP_LIMB,
+ new_size * BYTES_PER_MP_LIMB);
+ m->_mp_alloc = new_size;
+
+#if 0
+ /* This might break some code that reads the size field after
+ reallocation, in the case the reallocated destination and a
+ source argument are identical. */
+ if (ABS (m->_mp_size) > new_size)
+ m->_mp_size = 0;
+#endif
+
+ return (void *) m->_mp_d;
+}
diff --git a/rts/gmp/mpz/remove.c b/rts/gmp/mpz/remove.c
new file mode 100644
index 0000000000..bc6675f972
--- /dev/null
+++ b/rts/gmp/mpz/remove.c
@@ -0,0 +1,93 @@
+/* mpz_remove -- divide out a factor and return its multiplicity.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int /* DEST = SRC with every factor F divided out; returns the multiplicity of F in SRC. */
+#if __STDC__
+mpz_remove (mpz_ptr dest, mpz_srcptr src, mpz_srcptr f)
+#else
+mpz_remove (dest, src, f)
+ mpz_ptr dest;
+ mpz_srcptr src;
+ mpz_srcptr f;
+#endif
+{
+ mpz_t fpow[40]; /* inexhaustible...until year 2020 or so -- NOTE(review): caps the multiplicity at 2^40-1 */
+ mpz_t x, rem;
+ unsigned long int pwr;
+ int p;
+
+ if (mpz_cmp_ui (f, 1) <= 0 || mpz_sgn (src) == 0) /* f <= 1 or src == 0 is a domain error */
+ DIVIDE_BY_ZERO;
+ if (mpz_cmp_ui (f, 2) == 0) /* f == 2: answer is just the count of trailing zero bits */
+ {
+ unsigned long int s0;
+ s0 = mpz_scan1 (src, 0);
+ mpz_div_2exp (dest, src, s0);
+ return s0;
+ }
+
+ /* We could perhaps compute mpz_scan1(src,0)/mpz_scan1(f,0). It is an
+ upper bound of the result we're seeking. We could also shift down the
+ operands so that they become odd, to make intermediate values smaller. */
+
+ mpz_init (rem);
+ mpz_init (x);
+
+ pwr = 0;
+ mpz_init (fpow[0]);
+ mpz_set (fpow[0], f); /* fpow[i] = f^(2^i), built by repeated squaring below */
+ mpz_set (dest, src);
+
+ /* Divide by f, f^2, ..., f^(2^k) until we get a remainder for f^(2^k). */
+ for (p = 0;; p++)
+ {
+ mpz_tdiv_qr (x, rem, dest, fpow[p]);
+ if (SIZ (rem) != 0)
+ break;
+ mpz_init (fpow[p + 1]);
+ mpz_mul (fpow[p + 1], fpow[p], fpow[p]);
+ mpz_set (dest, x);
+ }
+
+ pwr = (1 << p) - 1; /* dest was divided by f^(2^0 + ... + 2^(p-1)) = f^(2^p - 1) */
+
+ mpz_clear (fpow[p]);
+
+ /* Divide by f^(2^(k-1)), f^(2^(k-2)), ..., f for all divisors that give a
+ zero remainder. */
+ while (--p >= 0)
+ {
+ mpz_tdiv_qr (x, rem, dest, fpow[p]);
+ if (SIZ (rem) == 0)
+ {
+ pwr += 1 << p;
+ mpz_set (dest, x);
+ }
+ mpz_clear (fpow[p]); /* each power is cleared exactly once on the way down */
+ }
+
+ mpz_clear (x);
+ mpz_clear (rem);
+ return pwr;
+}
diff --git a/rts/gmp/mpz/root.c b/rts/gmp/mpz/root.c
new file mode 100644
index 0000000000..0920bf22d3
--- /dev/null
+++ b/rts/gmp/mpz/root.c
@@ -0,0 +1,183 @@
+/* mpz_root(root, u, nth) -- Set ROOT to floor(U^(1/nth)).
+ Return an indication if the result is exact.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Naive implementation of nth root extraction. It would probably be a
+ better idea to use a division-free Newton iteration. It is insane
+ to use full precision from iteration 1. The mpz_scan1 trick compensates
+ to some extent. It would be natural to avoid representing the low zero
+ bits mpz_scan1 is counting, and at the same time call mpn directly. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+int /* R = floor(C^(1/NTH)); returns non-zero iff the root is exact. R may be NULL to only test exactness. */
+#if __STDC__
+mpz_root (mpz_ptr r, mpz_srcptr c, unsigned long int nth)
+#else
+mpz_root (r, c, nth)
+ mpz_ptr r;
+ mpz_srcptr c;
+ unsigned long int nth;
+#endif
+{
+ mpz_t x, t0, t1, t2;
+ __mpz_struct ccs, *cc = &ccs;
+ unsigned long int nbits;
+ int bit;
+ int exact;
+ int i;
+ unsigned long int lowz;
+ unsigned long int rl;
+
+ /* even roots of negatives provoke an exception */
+ if (mpz_sgn (c) < 0 && (nth & 1) == 0)
+ SQRT_OF_NEGATIVE;
+
+ /* root extraction interpreted as c^(1/nth) means a zeroth root should
+ provoke a divide by zero, do this even if c==0 */
+ if (nth == 0)
+ DIVIDE_BY_ZERO;
+
+ if (mpz_sgn (c) == 0)
+ {
+ if (r != NULL)
+ mpz_set_ui (r, 0);
+ return 1; /* exact result */
+ }
+
+ PTR(cc) = PTR(c); /* cc = |c|, sharing c's limbs (no copy) */
+ SIZ(cc) = ABSIZ(c);
+
+ nbits = (mpz_sizeinbase (cc, 2) - 1) / nth; /* approximate bit length of the root */
+ if (nbits == 0) /* root is 1 (or -1); exact only for |c| == 1 */
+ {
+ if (r != NULL)
+ mpz_set_ui (r, 1);
+ if (mpz_sgn (c) < 0)
+ {
+ if (r != NULL)
+ SIZ(r) = -SIZ(r);
+ return mpz_cmp_si (c, -1L) == 0;
+ }
+ return mpz_cmp_ui (c, 1L) == 0;
+ }
+
+ mpz_init (x);
+ mpz_init (t0);
+ mpz_init (t1);
+ mpz_init (t2);
+
+ /* Create a one-bit approximation. */
+ mpz_set_ui (x, 0);
+ mpz_setbit (x, nbits);
+
+ /* Make the approximation better, one bit at a time. This odd-looking
+ termination criteria makes large nth get better initial approximation,
+ which avoids slow convergence for such values. */
+ bit = nbits - 1;
+ for (i = 1; (nth >> i) != 0; i++)
+ {
+ mpz_setbit (x, bit);
+ mpz_tdiv_q_2exp (t0, x, bit); /* strip low zero bits so the power stays small */
+ mpz_pow_ui (t1, t0, nth);
+ mpz_mul_2exp (t1, t1, bit * nth);
+ if (mpz_cmp (cc, t1) < 0) /* overshot: clear the trial bit again */
+ mpz_clrbit (x, bit);
+
+ bit--; /* check/set next bit */
+ if (bit < 0)
+ {
+ /* We're done. */
+ mpz_pow_ui (t1, x, nth);
+ goto done;
+ }
+ }
+ mpz_setbit (x, bit);
+ mpz_set_ui (t2, 0); mpz_setbit (t2, bit); mpz_add (x, x, t2); /* round the approximation up */
+
+#if DEBUG
+ /* Check that the starting approximation is >= than the root. */
+ mpz_pow_ui (t1, x, nth);
+ if (mpz_cmp (cc, t1) >= 0)
+ abort ();
+#endif
+
+ mpz_add_ui (x, x, 1);
+
+ /* Main loop */ /* Newton step for x^nth = cc: x -= (x - cc/x^(nth-1)) / nth */
+ do
+ {
+ lowz = mpz_scan1 (x, 0); /* factor out trailing zero bits of x for the power */
+ mpz_tdiv_q_2exp (t0, x, lowz);
+ mpz_pow_ui (t1, t0, nth - 1);
+ mpz_mul_2exp (t1, t1, lowz * (nth - 1)); /* t1 = x^(nth-1) */
+ mpz_tdiv_q (t2, cc, t1);
+ mpz_sub (t2, x, t2);
+ rl = mpz_tdiv_q_ui (t2, t2, nth); /* rl = remainder of the correction division */
+ mpz_sub (x, x, t2);
+ }
+ while (mpz_sgn (t2) != 0); /* iterate until the correction is zero */
+
+ /* If we got a non-zero remainder in the last division, we know our root
+ is too large. */
+ mpz_sub_ui (x, x, (mp_limb_t) (rl != 0));
+
+ /* Adjustment loop. If we spend more care on rounding in the loop above,
+ we could probably get rid of this, or greatly simplify it. */
+ {
+ int bad = 0;
+ lowz = mpz_scan1 (x, 0);
+ mpz_tdiv_q_2exp (t0, x, lowz);
+ mpz_pow_ui (t1, t0, nth);
+ mpz_mul_2exp (t1, t1, lowz * nth); /* t1 = x^nth */
+ while (mpz_cmp (cc, t1) < 0) /* while x^nth > cc, step x down */
+ {
+ bad++;
+ if (bad > 2)
+ abort (); /* abort if our root is far off */
+ mpz_sub_ui (x, x, 1);
+ lowz = mpz_scan1 (x, 0);
+ mpz_tdiv_q_2exp (t0, x, lowz);
+ mpz_pow_ui (t1, t0, nth);
+ mpz_mul_2exp (t1, t1, lowz * nth);
+ }
+ }
+
+ done:
+ exact = mpz_cmp (t1, cc) == 0; /* t1 holds x^nth on every path reaching here */
+
+ if (r != NULL)
+ {
+ mpz_set (r, x);
+ if (mpz_sgn (c) < 0)
+ SIZ(r) = -SIZ(r); /* restore the sign for odd roots of negatives */
+ }
+
+ mpz_clear (t2);
+ mpz_clear (t1);
+ mpz_clear (t0);
+ mpz_clear (x);
+
+ return exact;
+}
diff --git a/rts/gmp/mpz/rrandomb.c b/rts/gmp/mpz/rrandomb.c
new file mode 100644
index 0000000000..7d78243674
--- /dev/null
+++ b/rts/gmp/mpz/rrandomb.c
@@ -0,0 +1,117 @@
+/* mpz_rrandomb -- Generate a positive random mpz_t of specified bit size, with
+ long runs of consecutive ones and zeros in the binary representation.
+ Meant for testing of other MP routines.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+static void gmp_rrandomb _PROTO ((mp_ptr rp, gmp_randstate_t rstate, unsigned long int nbits));
+
+void /* Set X to an NBITS-bit random number with long runs of ones/zeros (for testing). */
+#if __STDC__
+mpz_rrandomb (mpz_ptr x, gmp_randstate_t rstate, unsigned long int nbits)
+#else
+mpz_rrandomb (x, rstate, nbits)
+ mpz_ptr x;
+ gmp_randstate_t rstate;
+ unsigned long int nbits;
+#endif
+{
+ mp_size_t nl = 0;
+
+ if (nbits != 0)
+ {
+ mp_ptr xp;
+ nl = (nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB; /* limbs needed, rounded up */
+ if (x->_mp_alloc < nl)
+ _mpz_realloc (x, nl);
+
+ xp = PTR(x);
+ gmp_rrandomb (xp, rstate, nbits);
+ MPN_NORMALIZE (xp, nl); /* the top limb may come out zero */
+ }
+
+ SIZ(x) = nl; /* nbits == 0 yields zero */
+}
+
+#define BITS_PER_CHUNK 4
+
+static void /* Fill RP with NBITS bits built from alternating runs of ones and zeros, run length 1..16 bits. */
+#if __STDC__
+gmp_rrandomb (mp_ptr rp, gmp_randstate_t rstate, unsigned long int nbits)
+#else
+gmp_rrandomb (rp, rstate, nbits)
+ mp_ptr rp;
+ gmp_randstate_t rstate;
+ unsigned long int nbits;
+#endif
+{
+ int nb;
+ int bit_pos;
+ mp_size_t limb_pos;
+ mp_limb_t ran, ranm;
+ mp_limb_t acc;
+ mp_size_t n; /* NOTE(review): unused */
+
+ bit_pos = nbits % BITS_PER_MP_LIMB; /* position just above the most significant bit */
+ limb_pos = nbits / BITS_PER_MP_LIMB;
+ if (bit_pos == 0)
+ {
+ bit_pos = BITS_PER_MP_LIMB;
+ limb_pos--;
+ }
+
+ acc = 0; /* partial limb being assembled, from the top down */
+ while (limb_pos >= 0)
+ {
+ _gmp_rand (&ranm, rstate, BITS_PER_CHUNK + 1); /* 5 random bits: 4 for run length, 1 for value */
+ ran = ranm;
+ nb = (ran >> 1) + 1; /* run length in 1..16 */
+ if ((ran & 1) != 0)
+ {
+ /* Generate a string of ones. */
+ if (nb > bit_pos) /* run crosses the limb boundary */
+ {
+ rp[limb_pos--] = acc | ((((mp_limb_t) 1) << bit_pos) - 1);
+ bit_pos += BITS_PER_MP_LIMB;
+ bit_pos -= nb;
+ acc = (~(mp_limb_t) 0) << bit_pos; /* remainder of the run goes in the next limb */
+ }
+ else
+ {
+ bit_pos -= nb;
+ acc |= ((((mp_limb_t) 1) << nb) - 1) << bit_pos;
+ }
+ }
+ else
+ {
+ /* Generate a string of zeroes. */
+ if (nb > bit_pos) /* run crosses the limb boundary */
+ {
+ rp[limb_pos--] = acc;
+ acc = 0;
+ bit_pos += BITS_PER_MP_LIMB;
+ }
+ bit_pos -= nb;
+ }
+ }
+}
diff --git a/rts/gmp/mpz/scan0.c b/rts/gmp/mpz/scan0.c
new file mode 100644
index 0000000000..6c59cf8939
--- /dev/null
+++ b/rts/gmp/mpz/scan0.c
@@ -0,0 +1,35 @@
+/* mpz_scan0(op, startbit) -- Scan for the next clear bit, starting at startbit.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int /* Index of the first clear bit in U at or above STARTING_BIT. */
+#if __STDC__
+mpz_scan0 (mpz_srcptr u, unsigned long int starting_bit)
+#else
+mpz_scan0 (u, starting_bit)
+ mpz_srcptr u;
+ unsigned long int starting_bit;
+#endif
+{
+ return mpn_scan0 (u->_mp_d, starting_bit); /* NOTE(review): only the limb data is examined; sign and size of U are ignored -- verify callers keep STARTING_BIT in range */
+}
diff --git a/rts/gmp/mpz/scan1.c b/rts/gmp/mpz/scan1.c
new file mode 100644
index 0000000000..3b84e3420c
--- /dev/null
+++ b/rts/gmp/mpz/scan1.c
@@ -0,0 +1,35 @@
+/* mpz_scan1(op, startbit) -- Scan for the next set bit, starting at startbit.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int /* Index of the first set bit in U at or above STARTING_BIT. */
+#if __STDC__
+mpz_scan1 (mpz_srcptr u, unsigned long int starting_bit)
+#else
+mpz_scan1 (u, starting_bit)
+ mpz_srcptr u;
+ unsigned long int starting_bit;
+#endif
+{
+ return mpn_scan1 (u->_mp_d, starting_bit); /* NOTE(review): only the limb data is examined; sign and size of U are ignored -- verify callers keep STARTING_BIT in range */
+}
diff --git a/rts/gmp/mpz/set.c b/rts/gmp/mpz/set.c
new file mode 100644
index 0000000000..06b2eef511
--- /dev/null
+++ b/rts/gmp/mpz/set.c
@@ -0,0 +1,48 @@
+/* mpz_set (dest_integer, src_integer) -- Assign DEST_INTEGER from SRC_INTEGER.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void /* W := U (copy limbs, keep U's signed size). */
+#if __STDC__
+mpz_set (mpz_ptr w, mpz_srcptr u)
+#else
+mpz_set (w, u)
+ mpz_ptr w;
+ mpz_srcptr u;
+#endif
+{
+ mp_ptr wp, up;
+ mp_size_t usize, size;
+
+ usize = u->_mp_size; /* signed size carries the sign */
+ size = ABS (usize);
+
+ if (w->_mp_alloc < size)
+ _mpz_realloc (w, size);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ MPN_COPY (wp, up, size);
+ w->_mp_size = usize;
+}
diff --git a/rts/gmp/mpz/set_d.c b/rts/gmp/mpz/set_d.c
new file mode 100644
index 0000000000..e90ed9bc2f
--- /dev/null
+++ b/rts/gmp/mpz/set_d.c
@@ -0,0 +1,96 @@
+/* mpz_set_d(integer, val) -- Assign INTEGER with a double value VAL.
+
+Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void /* R := trunc(D), the double's integer part with D's sign. */
+#if __STDC__
+mpz_set_d (mpz_ptr r, double d)
+#else
+mpz_set_d (r, d)
+ mpz_ptr r;
+ double d;
+#endif
+{
+ int negative;
+ mp_limb_t tp[3];
+ mp_ptr rp;
+ mp_size_t rn;
+
+ negative = d < 0;
+ d = ABS (d);
+
+ /* Handle small arguments quickly. */
+ if (d < MP_BASE_AS_DOUBLE) /* fits in a single limb */
+ {
+ mp_limb_t tmp;
+ tmp = d;
+ PTR(r)[0] = tmp; /* relies on _mp_alloc >= 1, guaranteed by the rest of the package */
+ SIZ(r) = negative ? -(tmp != 0) : (tmp != 0);
+ return;
+ }
+
+ rn = __gmp_extract_double (tp, d); /* tp[] = mantissa limbs, rn = total limb count of the result */
+
+ if (ALLOC(r) < rn)
+ _mpz_realloc (r, rn);
+
+ rp = PTR (r);
+
+#if BITS_PER_MP_LIMB == 32
+ switch (rn)
+ {
+ default: /* low limbs beyond the mantissa are zero */
+ MPN_ZERO (rp, rn - 3);
+ rp += rn - 3;
+ /* fall through */
+ case 3:
+ rp[2] = tp[2];
+ rp[1] = tp[1];
+ rp[0] = tp[0];
+ break;
+ case 2:
+ rp[1] = tp[2];
+ rp[0] = tp[1];
+ break;
+ case 1:
+ /* handled in "small arguments" case above */
+ abort ();
+ }
+#else
+ switch (rn)
+ {
+ default: /* low limbs beyond the mantissa are zero */
+ MPN_ZERO (rp, rn - 2);
+ rp += rn - 2;
+ /* fall through */
+ case 2:
+ rp[1] = tp[1], rp[0] = tp[0];
+ break;
+ case 1:
+ /* handled in "small arguments" case above */
+ abort ();
+ }
+#endif
+
+ SIZ(r) = negative ? -rn : rn;
+}
diff --git a/rts/gmp/mpz/set_f.c b/rts/gmp/mpz/set_f.c
new file mode 100644
index 0000000000..2273953dfd
--- /dev/null
+++ b/rts/gmp/mpz/set_f.c
@@ -0,0 +1,64 @@
+/* mpz_set_f (dest_integer, src_float) -- Assign DEST_INTEGER from SRC_FLOAT.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void /* W := trunc(U): keep the EXP most significant limbs of the float, truncating toward zero. */
+#if __STDC__
+mpz_set_f (mpz_ptr w, mpf_srcptr u)
+#else
+mpz_set_f (w, u)
+ mpz_ptr w;
+ mpf_srcptr u;
+#endif
+{
+ mp_ptr wp, up;
+ mp_size_t usize, size;
+ mp_exp_t exp;
+
+ usize = SIZ (u);
+ size = ABS (usize);
+ exp = EXP (u); /* exponent in limbs: the integer part occupies EXP limbs */
+
+ if (w->_mp_alloc < exp)
+ _mpz_realloc (w, exp);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ if (exp <= 0) /* |u| < 1: integer part is zero */
+ {
+ SIZ (w) = 0;
+ return;
+ }
+ if (exp < size) /* fraction limbs present: drop the low size-exp limbs */
+ {
+ MPN_COPY (wp, up + size - exp, exp);
+ }
+ else /* u is an integer with exp-size implicit low zero limbs */
+ {
+ MPN_ZERO (wp, exp - size);
+ MPN_COPY (wp + exp - size, up, size);
+ }
+
+ w->_mp_size = usize >= 0 ? exp : -exp;
+}
diff --git a/rts/gmp/mpz/set_q.c b/rts/gmp/mpz/set_q.c
new file mode 100644
index 0000000000..72d3222a80
--- /dev/null
+++ b/rts/gmp/mpz/set_q.c
@@ -0,0 +1,36 @@
+/* mpz_set_q (dest_integer, src_rational) -- Assign DEST_INTEGER from
+ SRC_RATIONAL.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void /* W := num(U) / den(U), truncated toward zero. */
+#if __STDC__
+mpz_set_q (mpz_ptr w, mpq_srcptr u)
+#else
+mpz_set_q (w, u)
+ mpz_ptr w;
+ mpq_srcptr u;
+#endif
+{
+ mpz_tdiv_q (w, mpq_numref (u), mpq_denref (u)); /* truncating division of numerator by denominator */
+}
diff --git a/rts/gmp/mpz/set_si.c b/rts/gmp/mpz/set_si.c
new file mode 100644
index 0000000000..9ba2fbaf30
--- /dev/null
+++ b/rts/gmp/mpz/set_si.c
@@ -0,0 +1,48 @@
+/* mpz_set_si(integer, val) -- Assign INTEGER with a small value VAL.
+
+Copyright (C) 1991, 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void /* DEST := VAL (signed single-limb assignment). */
+#if __STDC__
+mpz_set_si (mpz_ptr dest, signed long int val)
+#else
+mpz_set_si (dest, val)
+ mpz_ptr dest;
+ signed long int val;
+#endif
+{
+ /* We don't check if the allocation is enough, since the rest of the
+ package ensures it's at least 1, which is what we need here. */
+ if (val > 0)
+ {
+ dest->_mp_d[0] = val;
+ dest->_mp_size = 1;
+ }
+ else if (val < 0)
+ {
+ dest->_mp_d[0] = (unsigned long) -val; /* NOTE(review): -val overflows signed arithmetic for val == LONG_MIN -- confirm against later GMP which negates in unsigned */
+ dest->_mp_size = -1;
+ }
+ else
+ dest->_mp_size = 0;
+}
diff --git a/rts/gmp/mpz/set_str.c b/rts/gmp/mpz/set_str.c
new file mode 100644
index 0000000000..3ab79c0e89
--- /dev/null
+++ b/rts/gmp/mpz/set_str.c
@@ -0,0 +1,157 @@
+/* mpz_set_str(mp_dest, string, base) -- Convert the \0-terminated
+ string STRING in base BASE to multiple precision integer in
+ MP_DEST. Allow white space in the string. If BASE == 0 determine
+ the base in the C standard way, i.e. 0xhh...h means base 16,
+ 0oo...o means base 8, otherwise assume base 10.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <string.h>
+#include <ctype.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+static int /* Value of character C as a digit in BASE, or -1 if C is not a valid digit. */
+#if __STDC__
+digit_value_in_base (int c, int base)
+#else
+digit_value_in_base (c, base)
+ int c;
+ int base;
+#endif
+{
+ int digit;
+
+ if (isdigit (c))
+ digit = c - '0';
+ else if (islower (c))
+ digit = c - 'a' + 10;
+ else if (isupper (c))
+ digit = c - 'A' + 10; /* upper and lower case are both accepted */
+ else
+ return -1;
+
+ if (digit < base)
+ return digit;
+ return -1;
+}
+
+int /* Parse STR in BASE (0 = auto-detect 0x/0b/0 prefixes) into X; returns 0 on success, -1 on bad input. */
+#if __STDC__
+mpz_set_str (mpz_ptr x, const char *str, int base)
+#else
+mpz_set_str (x, str, base)
+ mpz_ptr x;
+ const char *str;
+ int base;
+#endif
+{
+ size_t str_size;
+ char *s, *begs;
+ size_t i;
+ mp_size_t xsize;
+ int c;
+ int negative;
+ TMP_DECL (marker);
+
+ /* Skip whitespace. */
+ do
+ c = *str++;
+ while (isspace (c));
+
+ negative = 0;
+ if (c == '-')
+ {
+ negative = 1;
+ c = *str++;
+ }
+
+ if (digit_value_in_base (c, base == 0 ? 10 : base) < 0)
+ return -1; /* error if no digits */
+
+ /* If BASE is 0, try to find out the base by looking at the initial
+ characters. */
+ if (base == 0)
+ {
+ base = 10;
+ if (c == '0')
+ {
+ base = 8; /* leading 0: octal, unless 0x/0X (hex) or 0b/0B (binary) follows */
+ c = *str++;
+ if (c == 'x' || c == 'X')
+ {
+ base = 16;
+ c = *str++;
+ }
+ else if (c == 'b' || c == 'B')
+ {
+ base = 2;
+ c = *str++;
+ }
+ }
+ }
+
+ /* Skip leading zeros. */
+ while (c == '0')
+ c = *str++;
+ /* Make sure the string does not become empty, mpn_set_str would fail. */
+ if (c == 0)
+ {
+ x->_mp_size = 0; /* input was all zeros */
+ return 0;
+ }
+
+ TMP_MARK (marker);
+ str_size = strlen (str - 1); /* str-1 points at c, the first significant character */
+ s = begs = (char *) TMP_ALLOC (str_size + 1);
+
+ /* Remove spaces from the string and convert the result from ASCII to a
+ byte array. */
+ for (i = 0; i < str_size; i++)
+ {
+ if (!isspace (c))
+ {
+ int dig = digit_value_in_base (c, base);
+ if (dig < 0)
+ {
+ TMP_FREE (marker);
+ return -1;
+ }
+ *s++ = dig;
+ }
+ c = *str++;
+ }
+
+ str_size = s - begs; /* number of digits actually kept */
+
+ xsize = (((mp_size_t) (str_size / __mp_bases[base].chars_per_bit_exactly))
+ / BITS_PER_MP_LIMB + 2); /* upper bound on limbs needed */
+ if (x->_mp_alloc < xsize)
+ _mpz_realloc (x, xsize);
+
+ /* Convert the byte array in base BASE to our bignum format. */
+ xsize = mpn_set_str (x->_mp_d, (unsigned char *) begs, str_size, base);
+ x->_mp_size = negative ? -xsize : xsize;
+
+ TMP_FREE (marker);
+ return 0;
+}
diff --git a/rts/gmp/mpz/set_ui.c b/rts/gmp/mpz/set_ui.c
new file mode 100644
index 0000000000..d6097c170a
--- /dev/null
+++ b/rts/gmp/mpz/set_ui.c
@@ -0,0 +1,43 @@
+/* mpz_set_ui(integer, val) -- Assign INTEGER with a small value VAL.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void /* DEST := VAL (unsigned single-limb assignment). */
+#if __STDC__
+mpz_set_ui (mpz_ptr dest, unsigned long int val)
+#else
+mpz_set_ui (dest, val)
+ mpz_ptr dest;
+ unsigned long int val;
+#endif
+{
+ /* We don't check if the allocation is enough, since the rest of the
+ package ensures it's at least 1, which is what we need here. */
+ if (val > 0)
+ {
+ dest->_mp_d[0] = val;
+ dest->_mp_size = 1;
+ }
+ else
+ dest->_mp_size = 0; /* zero is represented by size 0, not a zero limb */
+}
diff --git a/rts/gmp/mpz/setbit.c b/rts/gmp/mpz/setbit.c
new file mode 100644
index 0000000000..d4249a434e
--- /dev/null
+++ b/rts/gmp/mpz/setbit.c
@@ -0,0 +1,119 @@
+/* mpz_setbit -- set a specified bit.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1997, 1999 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_setbit (mpz_ptr d, unsigned long int bit_index)
+#else
+mpz_setbit (d, bit_index)
+ mpz_ptr d;
+ unsigned long int bit_index;
+#endif
+{
+ mp_size_t dsize = d->_mp_size;
+ mp_ptr dp = d->_mp_d;
+ mp_size_t limb_index;
+
+ limb_index = bit_index / BITS_PER_MP_LIMB;
+ if (dsize >= 0)
+ {
+ if (limb_index < dsize)
+ {
+ dp[limb_index] |= (mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB);
+ d->_mp_size = dsize;
+ }
+ else
+ {
+ /* Ugh. The bit should be set outside of the end of the
+ number. We have to increase the size of the number. */
+ if (d->_mp_alloc < limb_index + 1)
+ {
+ _mpz_realloc (d, limb_index + 1);
+ dp = d->_mp_d;
+ }
+ MPN_ZERO (dp + dsize, limb_index - dsize);
+ dp[limb_index] = (mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB);
+ d->_mp_size = limb_index + 1;
+ }
+ }
+ else
+ {
+ mp_size_t zero_bound;
+
+ /* Simulate two's complement arithmetic, i.e. simulate
+ 1. Set OP = ~(OP - 1) [with infinitely many leading ones].
+ 2. Set the bit.
+ 3. Set OP = ~OP + 1. */
+
+ dsize = -dsize;
+
+ /* No upper bound on this loop, we're sure there's a non-zero limb
+     sooner or later. */
+ for (zero_bound = 0; ; zero_bound++)
+ if (dp[zero_bound] != 0)
+ break;
+
+ if (limb_index > zero_bound)
+ {
+ if (limb_index < dsize)
+ dp[limb_index] &= ~((mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB));
+ else
+ ;
+ }
+ else if (limb_index == zero_bound)
+ {
+ dp[limb_index] = ((dp[limb_index] - 1)
+ & ~((mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB))) + 1;
+ if (dp[limb_index] == 0)
+ {
+ mp_size_t i;
+ for (i = limb_index + 1; i < dsize; i++)
+ {
+ dp[i] += 1;
+ if (dp[i] != 0)
+ goto fin;
+ }
+ /* We got carry all way out beyond the end of D. Increase
+ its size (and allocation if necessary). */
+ dsize++;
+ if (d->_mp_alloc < dsize)
+ {
+ _mpz_realloc (d, dsize);
+ dp = d->_mp_d;
+ }
+ dp[i] = 1;
+ d->_mp_size = -dsize;
+ fin:;
+ }
+ }
+ else
+ {
+ mpn_decr_u (dp + limb_index,
+ (mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB));
+ dsize -= dp[dsize - 1] == 0;
+ d->_mp_size = -dsize;
+ }
+ }
+}
diff --git a/rts/gmp/mpz/size.c b/rts/gmp/mpz/size.c
new file mode 100644
index 0000000000..6574756783
--- /dev/null
+++ b/rts/gmp/mpz/size.c
@@ -0,0 +1,35 @@
+/* mpz_size(x) -- return the number of limbs currently used by the
+ value of integer X.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+size_t
+#if __STDC__
+mpz_size (mpz_srcptr x)
+#else
+mpz_size (x)
+ mpz_srcptr x;
+#endif
+{
+ return ABS (x->_mp_size);
+}
diff --git a/rts/gmp/mpz/sizeinbase.c b/rts/gmp/mpz/sizeinbase.c
new file mode 100644
index 0000000000..734f9c4532
--- /dev/null
+++ b/rts/gmp/mpz/sizeinbase.c
@@ -0,0 +1,60 @@
+/* mpz_sizeinbase(x, base) -- return an approximation to the number of
+   characters the integer X would have printed in base BASE.  The
+ approximation is never too small.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+size_t
+#if __STDC__
+mpz_sizeinbase (mpz_srcptr x, int base)
+#else
+mpz_sizeinbase (x, base)
+ mpz_srcptr x;
+ int base;
+#endif
+{
+ mp_size_t size = ABS (x->_mp_size);
+ int lb_base, cnt;
+ size_t totbits;
+
+ /* Special case for X == 0. */
+ if (size == 0)
+ return 1;
+
+ /* Calculate the total number of significant bits of X. */
+ count_leading_zeros (cnt, x->_mp_d[size - 1]);
+ totbits = size * BITS_PER_MP_LIMB - cnt;
+
+ if ((base & (base - 1)) == 0)
+ {
+ /* Special case for powers of 2, giving exact result. */
+
+ count_leading_zeros (lb_base, base);
+ lb_base = BITS_PER_MP_LIMB - lb_base - 1;
+
+ return (totbits + lb_base - 1) / lb_base;
+ }
+ else
+ return (size_t) (totbits * __mp_bases[base].chars_per_bit_exactly) + 1;
+}
diff --git a/rts/gmp/mpz/sqrt.c b/rts/gmp/mpz/sqrt.c
new file mode 100644
index 0000000000..fe82fe407a
--- /dev/null
+++ b/rts/gmp/mpz/sqrt.c
@@ -0,0 +1,86 @@
+/* mpz_sqrt(root, u) -- Set ROOT to floor(sqrt(U)).
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_sqrt (mpz_ptr root, mpz_srcptr op)
+#else
+mpz_sqrt (root, op)
+ mpz_ptr root;
+ mpz_srcptr op;
+#endif
+{
+ mp_size_t op_size, root_size;
+ mp_ptr root_ptr, op_ptr;
+ mp_ptr free_me = NULL;
+ mp_size_t free_me_size;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ op_size = op->_mp_size;
+ if (op_size < 0)
+ SQRT_OF_NEGATIVE;
+
+ /* The size of the root is accurate after this simple calculation. */
+ root_size = (op_size + 1) / 2;
+
+ root_ptr = root->_mp_d;
+ op_ptr = op->_mp_d;
+
+ if (root->_mp_alloc < root_size)
+ {
+ if (root_ptr == op_ptr)
+ {
+ free_me = root_ptr;
+ free_me_size = root->_mp_alloc;
+ }
+ else
+ (*_mp_free_func) (root_ptr, root->_mp_alloc * BYTES_PER_MP_LIMB);
+
+ root->_mp_alloc = root_size;
+ root_ptr = (mp_ptr) (*_mp_allocate_func) (root_size * BYTES_PER_MP_LIMB);
+ root->_mp_d = root_ptr;
+ }
+ else
+ {
+ /* Make OP not overlap with ROOT. */
+ if (root_ptr == op_ptr)
+ {
+ /* ROOT and OP are identical. Allocate temporary space for OP. */
+ op_ptr = (mp_ptr) TMP_ALLOC (op_size * BYTES_PER_MP_LIMB);
+ /* Copy to the temporary space. Hack: Avoid temporary variable
+ by using ROOT_PTR. */
+ MPN_COPY (op_ptr, root_ptr, op_size);
+ }
+ }
+
+ mpn_sqrtrem (root_ptr, NULL, op_ptr, op_size);
+
+ root->_mp_size = root_size;
+
+ if (free_me != NULL)
+ (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB);
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/sqrtrem.c b/rts/gmp/mpz/sqrtrem.c
new file mode 100644
index 0000000000..99a6453122
--- /dev/null
+++ b/rts/gmp/mpz/sqrtrem.c
@@ -0,0 +1,111 @@
+/* mpz_sqrtrem(root,rem,x) -- Set ROOT to floor(sqrt(X)) and REM
+ to the remainder, i.e. X - ROOT**2.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+#ifndef BERKELEY_MP
+void
+#if __STDC__
+mpz_sqrtrem (mpz_ptr root, mpz_ptr rem, mpz_srcptr op)
+#else
+mpz_sqrtrem (root, rem, op)
+ mpz_ptr root;
+ mpz_ptr rem;
+ mpz_srcptr op;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+msqrt (mpz_srcptr op, mpz_ptr root, mpz_ptr rem)
+#else
+msqrt (op, root, rem)
+ mpz_srcptr op;
+ mpz_ptr root;
+ mpz_ptr rem;
+#endif
+#endif /* BERKELEY_MP */
+{
+ mp_size_t op_size, root_size, rem_size;
+ mp_ptr root_ptr, op_ptr;
+ mp_ptr free_me = NULL;
+ mp_size_t free_me_size;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ op_size = op->_mp_size;
+ if (op_size < 0)
+ SQRT_OF_NEGATIVE;
+
+ if (rem->_mp_alloc < op_size)
+ _mpz_realloc (rem, op_size);
+
+ /* The size of the root is accurate after this simple calculation. */
+ root_size = (op_size + 1) / 2;
+
+ root_ptr = root->_mp_d;
+ op_ptr = op->_mp_d;
+
+ if (root->_mp_alloc < root_size)
+ {
+ if (root_ptr == op_ptr)
+ {
+ free_me = root_ptr;
+ free_me_size = root->_mp_alloc;
+ }
+ else
+ (*_mp_free_func) (root_ptr, root->_mp_alloc * BYTES_PER_MP_LIMB);
+
+ root->_mp_alloc = root_size;
+ root_ptr = (mp_ptr) (*_mp_allocate_func) (root_size * BYTES_PER_MP_LIMB);
+ root->_mp_d = root_ptr;
+ }
+ else
+ {
+ /* Make OP not overlap with ROOT. */
+ if (root_ptr == op_ptr)
+ {
+ /* ROOT and OP are identical. Allocate temporary space for OP. */
+ op_ptr = (mp_ptr) TMP_ALLOC (op_size * BYTES_PER_MP_LIMB);
+ /* Copy to the temporary space. Hack: Avoid temporary variable
+ by using ROOT_PTR. */
+ MPN_COPY (op_ptr, root_ptr, op_size);
+ }
+ }
+
+ rem_size = mpn_sqrtrem (root_ptr, rem->_mp_d, op_ptr, op_size);
+
+ root->_mp_size = root_size;
+
+ /* Write remainder size last, to enable us to define this function to
+     give only the square root remainder, if the user calls it with
+ ROOT == REM. */
+ rem->_mp_size = rem_size;
+
+ if (free_me != NULL)
+ (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB);
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/sub.c b/rts/gmp/mpz/sub.c
new file mode 100644
index 0000000000..f3ae7c23a0
--- /dev/null
+++ b/rts/gmp/mpz/sub.c
@@ -0,0 +1,123 @@
+/* mpz_sub -- Subtract two integers.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+#ifndef BERKELEY_MP
+void
+#if __STDC__
+mpz_sub (mpz_ptr w, mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_sub (w, u, v)
+ mpz_ptr w;
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+msub (mpz_srcptr u, mpz_srcptr v, mpz_ptr w)
+#else
+msub (u, v, w)
+ mpz_srcptr u;
+ mpz_srcptr v;
+ mpz_ptr w;
+#endif
+#endif /* BERKELEY_MP */
+{
+ mp_srcptr up, vp;
+ mp_ptr wp;
+ mp_size_t usize, vsize, wsize;
+ mp_size_t abs_usize;
+ mp_size_t abs_vsize;
+
+ usize = u->_mp_size;
+ vsize = -v->_mp_size; /* The "-" makes the difference from mpz_add */
+ abs_usize = ABS (usize);
+ abs_vsize = ABS (vsize);
+
+ if (abs_usize < abs_vsize)
+ {
+ /* Swap U and V. */
+ MPZ_SRCPTR_SWAP (u, v);
+ MP_SIZE_T_SWAP (usize, vsize);
+ MP_SIZE_T_SWAP (abs_usize, abs_vsize);
+ }
+
+ /* True: ABS_USIZE >= ABS_VSIZE. */
+
+ /* If not space for w (and possible carry), increase space. */
+ wsize = abs_usize + 1;
+ if (w->_mp_alloc < wsize)
+ _mpz_realloc (w, wsize);
+
+ /* These must be after realloc (u or v may be the same as w). */
+ up = u->_mp_d;
+ vp = v->_mp_d;
+ wp = w->_mp_d;
+
+ if ((usize ^ vsize) < 0)
+ {
+ /* U and V have different sign. Need to compare them to determine
+ which operand to subtract from which. */
+
+ /* This test is right since ABS_USIZE >= ABS_VSIZE. */
+ if (abs_usize != abs_vsize)
+ {
+ mpn_sub (wp, up, abs_usize, vp, abs_vsize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize < 0)
+ wsize = -wsize;
+ }
+ else if (mpn_cmp (up, vp, abs_usize) < 0)
+ {
+ mpn_sub_n (wp, vp, up, abs_usize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize >= 0)
+ wsize = -wsize;
+ }
+ else
+ {
+ mpn_sub_n (wp, up, vp, abs_usize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize < 0)
+ wsize = -wsize;
+ }
+ }
+ else
+ {
+ /* U and V have same sign. Add them. */
+ mp_limb_t cy_limb = mpn_add (wp, up, abs_usize, vp, abs_vsize);
+ wp[abs_usize] = cy_limb;
+ wsize = abs_usize + cy_limb;
+ if (usize < 0)
+ wsize = -wsize;
+ }
+
+ w->_mp_size = wsize;
+}
diff --git a/rts/gmp/mpz/sub_ui.c b/rts/gmp/mpz/sub_ui.c
new file mode 100644
index 0000000000..327add8503
--- /dev/null
+++ b/rts/gmp/mpz/sub_ui.c
@@ -0,0 +1,84 @@
+/* mpz_sub_ui -- Subtract an unsigned one-word integer from an MP_INT.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_sub_ui (mpz_ptr w, mpz_srcptr u, unsigned long int v)
+#else
+mpz_sub_ui (w, u, v)
+ mpz_ptr w;
+ mpz_srcptr u;
+ unsigned long int v;
+#endif
+{
+ mp_srcptr up;
+ mp_ptr wp;
+ mp_size_t usize, wsize;
+ mp_size_t abs_usize;
+
+ usize = u->_mp_size;
+ abs_usize = ABS (usize);
+
+ /* If not space for W (and possible carry), increase space. */
+ wsize = abs_usize + 1;
+ if (w->_mp_alloc < wsize)
+ _mpz_realloc (w, wsize);
+
+ /* These must be after realloc (U may be the same as W). */
+ up = u->_mp_d;
+ wp = w->_mp_d;
+
+ if (abs_usize == 0)
+ {
+ wp[0] = v;
+ w->_mp_size = -(v != 0);
+ return;
+ }
+
+ if (usize < 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_add_1 (wp, up, abs_usize, (mp_limb_t) v);
+ wp[abs_usize] = cy;
+ wsize = -(abs_usize + cy);
+ }
+ else
+ {
+ /* The signs are different. Need exact comparison to determine
+ which operand to subtract from which. */
+ if (abs_usize == 1 && up[0] < v)
+ {
+ wp[0] = v - up[0];
+ wsize = -1;
+ }
+ else
+ {
+ mpn_sub_1 (wp, up, abs_usize, (mp_limb_t) v);
+ /* Size can decrease with at most one limb. */
+ wsize = abs_usize - (wp[abs_usize - 1] == 0);
+ }
+ }
+
+ w->_mp_size = wsize;
+}
diff --git a/rts/gmp/mpz/swap.c b/rts/gmp/mpz/swap.c
new file mode 100644
index 0000000000..0070d6ff24
--- /dev/null
+++ b/rts/gmp/mpz/swap.c
@@ -0,0 +1,52 @@
+/* mpz_swap (dest_integer, src_integer) -- Swap U and V.
+
+Copyright (C) 1997, 1998 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_swap (mpz_ptr u, mpz_ptr v)
+#else
+mpz_swap (u, v)
+ mpz_ptr u;
+ mpz_ptr v;
+#endif
+{
+ mp_ptr up, vp;
+ mp_size_t usize, vsize;
+ mp_size_t ualloc, valloc;
+
+ ualloc = u->_mp_alloc;
+ valloc = v->_mp_alloc;
+ v->_mp_alloc = ualloc;
+ u->_mp_alloc = valloc;
+
+ usize = u->_mp_size;
+ vsize = v->_mp_size;
+ v->_mp_size = usize;
+ u->_mp_size = vsize;
+
+ up = u->_mp_d;
+ vp = v->_mp_d;
+ v->_mp_d = up;
+ u->_mp_d = vp;
+}
diff --git a/rts/gmp/mpz/tdiv_q.c b/rts/gmp/mpz/tdiv_q.c
new file mode 100644
index 0000000000..21db4ab385
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_q.c
@@ -0,0 +1,91 @@
+/* mpz_tdiv_q -- divide two integers and produce a quotient.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpz_tdiv_q (mpz_ptr quot, mpz_srcptr num, mpz_srcptr den)
+#else
+mpz_tdiv_q (quot, num, den)
+ mpz_ptr quot;
+ mpz_srcptr num;
+ mpz_srcptr den;
+#endif
+{
+ mp_size_t ql;
+ mp_size_t ns, ds, nl, dl;
+ mp_ptr np, dp, qp, rp;
+ TMP_DECL (marker);
+
+ ns = SIZ (num);
+ ds = SIZ (den);
+ nl = ABS (ns);
+ dl = ABS (ds);
+ ql = nl - dl + 1;
+
+ if (dl == 0)
+ DIVIDE_BY_ZERO;
+
+ if (ql <= 0)
+ {
+ SIZ (quot) = 0;
+ return;
+ }
+
+ MPZ_REALLOC (quot, ql);
+
+ TMP_MARK (marker);
+ qp = PTR (quot);
+ rp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB);
+ np = PTR (num);
+ dp = PTR (den);
+
+ /* FIXME: We should think about how to handle the temporary allocation.
+ Perhaps mpn_tdiv_qr should handle it, since it anyway often needs to
+ allocate temp space. */
+
+ /* Copy denominator to temporary space if it overlaps with the quotient. */
+ if (dp == qp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, dp, dl);
+ dp = tp;
+ }
+ /* Copy numerator to temporary space if it overlaps with the quotient. */
+ if (np == qp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (nl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, np, nl);
+ np = tp;
+ }
+
+ mpn_tdiv_qr (qp, rp, 0L, np, nl, dp, dl);
+
+ ql -= qp[ql - 1] == 0;
+
+ SIZ (quot) = (ns ^ ds) >= 0 ? ql : -ql;
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/tdiv_q_2exp.c b/rts/gmp/mpz/tdiv_q_2exp.c
new file mode 100644
index 0000000000..03d1e01f89
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_q_2exp.c
@@ -0,0 +1,68 @@
+/* mpz_tdiv_q_2exp -- Divide an integer by 2**CNT. Round the quotient
+   towards zero.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_tdiv_q_2exp (mpz_ptr w, mpz_srcptr u, unsigned long int cnt)
+#else
+mpz_tdiv_q_2exp (w, u, cnt)
+ mpz_ptr w;
+ mpz_srcptr u;
+ unsigned long int cnt;
+#endif
+{
+ mp_size_t usize, wsize;
+ mp_size_t limb_cnt;
+
+ usize = u->_mp_size;
+ limb_cnt = cnt / BITS_PER_MP_LIMB;
+ wsize = ABS (usize) - limb_cnt;
+ if (wsize <= 0)
+ w->_mp_size = 0;
+ else
+ {
+ mp_ptr wp;
+ mp_srcptr up;
+
+ if (w->_mp_alloc < wsize)
+ _mpz_realloc (w, wsize);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ cnt %= BITS_PER_MP_LIMB;
+ if (cnt != 0)
+ {
+ mpn_rshift (wp, up + limb_cnt, wsize, cnt);
+ wsize -= wp[wsize - 1] == 0;
+ }
+ else
+ {
+ MPN_COPY_INCR (wp, up + limb_cnt, wsize);
+ }
+
+ w->_mp_size = usize >= 0 ? wsize : -wsize;
+ }
+}
diff --git a/rts/gmp/mpz/tdiv_q_ui.c b/rts/gmp/mpz/tdiv_q_ui.c
new file mode 100644
index 0000000000..a2e3462b76
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_q_ui.c
@@ -0,0 +1,64 @@
+/* mpz_tdiv_q_ui(quot, dividend, divisor_limb)
+ -- Divide DIVIDEND by DIVISOR_LIMB and store the result in QUOT.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_tdiv_q_ui (mpz_ptr quot, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_tdiv_q_ui (quot, dividend, divisor)
+ mpz_ptr quot;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ /* No need for temporary allocation and copying if QUOT == DIVIDEND as
+ the divisor is just one limb, and thus no intermediate remainders
+ need to be stored. */
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb
+ = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ /* The quotient is SIZE limbs, but the most significant might be zero. */
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/tdiv_qr.c b/rts/gmp/mpz/tdiv_qr.c
new file mode 100644
index 0000000000..d66f57d9e5
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_qr.c
@@ -0,0 +1,130 @@
+/* mpz_tdiv_qr(quot,rem,dividend,divisor) -- Set QUOT to DIVIDEND/DIVISOR,
+ and REM to DIVIDEND mod DIVISOR.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+
+#ifndef BERKELEY_MP
+
+void
+#if __STDC__
+mpz_tdiv_qr (mpz_ptr quot, mpz_ptr rem, mpz_srcptr num, mpz_srcptr den)
+#else
+mpz_tdiv_qr (quot, rem, num, den)
+ mpz_ptr quot;
+ mpz_ptr rem;
+ mpz_srcptr num;
+ mpz_srcptr den;
+#endif
+
+#else /* BERKELEY_MP */
+
+void
+#if __STDC__
+mdiv (mpz_srcptr num, mpz_srcptr den, mpz_ptr quot, mpz_ptr rem)
+#else
+mdiv (num, den, quot, rem)
+ mpz_srcptr num;
+ mpz_srcptr den;
+ mpz_ptr quot;
+ mpz_ptr rem;
+#endif
+
+#endif /* BERKELEY_MP */
+{
+ mp_size_t ql;
+ mp_size_t ns, ds, nl, dl;
+ mp_ptr np, dp, qp, rp;
+ TMP_DECL (marker);
+
+ ns = SIZ (num);
+ ds = SIZ (den);
+ nl = ABS (ns);
+ dl = ABS (ds);
+ ql = nl - dl + 1;
+
+ if (dl == 0)
+ DIVIDE_BY_ZERO;
+
+ MPZ_REALLOC (rem, dl);
+
+ if (ql <= 0)
+ {
+ if (num != rem)
+ {
+ mp_ptr np, rp;
+ np = PTR (num);
+ rp = PTR (rem);
+ MPN_COPY (rp, np, nl);
+ SIZ (rem) = SIZ (num);
+ }
+ /* This needs to follow the assignment to rem, in case the
+ numerator and quotient are the same. */
+ SIZ (quot) = 0;
+ return;
+ }
+
+ MPZ_REALLOC (quot, ql);
+
+ TMP_MARK (marker);
+ qp = PTR (quot);
+ rp = PTR (rem);
+ np = PTR (num);
+ dp = PTR (den);
+
+ /* FIXME: We should think about how to handle the temporary allocation.
+ Perhaps mpn_tdiv_qr should handle it, since it anyway often needs to
+ allocate temp space. */
+
+ /* Copy denominator to temporary space if it overlaps with the quotient
+ or remainder. */
+ if (dp == rp || dp == qp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, dp, dl);
+ dp = tp;
+ }
+ /* Copy numerator to temporary space if it overlaps with the quotient or
+ remainder. */
+ if (np == rp || np == qp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (nl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, np, nl);
+ np = tp;
+ }
+
+ mpn_tdiv_qr (qp, rp, 0L, np, nl, dp, dl);
+
+ ql -= qp[ql - 1] == 0;
+ MPN_NORMALIZE (rp, dl);
+
+ SIZ (quot) = (ns ^ ds) >= 0 ? ql : -ql;
+ SIZ (rem) = ns >= 0 ? dl : -dl;
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/tdiv_qr_ui.c b/rts/gmp/mpz/tdiv_qr_ui.c
new file mode 100644
index 0000000000..10368cd340
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_qr_ui.c
@@ -0,0 +1,76 @@
+/* mpz_tdiv_qr_ui(quot,rem,dividend,short_divisor) --
+ Set QUOT to DIVIDEND / SHORT_DIVISOR
+ and REM to DIVIDEND mod SHORT_DIVISOR.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_tdiv_qr_ui (mpz_ptr quot, mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_tdiv_qr_ui (quot, rem, dividend, divisor)
+ mpz_ptr quot;
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ /* No need for temporary allocation and copying if QUOT == DIVIDEND as
+ the divisor is just one limb, and thus no intermediate remainders
+ need to be stored. */
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size,
+ (mp_limb_t) divisor);
+
+ if (remainder_limb == 0)
+ rem->_mp_size = 0;
+ else
+ {
+ /* Store the single-limb remainder. We don't check if there's space
+ for just one limb, since no function ever makes zero space. */
+ rem->_mp_size = dividend_size >= 0 ? 1 : -1;
+ rem->_mp_d[0] = remainder_limb;
+ }
+
+ /* The quotient is SIZE limbs, but the most significant might be zero. */
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/tdiv_r.c b/rts/gmp/mpz/tdiv_r.c
new file mode 100644
index 0000000000..9eb87dfabf
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_r.c
@@ -0,0 +1,98 @@
+/* mpz_tdiv_r(rem, dividend, divisor) -- Set REM to DIVIDEND mod DIVISOR.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpz_tdiv_r (mpz_ptr rem, mpz_srcptr num, mpz_srcptr den)
+#else
+mpz_tdiv_r (rem, num, den)
+ mpz_ptr rem;
+ mpz_srcptr num;
+ mpz_srcptr den;
+#endif
+{
+ mp_size_t ql;
+ mp_size_t ns, ds, nl, dl;
+ mp_ptr np, dp, qp, rp;
+ TMP_DECL (marker);
+
+ ns = SIZ (num);
+ ds = SIZ (den);
+ nl = ABS (ns);
+ dl = ABS (ds);
+ ql = nl - dl + 1;
+
+ if (dl == 0)
+ DIVIDE_BY_ZERO;
+
+ MPZ_REALLOC (rem, dl);
+
+ if (ql <= 0)
+ {
+ if (num != rem)
+ {
+ mp_ptr np, rp;
+ np = PTR (num);
+ rp = PTR (rem);
+ MPN_COPY (rp, np, nl);
+ SIZ (rem) = SIZ (num);
+ }
+ return;
+ }
+
+ TMP_MARK (marker);
+ qp = (mp_ptr) TMP_ALLOC (ql * BYTES_PER_MP_LIMB);
+ rp = PTR (rem);
+ np = PTR (num);
+ dp = PTR (den);
+
+ /* FIXME: We should think about how to handle the temporary allocation.
+ Perhaps mpn_tdiv_qr should handle it, since it anyway often needs to
+ allocate temp space. */
+
+ /* Copy denominator to temporary space if it overlaps with the remainder. */
+ if (dp == rp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, dp, dl);
+ dp = tp;
+ }
+ /* Copy numerator to temporary space if it overlaps with the remainder. */
+ if (np == rp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (nl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, np, nl);
+ np = tp;
+ }
+
+ mpn_tdiv_qr (qp, rp, 0L, np, nl, dp, dl);
+
+ MPN_NORMALIZE (rp, dl);
+
+ SIZ (rem) = ns >= 0 ? dl : -dl;
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/tdiv_r_2exp.c b/rts/gmp/mpz/tdiv_r_2exp.c
new file mode 100644
index 0000000000..91de170f5c
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_r_2exp.c
@@ -0,0 +1,79 @@
+/* mpz_tdiv_r_2exp -- Divide a integer by 2**CNT and produce a remainder.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_tdiv_r_2exp (mpz_ptr res, mpz_srcptr in, unsigned long int cnt)
+#else
+mpz_tdiv_r_2exp (res, in, cnt)
+ mpz_ptr res;
+ mpz_srcptr in;
+ unsigned long int cnt;
+#endif
+{
+ mp_size_t in_size = ABS (in->_mp_size);
+ mp_size_t res_size;
+ mp_size_t limb_cnt = cnt / BITS_PER_MP_LIMB;
+ mp_srcptr in_ptr = in->_mp_d;
+
+ if (in_size > limb_cnt)
+ {
+ /* The input operand is (probably) greater than 2**CNT. */
+ mp_limb_t x;
+
+ x = in_ptr[limb_cnt] & (((mp_limb_t) 1 << cnt % BITS_PER_MP_LIMB) - 1);
+ if (x != 0)
+ {
+ res_size = limb_cnt + 1;
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ res->_mp_d[limb_cnt] = x;
+ }
+ else
+ {
+ res_size = limb_cnt;
+ MPN_NORMALIZE (in_ptr, res_size);
+
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ limb_cnt = res_size;
+ }
+ }
+ else
+ {
+ /* The input operand is smaller than 2**CNT. We perform a no-op,
+ apart from that we might need to copy IN to RES. */
+ res_size = in_size;
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ limb_cnt = res_size;
+ }
+
+ if (res != in)
+ MPN_COPY (res->_mp_d, in->_mp_d, limb_cnt);
+ res->_mp_size = in->_mp_size >= 0 ? res_size : -res_size;
+}
diff --git a/rts/gmp/mpz/tdiv_r_ui.c b/rts/gmp/mpz/tdiv_r_ui.c
new file mode 100644
index 0000000000..2ea411fda1
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_r_ui.c
@@ -0,0 +1,63 @@
+/* mpz_tdiv_r_ui(rem, dividend, divisor_limb)
+ -- Set REM to DIVDEND mod DIVISOR_LIMB.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_tdiv_r_ui (mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_tdiv_r_ui (rem, dividend, divisor)
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ /* No need for temporary allocation and copying if QUOT == DIVIDEND as
+ the divisor is just one limb, and thus no intermediate remainders
+ need to be stored. */
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ if (remainder_limb == 0)
+ rem->_mp_size = 0;
+ else
+ {
+ /* Store the single-limb remainder. We don't check if there's space
+ for just one limb, since no function ever makes zero space. */
+ rem->_mp_size = dividend_size >= 0 ? 1 : -1;
+ rem->_mp_d[0] = remainder_limb;
+ }
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/tdiv_ui.c b/rts/gmp/mpz/tdiv_ui.c
new file mode 100644
index 0000000000..7a40a6a7f7
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_ui.c
@@ -0,0 +1,53 @@
+/* mpz_tdiv_ui(dividend, divisor_limb)
+ -- Return DIVDEND mod DIVISOR_LIMB.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_tdiv_ui (mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_tdiv_ui (dividend, divisor)
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ /* No need for temporary allocation and copying if QUOT == DIVIDEND as
+ the divisor is just one limb, and thus no intermediate remainders
+ need to be stored. */
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/tstbit.c b/rts/gmp/mpz/tstbit.c
new file mode 100644
index 0000000000..b0a8b0b31a
--- /dev/null
+++ b/rts/gmp/mpz/tstbit.c
@@ -0,0 +1,70 @@
+/* mpz_tstbit -- test a specified bit. Simulate 2's complement representation.
+
+Copyright (C) 1997 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_tstbit (mpz_srcptr d, unsigned long int bit_index)
+#else
+mpz_tstbit (d, bit_index)
+ mpz_srcptr d;
+ unsigned long int bit_index;
+#endif
+{
+ mp_size_t dsize = d->_mp_size;
+ mp_ptr dp = d->_mp_d;
+ mp_size_t limb_index;
+
+ limb_index = bit_index / BITS_PER_MP_LIMB;
+ if (dsize >= 0)
+ {
+ if (limb_index < dsize)
+ return (dp[limb_index] >> (bit_index % BITS_PER_MP_LIMB)) & 1;
+ else
+ /* Testing a bit outside of a positive number. */
+ return 0;
+ }
+ else
+ {
+ mp_size_t zero_bound;
+
+ dsize = -dsize;
+
+ /* Locate the least significant non-zero limb. */
+ for (zero_bound = 0; dp[zero_bound] == 0; zero_bound++)
+ ;
+
+ if (limb_index > zero_bound)
+ {
+ if (limb_index < dsize)
+ return (~dp[limb_index] >> (bit_index % BITS_PER_MP_LIMB)) & 1;
+ else
+ /* Testing a bit outside of a negative number. */
+ return 1;
+ }
+ else if (limb_index == zero_bound)
+ return (-dp[limb_index] >> (bit_index % BITS_PER_MP_LIMB)) & 1;
+ else
+ return 0;
+ }
+}
diff --git a/rts/gmp/mpz/ui_pow_ui.c b/rts/gmp/mpz/ui_pow_ui.c
new file mode 100644
index 0000000000..edd2dee625
--- /dev/null
+++ b/rts/gmp/mpz/ui_pow_ui.c
@@ -0,0 +1,139 @@
+/* mpz_ui_pow_ui(res, base, exp) -- Set RES to BASE**EXP.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+static void mpz_pow2 _PROTO ((mpz_ptr r, mp_limb_t blimb, unsigned long int e, mp_limb_t rl));
+
+void
+#if __STDC__
+mpz_ui_pow_ui (mpz_ptr r, unsigned long int b, unsigned long int e)
+#else
+mpz_ui_pow_ui (r, b, e)
+ mpz_ptr r;
+ unsigned long int b;
+ unsigned long int e;
+#endif
+{
+ mp_limb_t blimb = b;
+ mp_limb_t rl;
+
+ if (e == 0)
+ {
+ /* For x^0 we return 1, even if x is 0. */
+ r->_mp_d[0] = 1;
+ r->_mp_size = 1;
+ return;
+ }
+
+ /* Compute b^e as (b^n)^(e div n) * b^(e mod n), where n is chosen such that
+ the latter factor is the largest number small enough to fit in a limb. */
+
+ rl = 1;
+ while (e != 0 && blimb < ((mp_limb_t) 1 << BITS_PER_MP_LIMB/2))
+ {
+ if ((e & 1) != 0)
+ rl = rl * blimb;
+ blimb = blimb * blimb;
+ e = e >> 1;
+ }
+
+ /* rl is now b^(e mod n). (I.e., the latter factor above.) */
+
+ if (e == 0)
+ {
+ r->_mp_d[0] = rl;
+ r->_mp_size = rl != 0;
+ return;
+ }
+
+ mpz_pow2 (r, blimb, e, rl);
+}
+
+/* Multi-precision part of expontialization code. */
+static void
+#if __STDC__
+mpz_pow2 (mpz_ptr r, mp_limb_t blimb, unsigned long int e, mp_limb_t rl)
+#else
+mpz_pow2 (r, blimb, e, rl)
+ mpz_ptr r;
+ mp_limb_t blimb;
+ unsigned long int e;
+ mp_limb_t rl;
+#endif
+{
+ mp_ptr rp, tp;
+ mp_size_t ralloc, rsize;
+ int cnt, i;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* Over-estimate temporary space requirements somewhat. */
+ count_leading_zeros (cnt, blimb);
+ ralloc = e - cnt * e / BITS_PER_MP_LIMB + 1;
+
+ /* The two areas are used to alternatingly hold the input and receive the
+ product for mpn_mul. (Needed since mpn_mul_n requires that the product
+ is distinct from either input operand.) */
+ rp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB);
+ tp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB);
+
+ rp[0] = blimb;
+ rsize = 1;
+
+ count_leading_zeros (cnt, e);
+ for (i = BITS_PER_MP_LIMB - cnt - 2; i >= 0; i--)
+ {
+ mpn_mul_n (tp, rp, rp, rsize);
+ rsize = 2 * rsize;
+ rsize -= tp[rsize - 1] == 0;
+ MP_PTR_SWAP (rp, tp);
+
+ if ((e & ((mp_limb_t) 1 << i)) != 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_mul_1 (rp, rp, rsize, blimb);
+ rp[rsize] = cy;
+ rsize += cy != 0;
+ }
+ }
+
+ /* We will need rsize or rsize+1 limbs for the result. */
+ if (r->_mp_alloc <= rsize)
+ _mpz_realloc (r, rsize + 1);
+
+ /* Multiply the two factors (in rp,rsize and rl) and put the final result
+ in place. */
+ {
+ mp_limb_t cy;
+ cy = mpn_mul_1 (r->_mp_d, rp, rsize, rl);
+ (r->_mp_d)[rsize] = cy;
+ rsize += cy != 0;
+ }
+
+ r->_mp_size = rsize;
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/urandomb.c b/rts/gmp/mpz/urandomb.c
new file mode 100644
index 0000000000..caca086e05
--- /dev/null
+++ b/rts/gmp/mpz/urandomb.c
@@ -0,0 +1,49 @@
+/* mpz_urandomb (rop, state, n) -- Generate a uniform pseudorandom
+ integer in the range 0 to 2^N - 1, inclusive, using STATE as the
+ random state previously initialized by a call to gmp_randinit().
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_urandomb (mpz_t rop, gmp_randstate_t rstate, unsigned long int nbits)
+#else
+mpz_urandomb (rop, rstate, nbits)
+ mpz_t rop;
+ gmp_randstate_t rstate;
+ unsigned long int nbits;
+#endif
+{
+ mp_ptr rp;
+ mp_size_t size;
+
+ size = (nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+ if (ALLOC (rop) < size)
+ _mpz_realloc (rop, size);
+
+ rp = PTR (rop);
+
+ _gmp_rand (rp, rstate, nbits);
+ MPN_NORMALIZE (rp, size);
+ SIZ (rop) = size;
+}
diff --git a/rts/gmp/mpz/urandomm.c b/rts/gmp/mpz/urandomm.c
new file mode 100644
index 0000000000..69e1bae78a
--- /dev/null
+++ b/rts/gmp/mpz/urandomm.c
@@ -0,0 +1,78 @@
+/* mpz_urandomm (rop, state, n) -- Generate a uniform pseudorandom
+ integer in the range 0 to N-1, using STATE as the random state
+ previously initialized by a call to gmp_randinit().
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpz_urandomm (mpz_t rop, gmp_randstate_t rstate, mpz_t n)
+#else
+mpz_urandomm (rop, rstate, n)
+ mpz_t rop;
+ gmp_randstate_t rstate;
+ mpz_t n;
+#endif
+{
+ mpz_t t, p, m;
+ mp_ptr tp;
+ mp_size_t nbits, size;
+ int count;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* FIXME: Should check for n == 0 and report error */
+
+ size = SIZ (n);
+ count_leading_zeros (count, PTR (n)[size - 1]);
+ nbits = size * BITS_PER_MP_LIMB - count;
+
+ /* Allocate enough for any mpz function called since a realloc of
+ these will fail. */
+ MPZ_TMP_INIT (t, size);
+ MPZ_TMP_INIT (m, size + 1);
+ MPZ_TMP_INIT (p, size + 1);
+
+ /* Let m = highest possible random number plus 1. */
+ mpz_set_ui (m, 0);
+ mpz_setbit (m, nbits);
+
+ /* Let p = floor(m / n) * n. */
+ mpz_fdiv_q (p, m, n);
+ mpz_mul (p, p, n);
+
+ tp = PTR (t);
+ do
+ {
+ _gmp_rand (tp, rstate, nbits);
+ MPN_NORMALIZE (tp, size); /* FIXME: Really necessary? */
+ SIZ (t) = size;
+ }
+ while (mpz_cmp (t, p) >= 0);
+
+ mpz_mod (rop, t, n);
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/xor.c b/rts/gmp/mpz/xor.c
new file mode 100644
index 0000000000..69898d1791
--- /dev/null
+++ b/rts/gmp/mpz/xor.c
@@ -0,0 +1,217 @@
+/* mpz_xor -- Logical xor.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_xor (mpz_ptr res, mpz_srcptr op1, mpz_srcptr op2)
+#else
+mpz_xor (res, op1, op2)
+ mpz_ptr res;
+ mpz_srcptr op1;
+ mpz_srcptr op2;
+#endif
+{
+ mp_srcptr op1_ptr, op2_ptr;
+ mp_size_t op1_size, op2_size;
+ mp_ptr res_ptr;
+ mp_size_t res_size, res_alloc;
+ mp_size_t i;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ op1_size = op1->_mp_size;
+ op2_size = op2->_mp_size;
+
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+
+ if (op1_size >= 0)
+ {
+ if (op2_size >= 0)
+ {
+ if (op1_size >= op2_size)
+ {
+ if (res->_mp_alloc < op1_size)
+ {
+ _mpz_realloc (res, op1_size);
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+ }
+
+ if (res_ptr != op1_ptr)
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size,
+ op1_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op1_size;
+ }
+ else
+ {
+ if (res->_mp_alloc < op2_size)
+ {
+ _mpz_realloc (res, op2_size);
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+ }
+
+ if (res_ptr != op2_ptr)
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size,
+ op2_size - op1_size);
+ for (i = op1_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op2_size;
+ }
+
+ MPN_NORMALIZE (res_ptr, res_size);
+ res->_mp_size = res_size;
+ return;
+ }
+ else /* op2_size < 0 */
+ {
+ /* Fall through to the code at the end of the function. */
+ }
+ }
+ else
+ {
+ if (op2_size < 0)
+ {
+ mp_ptr opx;
+ mp_limb_t cy;
+
+ /* Both operands are negative, the result will be positive.
+ (-OP1) ^ (-OP2) =
+ = ~(OP1 - 1) ^ ~(OP2 - 1) =
+ = (OP1 - 1) ^ (OP2 - 1) */
+
+ op1_size = -op1_size;
+ op2_size = -op2_size;
+
+ /* Possible optimization: Decrease mpn_sub precision,
+ as we won't use the entire res of both. */
+ opx = (mp_ptr) TMP_ALLOC (op1_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op1_ptr, op1_size, (mp_limb_t) 1);
+ op1_ptr = opx;
+
+ opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+
+ res_alloc = MAX (op1_size, op2_size);
+ if (res->_mp_alloc < res_alloc)
+ {
+ _mpz_realloc (res, res_alloc);
+ res_ptr = res->_mp_d;
+ /* Don't re-read OP1_PTR and OP2_PTR. They point to
+ temporary space--never to the space RES->_mp_d used
+ to point to before reallocation. */
+ }
+
+ if (op1_size > op2_size)
+ {
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size,
+ op1_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op1_size;
+ }
+ else
+ {
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size,
+ op2_size - op1_size);
+ for (i = op1_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op2_size;
+ }
+
+ MPN_NORMALIZE (res_ptr, res_size);
+ res->_mp_size = res_size;
+ TMP_FREE (marker);
+ return;
+ }
+ else
+ {
+ /* We should compute -OP1 ^ OP2. Swap OP1 and OP2 and fall
+ through to the code that handles OP1 ^ -OP2. */
+ MPZ_SRCPTR_SWAP (op1, op2);
+ MPN_SRCPTR_SWAP (op1_ptr,op1_size, op2_ptr,op2_size);
+ }
+ }
+
+ {
+ mp_ptr opx;
+ mp_limb_t cy;
+ mp_size_t count;
+
+ /* Operand 2 negative, so will be the result.
+ -(OP1 ^ (-OP2)) = -(OP1 ^ ~(OP2 - 1)) =
+ = ~(OP1 ^ ~(OP2 - 1)) + 1 =
+ = (OP1 ^ (OP2 - 1)) + 1 */
+
+ op2_size = -op2_size;
+
+ opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+
+ res_alloc = MAX (op1_size, op2_size) + 1;
+ if (res->_mp_alloc < res_alloc)
+ {
+ _mpz_realloc (res, res_alloc);
+ op1_ptr = op1->_mp_d;
+ res_ptr = res->_mp_d;
+ /* Don't re-read OP2_PTR. It points to temporary space--never
+ to the space RES->_mp_d used to point to before reallocation. */
+ }
+
+ if (op1_size > op2_size)
+ {
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size, op1_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op1_size;
+ }
+ else
+ {
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size, op2_size - op1_size);
+ for (i = op1_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op2_size;
+ }
+
+ cy = mpn_add_1 (res_ptr, res_ptr, res_size, (mp_limb_t) 1);
+ if (cy)
+ {
+ res_ptr[res_size] = cy;
+ res_size++;
+ }
+
+ MPN_NORMALIZE (res_ptr, res_size);
+ res->_mp_size = -res_size;
+ TMP_FREE (marker);
+ }
+}
diff --git a/rts/gmp/rand.c b/rts/gmp/rand.c
new file mode 100644
index 0000000000..d1f9354511
--- /dev/null
+++ b/rts/gmp/rand.c
@@ -0,0 +1,171 @@
+/* gmp_randinit (state, algorithm, ...) -- Initialize a random state.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#if __STDC__
+# include <stdarg.h>
+#else
+# include <varargs.h>
+#endif
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Array of CL-schemes, ordered in increasing order of the first
+ member (the 'm2exp' value). The end of the array is indicated with
+ an entry containing all zeros. */
+
+/* All multipliers are in the range 0.01*m and 0.99*m, and are
+congruent to 5 (mod 8).
+They all pass the spectral test with Vt >= 2^(30/t) and merit >= 1.
+(Up to and including 196 bits, merit is >= 3.) */
+
+struct __gmp_rand_lc_scheme_struct
+{
+ unsigned long int m2exp; /* Modulus is 2 ^ m2exp. */
+ char *astr; /* Multiplier in string form. */
+ unsigned long int c; /* Adder. */
+};
+
+struct __gmp_rand_lc_scheme_struct __gmp_rand_lc_scheme[] =
+{
+ {32, "43840821", 1},
+ {33, "85943917", 1},
+ {34, "171799469", 1},
+ {35, "343825285", 1},
+ {36, "687285701", 1},
+ {37, "1374564613", 1},
+ {38, "2749193437", 1},
+ {39, "5497652029", 1},
+ {40, "10995212661", 1},
+ {56, "47988680294711517", 1},
+ {64, "13469374875402548381", 1},
+ {100, "203786806069096950756900463357", 1},
+ {128, "96573135900076068624591706046897650309", 1},
+ {156, "43051576988660538262511726153887323360449035333", 1},
+ {196, "1611627857640767981443524165616850972435303571524033586421", 1},
+ {200, "491824250216153841876046962368396460896019632211283945747141", 1},
+ {256, "79336254595106925775099152154558630917988041692672147726148065355845551082677", 1},
+ {0, NULL, 0} /* End of array. */
+};
+
+void
+#if __STDC__
+gmp_randinit (gmp_randstate_t rstate,
+ gmp_randalg_t alg,
+ ...)
+#else
+gmp_randinit (va_alist)
+ va_dcl
+#endif
+{
+ va_list ap;
+#if __STDC__
+#else
+ __gmp_randstate_struct *rstate;
+ gmp_randalg_t alg;
+#endif
+
+#if __STDC__
+ va_start (ap, alg);
+#else
+ va_start (ap);
+
+ rstate = va_arg (ap, __gmp_randstate_struct *);
+ alg = va_arg (ap, gmp_randalg_t);
+#endif
+
+ switch (alg)
+ {
+ case GMP_RAND_ALG_LC: /* Linear congruential. */
+ {
+ unsigned long int size;
+ struct __gmp_rand_lc_scheme_struct *sp;
+ mpz_t a;
+
+ size = va_arg (ap, unsigned long int);
+
+ /* Pick a scheme. */
+ for (sp = __gmp_rand_lc_scheme; sp->m2exp != 0; sp++)
+ if (sp->m2exp / 2 >= size)
+ break;
+
+ if (sp->m2exp == 0) /* Nothing big enough found. */
+ {
+ gmp_errno |= GMP_ERROR_INVALID_ARGUMENT;
+ return;
+ }
+
+ /* Install scheme. */
+ mpz_init_set_str (a, sp->astr, 0);
+ gmp_randinit_lc_2exp (rstate, a, sp->c, sp->m2exp);
+ mpz_clear (a);
+ break;
+ }
+
+#if 0
+ case GMP_RAND_ALG_BBS: /* Blum, Blum, and Shub. */
+ {
+ mpz_t p, q;
+ mpz_t ztmp;
+
+ /* FIXME: Generate p and q. They must be ``large'' primes,
+ congruent to 3 mod 4. Should we ensure that they meet some
+ of the criterias for being ``hard primes''?*/
+
+ /* These are around 128 bits. */
+ mpz_init_set_str (p, "148028650191182616877187862194899201391", 10);
+ mpz_init_set_str (q, "315270837425234199477225845240496832591", 10);
+
+ /* Allocate algorithm specific data. */
+ rstate->data.bbs = (__gmp_rand_data_bbs *)
+ (*_mp_allocate_func) (sizeof (__gmp_rand_data_bbs));
+
+ mpz_init (rstate->data.bbs->bi); /* The Blum integer. */
+ mpz_mul (rstate->data.bbs->bi, p, q);
+
+ /* Find a seed, x, with gcd (x, bi) == 1. */
+ mpz_init (ztmp);
+ while (1)
+ {
+ mpz_gcd (ztmp, seed, rstate->data.bbs->bi);
+ if (!mpz_cmp_ui (ztmp, 1))
+ break;
+ mpz_add_ui (seed, seed, 1);
+ }
+
+ rstate->alg = alg;
+ rstate->size = size; /* FIXME: Remove. */
+ mpz_set (rstate->seed, seed);
+
+ mpz_clear (p);
+ mpz_clear (q);
+ mpz_clear (ztmp);
+ break;
+ }
+#endif /* 0 */
+
+ default: /* Bad choice. */
+ gmp_errno |= GMP_ERROR_UNSUPPORTED_ARGUMENT;
+ }
+
+ va_end (ap);
+}
diff --git a/rts/gmp/randclr.c b/rts/gmp/randclr.c
new file mode 100644
index 0000000000..5cb0291165
--- /dev/null
+++ b/rts/gmp/randclr.c
@@ -0,0 +1,54 @@
+/* gmp_randclear (state) -- Clear and deallocate random state STATE.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+gmp_randclear (gmp_randstate_t rstate)
+#else
+gmp_randclear (rstate)
+ gmp_randstate_t rstate;
+#endif
+{
+ mpz_clear (rstate->seed);
+
+ switch (rstate->alg)
+ {
+ case GMP_RAND_ALG_LC:
+ mpz_clear (rstate->algdata.lc->a);
+ if (rstate->algdata.lc->m2exp == 0)
+ mpz_clear (rstate->algdata.lc->m);
+ (*_mp_free_func) (rstate->algdata.lc, sizeof (*rstate->algdata.lc));
+ break;
+
+#if 0
+ case GMP_RAND_ALG_BBS:
+ mpz_clear (rstate->algdata.bbs->bi);
+ (*_mp_free_func) (rstate->algdata.bbs, sizeof (*rstate->algdata.bbs));
+ break;
+#endif /* 0 */
+
+ default:
+ gmp_errno |= GMP_ERROR_UNSUPPORTED_ARGUMENT;
+ }
+}
diff --git a/rts/gmp/randlc.c b/rts/gmp/randlc.c
new file mode 100644
index 0000000000..7079db827e
--- /dev/null
+++ b/rts/gmp/randlc.c
@@ -0,0 +1,56 @@
+/* gmp_randinit_lc (state, a, c, m) -- Initialize a random state for a
+ linear congruential generator with multiplier A, adder C, and
+ modulus M.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+gmp_randinit_lc (gmp_randstate_t rstate,
+ mpz_t a,
+ unsigned long int c,
+ mpz_t m)
+#else
+gmp_randinit_lc (rstate, a, c, m)
+ gmp_randstate_t rstate;
+ mpz_t a;
+ unsigned long int c;
+ mpz_t m;
+#endif
+{
+ /* FIXME: Not finished. We don't handle this in _gmp_rand() yet. */
+ abort ();
+
+ mpz_init_set_ui (rstate->seed, 1);
+ _mpz_realloc (rstate->seed, ABSIZ (m));
+
+ /* Allocate algorithm specific data. */
+ rstate->algdata.lc = (__gmp_randata_lc *)
+ (*_mp_allocate_func) (sizeof (__gmp_randata_lc));
+
+ mpz_init_set (rstate->algdata.lc->a, a);
+ rstate->algdata.lc->c = c;
+ mpz_init_set (rstate->algdata.lc->m, m);
+
+ rstate->alg = GMP_RAND_ALG_LC;
+}
diff --git a/rts/gmp/randlc2x.c b/rts/gmp/randlc2x.c
new file mode 100644
index 0000000000..dbd5f041ee
--- /dev/null
+++ b/rts/gmp/randlc2x.c
@@ -0,0 +1,59 @@
+/* gmp_randinit_lc_2exp (state, a, c, m2exp) -- Initialize random
+ state STATE for a linear congruential generator with multiplier A,
+ adder C, and modulus 2 ^ M2EXP.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+gmp_randinit_lc_2exp (gmp_randstate_t rstate,
+ mpz_t a,
+ unsigned long int c,
+ unsigned long int m2exp)
+#else
+gmp_randinit_lc_2exp (rstate, a, c, m2exp)
+ gmp_randstate_t rstate;
+ mpz_t a;
+ unsigned long int c;
+ unsigned long int m2exp;
+#endif
+{
+ mpz_init_set_ui (rstate->seed, 1);
+ _mpz_realloc (rstate->seed, m2exp / BITS_PER_MP_LIMB
+ + (m2exp % BITS_PER_MP_LIMB != 0));
+
+ /* Allocate algorithm specific data. */
+ rstate->algdata.lc = (__gmp_randata_lc *)
+ (*_mp_allocate_func) (sizeof (__gmp_randata_lc));
+
+ mpz_init_set (rstate->algdata.lc->a, a);
+ rstate->algdata.lc->c = c;
+
+ /* Cover weird case where m2exp is 0, which means that m is used
+ instead of m2exp. */
+ if (m2exp == 0)
+ mpz_init_set_ui (rstate->algdata.lc->m, 0);
+ rstate->algdata.lc->m2exp = m2exp;
+
+ rstate->alg = GMP_RAND_ALG_LC;
+}
diff --git a/rts/gmp/randraw.c b/rts/gmp/randraw.c
new file mode 100644
index 0000000000..c0c3889d33
--- /dev/null
+++ b/rts/gmp/randraw.c
@@ -0,0 +1,360 @@
+/* _gmp_rand (rp, state, nbits) -- Generate a random bitstream of
+ length NBITS in RP. RP must have enough space allocated to hold
+ NBITS.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* For linear congruential (LC), we use one of algorithms (1) or (2).
+ (gmp-3.0 uses algorithm (1) with 'm' as a power of 2.)
+
+LC algorithm (1).
+
+ X = (aX + c) mod m
+
+[D. Knuth, "The Art of Computer Programming: Volume 2, Seminumerical Algorithms",
+Third Edition, Addison Wesley, 1998, pp. 184-185.]
+
+ X is the seed and the result
+ a is chosen so that
+ a mod 8 = 5 [3.2.1.2] and [3.2.1.3]
+ .01m < a < .99m
+ its binary or decimal digits is not a simple, regular pattern
+ it has no large quotients when Euclid's algorithm is used to find
+ gcd(a, m) [3.3.3]
+ it passes the spectral test [3.3.4]
+ it passes several tests of [3.3.2]
+ c has no factor in common with m (c=1 or c=a can be good)
+ m is large (2^30)
+ is a power of 2 [3.2.1.1]
+
+The least significant digits of the generated number are not very
+random. It should be regarded as a random fraction X/m. To get a
+random integer between 0 and n-1, multiply X/m by n and truncate.
+(Don't use X/n [ex 3.4.1-3])
+
+The ``accuracy'' in t dimensions is one part in ``the t'th root of m'' [3.3.4].
+
+Don't generate more than about m/1000 numbers without changing a, c, or m.
+
+The sequence length depends on chosen a,c,m.
+
+
+LC algorithm (2).
+
+ X = a * (X mod q) - r * (long) (X/q)
+ if X<0 then X+=m
+
+[Knuth, pp. 185-186.]
+
+ X is the seed and the result
+ as a seed is nonzero and less than m
+ a is a primitive root of m (which means that a^2 <= m)
+ q is (long) m / a
+ r is m mod a
+ m is a prime number near the largest easily computed integer
+
+which gives
+
+ X = a * (X % ((long) m / a)) -
+ (M % a) * ((long) (X / ((long) m / a)))
+
+Since m is prime, the least-significant bits of X are just as random as
+the most-significant bits. */
+
+/* Blum, Blum, and Shub.
+
+ [Bruce Schneier, "Applied Cryptography", Second Edition, John Wiley
+ & Sons, Inc., 1996, pp. 417-418.]
+
+ "Find two large prime numbers, p and q, which are congruent to 3
+ modulo 4. The product of those numbers, n, is a blum integer.
+ Choose another random integer, x, which is relatively prime to n.
+ Compute
+ x[0] = x^2 mod n
+ That's the seed for the generator."
+
+ To generate a random bit, compute
+ x[i] = x[i-1]^2 mod n
+ The least significant bit of x[i] is the one we want.
+
+ We can use more than one bit from x[i], namely the
+ log2(bitlength of x[i])
+ least significant bits of x[i].
+
+ So, for a 32-bit seed we get 5 bits per computation.
+
+ The non-predictability of this generator is based on the difficulty
+ of factoring n.
+ */
+
+/* -------------------------------------------------- */
+
+/* lc (rp, state) -- Generate next number in LC sequence. Return the
+ number of valid bits in the result. NOTE: If 'm' is a power of 2
+ (m2exp != 0), discard the lower half of the result. */
+
+static
+unsigned long int
+#if __STDC__
+lc (mp_ptr rp, gmp_randstate_t rstate)
+#else
+lc (rp, rstate)
+ mp_ptr rp;
+ gmp_randstate_t rstate;
+#endif
+{
+ mp_ptr tp, seedp, ap;
+ mp_size_t ta;
+ mp_size_t tn, seedn, an;
+ mp_size_t retval;
+ int shiftcount = 0;
+ unsigned long int m2exp;
+ mp_limb_t c;
+ TMP_DECL (mark);
+
+ m2exp = rstate->algdata.lc->m2exp;
+ c = (mp_limb_t) rstate->algdata.lc->c;
+
+ seedp = PTR (rstate->seed);
+ seedn = SIZ (rstate->seed);
+
+ if (seedn == 0)
+ {
+ /* Seed is 0. Result is C % M. */
+ *rp = c;
+
+ if (m2exp != 0)
+ {
+ /* M is a power of 2. */
+ if (m2exp < BITS_PER_MP_LIMB)
+ {
+ /* Only necessary when M may be smaller than C. */
+ *rp &= (((mp_limb_t) 1 << m2exp) - 1);
+ }
+ }
+ else
+ {
+ /* M is not a power of 2. */
+ abort (); /* FIXME. */
+ }
+
+ /* Save result as next seed. */
+ *seedp = *rp;
+ SIZ (rstate->seed) = 1;
+ return BITS_PER_MP_LIMB;
+ }
+
+ ap = PTR (rstate->algdata.lc->a);
+ an = SIZ (rstate->algdata.lc->a);
+
+ /* Allocate temporary storage. Let there be room for calculation of
+ (A * seed + C) % M, or M if bigger than that. */
+
+ ASSERT_ALWAYS (m2exp != 0); /* FIXME. */
+
+ TMP_MARK (mark);
+ ta = an + seedn + 1;
+ tp = (mp_ptr) TMP_ALLOC (ta * BYTES_PER_MP_LIMB);
+ MPN_ZERO (tp, ta);
+
+ /* t = a * seed */
+ if (seedn >= an)
+ mpn_mul_basecase (tp, seedp, seedn, ap, an);
+ else
+ mpn_mul_basecase (tp, ap, an, seedp, seedn);
+ tn = an + seedn;
+
+ /* t = t + c */
+ mpn_incr_u (tp, c);
+
+ /* t = t % m */
+ if (m2exp != 0)
+ {
+ /* M is a power of 2. The mod operation is trivial. */
+
+ tp[m2exp / BITS_PER_MP_LIMB] &= ((mp_limb_t) 1 << m2exp % BITS_PER_MP_LIMB) - 1;
+ tn = (m2exp + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+ }
+ else
+ {
+ abort (); /* FIXME. */
+ }
+
+ /* Save result as next seed. */
+ MPN_COPY (PTR (rstate->seed), tp, tn);
+ SIZ (rstate->seed) = tn;
+
+ if (m2exp != 0)
+ {
+ /* Discard the lower half of the result. */
+ unsigned long int discardb = m2exp / 2;
+ mp_size_t discardl = discardb / BITS_PER_MP_LIMB;
+
+ tn -= discardl;
+ if (tn > 0)
+ {
+ if (discardb % BITS_PER_MP_LIMB != 0)
+ {
+ mpn_rshift (tp, tp + discardl, tn, discardb % BITS_PER_MP_LIMB);
+ MPN_COPY (rp, tp, (discardb + BITS_PER_MP_LIMB -1) / BITS_PER_MP_LIMB);
+ }
+ else /* Even limb boundary. */
+ MPN_COPY_INCR (rp, tp + discardl, tn);
+ }
+ }
+ else
+ {
+ MPN_COPY (rp, tp, tn);
+ }
+
+ TMP_FREE (mark);
+
+ /* Return number of valid bits in the result. */
+ if (m2exp != 0)
+ retval = (m2exp + 1) / 2;
+ else
+ retval = SIZ (rstate->algdata.lc->m) * BITS_PER_MP_LIMB - shiftcount;
+ return retval;
+}
+
+#ifdef RAWRANDEBUG
+/* Set even bits to EVENBITS and odd bits to ! EVENBITS in RP.
+ Number of bits is m2exp in state. */
+/* FIXME: Remove. */
+unsigned long int
+lc_test (mp_ptr rp, gmp_randstate_t s, const int evenbits)
+{
+ unsigned long int rn, nbits;
+ int f;
+
+ nbits = s->algdata.lc->m2exp / 2;
+ rn = nbits / BITS_PER_MP_LIMB + (nbits % BITS_PER_MP_LIMB != 0);
+ MPN_ZERO (rp, rn);
+
+ for (f = 0; f < nbits; f++)
+ {
+ mpn_lshift (rp, rp, rn, 1);
+ if (f % 2 == ! evenbits)
+ rp[0] += 1;
+ }
+
+ return nbits;
+}
+#endif /* RAWRANDEBUG */
+
+void
+#if __STDC__
+_gmp_rand (mp_ptr rp, gmp_randstate_t rstate, unsigned long int nbits)
+#else
+_gmp_rand (rp, rstate, nbits)
+ mp_ptr rp;
+ gmp_randstate_t rstate;
+ unsigned long int nbits;
+#endif
+{
+ mp_size_t rn; /* Size of R. */
+
+ rn = (nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+
+ switch (rstate->alg)
+ {
+ case GMP_RAND_ALG_LC:
+ {
+ unsigned long int rbitpos;
+ int chunk_nbits;
+ mp_ptr tp;
+ mp_size_t tn;
+ TMP_DECL (lcmark);
+
+ TMP_MARK (lcmark);
+
+ chunk_nbits = rstate->algdata.lc->m2exp / 2;
+ tn = (chunk_nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+
+ tp = (mp_ptr) TMP_ALLOC (tn * BYTES_PER_MP_LIMB);
+
+ rbitpos = 0;
+ while (rbitpos + chunk_nbits <= nbits)
+ {
+ mp_ptr r2p = rp + rbitpos / BITS_PER_MP_LIMB;
+
+ if (rbitpos % BITS_PER_MP_LIMB != 0)
+ {
+ mp_limb_t savelimb, rcy;
+ /* Target of new chunk is not bit aligned. Use temp space
+ and align things by shifting it up. */
+ lc (tp, rstate);
+ savelimb = r2p[0];
+ rcy = mpn_lshift (r2p, tp, tn, rbitpos % BITS_PER_MP_LIMB);
+ r2p[0] |= savelimb;
+/* bogus */ if ((chunk_nbits % BITS_PER_MP_LIMB + rbitpos % BITS_PER_MP_LIMB)
+ > BITS_PER_MP_LIMB)
+ r2p[tn] = rcy;
+ }
+ else
+ {
+ /* Target of new chunk is bit aligned. Let `lc' put bits
+ directly into our target variable. */
+ lc (r2p, rstate);
+ }
+ rbitpos += chunk_nbits;
+ }
+
+ /* Handle last [0..chunk_nbits) bits. */
+ if (rbitpos != nbits)
+ {
+ mp_ptr r2p = rp + rbitpos / BITS_PER_MP_LIMB;
+ int last_nbits = nbits - rbitpos;
+ tn = (last_nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+ lc (tp, rstate);
+ if (rbitpos % BITS_PER_MP_LIMB != 0)
+ {
+ mp_limb_t savelimb, rcy;
+ /* Target of new chunk is not bit aligned. Use temp space
+ and align things by shifting it up. */
+ savelimb = r2p[0];
+ rcy = mpn_lshift (r2p, tp, tn, rbitpos % BITS_PER_MP_LIMB);
+ r2p[0] |= savelimb;
+ if (rbitpos + tn * BITS_PER_MP_LIMB - rbitpos % BITS_PER_MP_LIMB < nbits)
+ r2p[tn] = rcy;
+ }
+ else
+ {
+ MPN_COPY (r2p, tp, tn);
+ }
+ /* Mask off top bits if needed. */
+ if (nbits % BITS_PER_MP_LIMB != 0)
+ rp[nbits / BITS_PER_MP_LIMB]
+ &= ~ ((~(mp_limb_t) 0) << nbits % BITS_PER_MP_LIMB);
+ }
+
+ TMP_FREE (lcmark);
+ break;
+ }
+
+ default:
+ gmp_errno |= GMP_ERROR_UNSUPPORTED_ARGUMENT;
+ break;
+ }
+}
diff --git a/rts/gmp/randsd.c b/rts/gmp/randsd.c
new file mode 100644
index 0000000000..3bed14b578
--- /dev/null
+++ b/rts/gmp/randsd.c
@@ -0,0 +1,37 @@
+/* gmp_randseed (state, seed) -- Set initial seed SEED in random state
+ STATE.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+gmp_randseed (gmp_randstate_t rstate,
+ mpz_t seed)
+#else
+gmp_randseed (rstate, seed)
+ gmp_randstate_t rstate;
+ mpz_t seed;
+#endif
+{
+ mpz_set (rstate->seed, seed);
+}
diff --git a/rts/gmp/randsdui.c b/rts/gmp/randsdui.c
new file mode 100644
index 0000000000..92f412f3ea
--- /dev/null
+++ b/rts/gmp/randsdui.c
@@ -0,0 +1,37 @@
+/* gmp_randseed_ui (state, seed) -- Set initial seed SEED in random
+ state STATE.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+gmp_randseed_ui (gmp_randstate_t rstate,
+                 unsigned long int seed)
+#else
+gmp_randseed_ui (rstate, seed)
+     gmp_randstate_t rstate;
+     unsigned long int seed;       /* was mistyped as mpz_t */
+#endif
+{
+  mpz_set_ui (rstate->seed, seed);
+}
diff --git a/rts/gmp/stack-alloc.c b/rts/gmp/stack-alloc.c
new file mode 100644
index 0000000000..9ab98fe5f9
--- /dev/null
+++ b/rts/gmp/stack-alloc.c
@@ -0,0 +1,136 @@
+/* Stack allocation routines. This is intended for machines without support
+ for the `alloca' function.
+
+Copyright (C) 1996, 1997, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "stack-alloc.h"
+
+#define __need_size_t
+#include <stddef.h>
+#undef __need_size_t
+
+/* gmp-impl.h and stack-alloc.h conflict when not USE_STACK_ALLOC, so these
+ declarations are copied here */
+#if __STDC__
+extern void * (*__gmp_allocate_func) (size_t);
+extern void (*__gmp_free_func) (void *, size_t);
+#else
+extern void * (*__gmp_allocate_func) ();
+extern void (*__gmp_free_func) ();
+#endif
+
+typedef struct tmp_stack tmp_stack;
+
+static unsigned long max_total_allocation = 0;
+static unsigned long current_total_allocation = 0;
+
+static tmp_stack xxx = {&xxx, &xxx, 0};
+static tmp_stack *current = &xxx;
+
+/* The rounded size of the header of each allocation block. */
+#define HSIZ ((sizeof (tmp_stack) + __TMP_ALIGN - 1) & -__TMP_ALIGN)
+
+/* Allocate a block of exactly <size> bytes. This should only be called
+ through the TMP_ALLOC macro, which takes care of rounding/alignment. */
+void *
+#if __STDC__
+__gmp_tmp_alloc (unsigned long size)
+#else
+__gmp_tmp_alloc (size)
+ unsigned long size;
+#endif
+{
+ void *that;
+
+ if (size > (char *) current->end - (char *) current->alloc_point)
+ {
+ void *chunk;
+ tmp_stack *header;
+ unsigned long chunk_size;
+ unsigned long now;
+
+ /* Allocate a chunk that makes the total current allocation somewhat
+ larger than the maximum allocation ever. If size is very large, we
+ allocate that much. */
+
+ now = current_total_allocation + size;
+ if (now > max_total_allocation)
+ {
+ /* We need more temporary memory than ever before. Increase
+ for future needs. */
+ now = now * 3 / 2;
+ chunk_size = now - current_total_allocation + HSIZ;
+ current_total_allocation = now;
+ max_total_allocation = current_total_allocation;
+ }
+ else
+ {
+ chunk_size = max_total_allocation - current_total_allocation + HSIZ;
+ current_total_allocation = max_total_allocation;
+ }
+
+ chunk = (*__gmp_allocate_func) (chunk_size);
+ header = (tmp_stack *) chunk;
+ header->end = (char *) chunk + chunk_size;
+ header->alloc_point = (char *) chunk + HSIZ;
+ header->prev = current;
+ current = header;
+ }
+
+ that = current->alloc_point;
+ current->alloc_point = (char *) that + size;
+ return that;
+}
+
+/* Typically called at function entry. <mark> is assigned so that
+ __gmp_tmp_free can later be used to reclaim all subsequently allocated
+ storage. */
+void
+#if __STDC__
+__gmp_tmp_mark (tmp_marker *mark)
+#else
+__gmp_tmp_mark (mark)
+ tmp_marker *mark;
+#endif
+{
+ mark->which_chunk = current;
+ mark->alloc_point = current->alloc_point;
+}
+
+/* Free everything allocated since <mark> was assigned by __gmp_tmp_mark */
+void
+#if __STDC__
+__gmp_tmp_free (tmp_marker *mark)
+#else
+__gmp_tmp_free (mark)
+ tmp_marker *mark;
+#endif
+{
+ while (mark->which_chunk != current)
+ {
+ tmp_stack *tmp;
+
+ tmp = current;
+ current = tmp->prev;
+ current_total_allocation -= (((char *) (tmp->end) - (char *) tmp) - HSIZ);
+ (*__gmp_free_func) (tmp, (char *) tmp->end - (char *) tmp);
+ }
+ current->alloc_point = mark->alloc_point;
+}
diff --git a/rts/gmp/stack-alloc.h b/rts/gmp/stack-alloc.h
new file mode 100644
index 0000000000..f59beec266
--- /dev/null
+++ b/rts/gmp/stack-alloc.h
@@ -0,0 +1,64 @@
+/* Stack allocation routines. This is intended for machines without support
+ for the `alloca' function.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+struct tmp_stack
+{
+ void *end;
+ void *alloc_point;
+ struct tmp_stack *prev;
+};
+
+struct tmp_marker
+{
+ struct tmp_stack *which_chunk;
+ void *alloc_point;
+};
+
+typedef struct tmp_marker tmp_marker;
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if __STDC__
+void *__gmp_tmp_alloc (unsigned long);
+void __gmp_tmp_mark (tmp_marker *);
+void __gmp_tmp_free (tmp_marker *);
+#else
+void *__gmp_tmp_alloc ();
+void __gmp_tmp_mark ();
+void __gmp_tmp_free ();
+#endif
+
+#if defined (__cplusplus)
+}
+#endif
+
+#ifndef __TMP_ALIGN
+#define __TMP_ALIGN 8
+#endif
+
+#define TMP_DECL(marker) tmp_marker marker
+#define TMP_ALLOC(size) \
+ __gmp_tmp_alloc (((unsigned long) (size) + __TMP_ALIGN - 1) & -__TMP_ALIGN)
+#define TMP_MARK(marker) __gmp_tmp_mark (&marker)
+#define TMP_FREE(marker) __gmp_tmp_free (&marker)
diff --git a/rts/gmp/stamp-h.in b/rts/gmp/stamp-h.in
new file mode 100644
index 0000000000..9788f70238
--- /dev/null
+++ b/rts/gmp/stamp-h.in
@@ -0,0 +1 @@
+timestamp
diff --git a/rts/gmp/stamp-vti b/rts/gmp/stamp-vti
new file mode 100644
index 0000000000..e3186186b2
--- /dev/null
+++ b/rts/gmp/stamp-vti
@@ -0,0 +1,3 @@
+@set UPDATED 5 October 2000
+@set EDITION 3.1.1
+@set VERSION 3.1.1
diff --git a/rts/gmp/urandom.h b/rts/gmp/urandom.h
new file mode 100644
index 0000000000..313479e8b7
--- /dev/null
+++ b/rts/gmp/urandom.h
@@ -0,0 +1,86 @@
+/* urandom.h -- define urandom returning a full unsigned long random value.
+
+Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#if defined (__hpux) || defined (__svr4__) || defined (__SVR4)
+/* HPUX lacks random(). */
+static inline mp_limb_t
+urandom ()
+{
+ return mrand48 ();
+}
+#define __URANDOM
+#endif
+
+#if defined(_WIN32) && !(defined(__CYGWIN__) || defined(__CYGWIN32__))
+/* MS CRT rand() gives only 15 bits (max 0x7fff); compose three shifted
+   calls to fill 32 bits.  (Shifting an int by 32 is undefined.)  */
+static inline unsigned long
+urandom ()
+{
+  return rand () ^ (rand () << 15) ^ ((unsigned long) rand () << 30);
+}
+#define __URANDOM
+#endif
+
+#if defined (__alpha) && !defined (__URANDOM)
+/* DEC OSF/1 1.2 random() returns a double. */
+long mrand48 ();
+static inline mp_limb_t
+urandom ()
+{
+ return mrand48 () | (mrand48 () << 32);
+}
+#define __URANDOM
+#endif
+
+#if BITS_PER_MP_LIMB == 32 && !defined (__URANDOM)
+#if defined (__cplusplus)
+extern "C" {
+#endif
+long random ();
+#if defined (__cplusplus)
+}
+#endif
+static inline mp_limb_t
+urandom ()
+{
+ /* random() returns 31 bits, we want 32. */
+ return random () ^ (random () << 1);
+}
+#define __URANDOM
+#endif
+
+#if BITS_PER_MP_LIMB == 64 && !defined (__URANDOM)
+#if defined (__cplusplus)
+extern "C" {
+#endif
+long random ();
+#if defined (__cplusplus)
+}
+#endif
+static inline mp_limb_t
+urandom ()
+{
+ /* random() returns 31 bits, we want 64. */
+ return random () ^ ((mp_limb_t) random () << 31) ^ ((mp_limb_t) random () << 62);
+}
+#define __URANDOM
+#endif
+
diff --git a/rts/gmp/version.c b/rts/gmp/version.c
new file mode 100644
index 0000000000..9d544ee1d8
--- /dev/null
+++ b/rts/gmp/version.c
@@ -0,0 +1,26 @@
+/* gmp_version -- version number compiled into the library */
+
+/*
+Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+const char *gmp_version = VERSION;
diff --git a/rts/gmp/version.texi b/rts/gmp/version.texi
new file mode 100644
index 0000000000..e3186186b2
--- /dev/null
+++ b/rts/gmp/version.texi
@@ -0,0 +1,3 @@
+@set UPDATED 5 October 2000
+@set EDITION 3.1.1
+@set VERSION 3.1.1
diff --git a/rts/hooks/FlagDefaults.c b/rts/hooks/FlagDefaults.c
new file mode 100644
index 0000000000..393d39bc39
--- /dev/null
+++ b/rts/hooks/FlagDefaults.c
@@ -0,0 +1,20 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+void
+defaultsHook (void)
+{ /* this is called *after* RTSflags has had
+ its defaults set, but *before* we start
+ processing the RTS command-line options.
+
+ This default version does *nothing*.
+ The user may provide a more interesting
+ one.
+ */
+}
+
diff --git a/rts/hooks/InitEachPE.c b/rts/hooks/InitEachPE.c
new file mode 100644
index 0000000000..cc9cdc0dba
--- /dev/null
+++ b/rts/hooks/InitEachPE.c
@@ -0,0 +1,23 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+#ifdef PAR
+void
+InitEachPEHook (void)
+{ /* In a GUM setup this is called on each
+ PE immediately before SynchroniseSystem.
+ It can be used to read in static data
+ to each PE which has to be available to
+ each PE. See GPH-Maple as an example how to
+ use this in combination with foreign language
+ code:
+ http://www.risc.uni-linz.ac.at/software/ghc-maple/
+ -- HWL
+ */
+}
+#endif
diff --git a/rts/hooks/MallocFail.c b/rts/hooks/MallocFail.c
new file mode 100644
index 0000000000..1218d1d8d0
--- /dev/null
+++ b/rts/hooks/MallocFail.c
@@ -0,0 +1,16 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+#include <stdio.h>
+
+void
+MallocFailHook (lnat request_size /* in bytes */, char *msg)
+{
+ fprintf(stderr, "malloc: failed on request for %lu bytes; message: %s\n", request_size, msg);
+}
+
diff --git a/rts/hooks/OnExit.c b/rts/hooks/OnExit.c
new file mode 100644
index 0000000000..dd4c3b4bb0
--- /dev/null
+++ b/rts/hooks/OnExit.c
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+/* Note: by the time this hook has been called, Haskell land
+ * will have been shut down completely.
+ *
+ * ToDo: feed the hook info on whether we're shutting down as a result
+ * of termination or run-time error ?
+ */
+
+void
+OnExitHook ()
+{
+}
diff --git a/rts/hooks/OutOfHeap.c b/rts/hooks/OutOfHeap.c
new file mode 100644
index 0000000000..98db0d7d49
--- /dev/null
+++ b/rts/hooks/OutOfHeap.c
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include <stdio.h>
+
+void
+OutOfHeapHook (lnat request_size, lnat heap_size) /* both sizes in bytes */
+{
+ /* fprintf(stderr, "Heap exhausted;\nwhile trying to allocate %lu bytes in a %lu-byte heap;\nuse `+RTS -H<size>' to increase the total heap size.\n", */
+
+ (void)request_size; /* keep gcc -Wall happy */
+ fprintf(stderr, "Heap exhausted;\nCurrent maximum heap size is %lu bytes (%lu Mb);\nuse `+RTS -M<size>' to increase it.\n",
+ heap_size, heap_size / (1024*1024));
+}
+
diff --git a/rts/hooks/RtsOpts.c b/rts/hooks/RtsOpts.c
new file mode 100644
index 0000000000..b934b05f1b
--- /dev/null
+++ b/rts/hooks/RtsOpts.c
@@ -0,0 +1,13 @@
+/* -----------------------------------------------------------------------------
+ *
+ * Default RTS options.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+#include <stdlib.h>
+
+// Default RTS options can be given by providing an alternate
+// definition for this variable, pointing to a string of RTS options.
+char *ghc_rts_opts = NULL;
diff --git a/rts/hooks/ShutdownEachPEHook.c b/rts/hooks/ShutdownEachPEHook.c
new file mode 100644
index 0000000000..f5e3ba9344
--- /dev/null
+++ b/rts/hooks/ShutdownEachPEHook.c
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+#ifdef PAR
+void
+ShutdownEachPEHook (void)
+{ /* In a GUM setup this routine is called at the end of
+ shutdownParallelSystem on each PE. Useful for
+ cleaning up stuff, especially when interfacing
+ with foreign language code.
+ -- HWL
+ */
+}
+#endif
diff --git a/rts/hooks/StackOverflow.c b/rts/hooks/StackOverflow.c
new file mode 100644
index 0000000000..a395a3a1a5
--- /dev/null
+++ b/rts/hooks/StackOverflow.c
@@ -0,0 +1,16 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+#include <stdio.h>
+
+void
+StackOverflowHook (lnat stack_size) /* in bytes */
+{
+ fprintf(stderr, "Stack space overflow: current size %ld bytes.\nUse `+RTS -Ksize' to increase it.\n", stack_size);
+}
+
diff --git a/rts/package.conf.in b/rts/package.conf.in
new file mode 100644
index 0000000000..935b71d6a6
--- /dev/null
+++ b/rts/package.conf.in
@@ -0,0 +1,152 @@
+/* The RTS is just another package! */
+
+#include "ghcconfig.h"
+#include "RtsConfig.h"
+
+name: PACKAGE
+version: 1.0
+license: BSD3
+maintainer: glasgow-haskell-users@haskell.org
+exposed: True
+
+exposed-modules:
+hidden-modules:
+
+import-dirs:
+
+#ifdef INSTALLING
+library-dirs: LIB_DIR
+# ifdef mingw32_HOST_OS
+ , LIB_DIR"/gcc-lib"
+ /* force the dist-provided gcc-lib/ into scope. */
+# endif
+#else /* !INSTALLING */
+library-dirs: FPTOOLS_TOP_ABS"/rts"
+# if !defined(HAVE_LIBGMP) && !defined(HAVE_FRAMEWORK_GMP)
+ , FPTOOLS_TOP_ABS"/rts/gmp"
+# endif
+#endif
+
+hs-libraries: "HSrts"
+
+extra-libraries: "m" /* for ldexp() */
+#ifndef HAVE_FRAMEWORK_GMP
+ , "gmp"
+#ifdef HAVE_LIBDL
+ , "dl"
+#endif
+#endif
+#ifdef HAVE_LIBRT
+ , "rt"
+#endif
+#ifdef mingw32_HOST_OS
+ ,"wsock32" /* for the linker */
+#endif
+#ifdef WANT_DOTNET_SUPPORT
+ , "oleaut32", "ole32", "uuid"
+#endif
+#if defined(DEBUG) && defined(HAVE_LIBBFD)
+ ,"bfd", "iberty" /* for debugging */
+#endif
+#ifdef HAVE_LIBMINGWEX
+# ifndef INSTALLING /* Bundled Mingw is behind */
+ ,"mingwex"
+# endif
+#endif
+
+#ifdef INSTALLING
+include-dirs: INCLUDE_DIR
+# ifdef mingw32_HOST_OS
+ , INCLUDE_DIR"/mingw"
+# endif
+#else /* !INSTALLING */
+include-dirs: FPTOOLS_TOP_ABS"/includes"
+#endif
+
+includes: Stg.h
+depends:
+hugs-options:
+cc-options:
+
+ld-options:
+#ifdef LEADING_UNDERSCORE
+ "-u", "_GHCziBase_Izh_static_info"
+ , "-u", "_GHCziBase_Czh_static_info"
+ , "-u", "_GHCziFloat_Fzh_static_info"
+ , "-u", "_GHCziFloat_Dzh_static_info"
+ , "-u", "_GHCziPtr_Ptr_static_info"
+ , "-u", "_GHCziWord_Wzh_static_info"
+ , "-u", "_GHCziInt_I8zh_static_info"
+ , "-u", "_GHCziInt_I16zh_static_info"
+ , "-u", "_GHCziInt_I32zh_static_info"
+ , "-u", "_GHCziInt_I64zh_static_info"
+ , "-u", "_GHCziWord_W8zh_static_info"
+ , "-u", "_GHCziWord_W16zh_static_info"
+ , "-u", "_GHCziWord_W32zh_static_info"
+ , "-u", "_GHCziWord_W64zh_static_info"
+ , "-u", "_GHCziStable_StablePtr_static_info"
+ , "-u", "_GHCziBase_Izh_con_info"
+ , "-u", "_GHCziBase_Czh_con_info"
+ , "-u", "_GHCziFloat_Fzh_con_info"
+ , "-u", "_GHCziFloat_Dzh_con_info"
+ , "-u", "_GHCziPtr_Ptr_con_info"
+ , "-u", "_GHCziPtr_FunPtr_con_info"
+ , "-u", "_GHCziStable_StablePtr_con_info"
+ , "-u", "_GHCziBase_False_closure"
+ , "-u", "_GHCziBase_True_closure"
+ , "-u", "_GHCziPack_unpackCString_closure"
+ , "-u", "_GHCziIOBase_stackOverflow_closure"
+ , "-u", "_GHCziIOBase_heapOverflow_closure"
+ , "-u", "_GHCziIOBase_NonTermination_closure"
+ , "-u", "_GHCziIOBase_BlockedOnDeadMVar_closure"
+ , "-u", "_GHCziIOBase_BlockedIndefinitely_closure"
+ , "-u", "_GHCziIOBase_Deadlock_closure"
+ , "-u", "_GHCziIOBase_NestedAtomically_closure"
+ , "-u", "_GHCziWeak_runFinalizzerBatch_closure"
+#else
+ "-u", "GHCziBase_Izh_static_info"
+ , "-u", "GHCziBase_Czh_static_info"
+ , "-u", "GHCziFloat_Fzh_static_info"
+ , "-u", "GHCziFloat_Dzh_static_info"
+ , "-u", "GHCziPtr_Ptr_static_info"
+ , "-u", "GHCziWord_Wzh_static_info"
+ , "-u", "GHCziInt_I8zh_static_info"
+ , "-u", "GHCziInt_I16zh_static_info"
+ , "-u", "GHCziInt_I32zh_static_info"
+ , "-u", "GHCziInt_I64zh_static_info"
+ , "-u", "GHCziWord_W8zh_static_info"
+ , "-u", "GHCziWord_W16zh_static_info"
+ , "-u", "GHCziWord_W32zh_static_info"
+ , "-u", "GHCziWord_W64zh_static_info"
+ , "-u", "GHCziStable_StablePtr_static_info"
+ , "-u", "GHCziBase_Izh_con_info"
+ , "-u", "GHCziBase_Czh_con_info"
+ , "-u", "GHCziFloat_Fzh_con_info"
+ , "-u", "GHCziFloat_Dzh_con_info"
+ , "-u", "GHCziPtr_Ptr_con_info"
+ , "-u", "GHCziPtr_FunPtr_con_info"
+ , "-u", "GHCziStable_StablePtr_con_info"
+ , "-u", "GHCziBase_False_closure"
+ , "-u", "GHCziBase_True_closure"
+ , "-u", "GHCziPack_unpackCString_closure"
+ , "-u", "GHCziIOBase_stackOverflow_closure"
+ , "-u", "GHCziIOBase_heapOverflow_closure"
+ , "-u", "GHCziIOBase_NonTermination_closure"
+ , "-u", "GHCziIOBase_BlockedOnDeadMVar_closure"
+ , "-u", "GHCziIOBase_BlockedIndefinitely_closure"
+ , "-u", "GHCziIOBase_Deadlock_closure"
+ , "-u", "GHCziIOBase_NestedAtomically_closure"
+ , "-u", "GHCziWeak_runFinalizzerBatch_closure"
+#endif
+
+framework-dirs:
+
+#ifdef HAVE_FRAMEWORK_GMP
+frameworks: "GMP"
+#else
+frameworks:
+#endif
+
+haddock-interfaces:
+haddock-html:
+
diff --git a/rts/parallel/0Hash.c b/rts/parallel/0Hash.c
new file mode 100644
index 0000000000..a471e30a66
--- /dev/null
+++ b/rts/parallel/0Hash.c
@@ -0,0 +1,320 @@
+/*-----------------------------------------------------------------------------
+ *
+ * (c) The AQUA Project, Glasgow University, 1995-1998
+ * (c) The GHC Team, 1999
+ *
+ * Dynamically expanding linear hash tables, as described in
+ * Per-\AAke Larson, ``Dynamic Hash Tables,'' CACM 31(4), April 1988,
+ * pp. 446 -- 457.
+ * -------------------------------------------------------------------------- */
+
+/*
+ Replaced with ghc/rts/Hash.c in the new RTS
+*/
+
+#if 0
+
+#include "Rts.h"
+#include "Hash.h"
+#include "RtsUtils.h"
+
+#define HSEGSIZE 1024 /* Size of a single hash table segment */
+ /* Also the minimum size of a hash table */
+#define HDIRSIZE 1024 /* Size of the segment directory */
+ /* Maximum hash table size is HSEGSIZE * HDIRSIZE */
+#define HLOAD 5 /* Maximum average load of a single hash bucket */
+
+#define HCHUNK (1024 * sizeof(W_) / sizeof(HashList))
+ /* Number of HashList cells to allocate in one go */
+
+
+/* Linked list of (key, data) pairs for separate chaining */
+struct hashlist {
+ StgWord key;
+ void *data;
+ struct hashlist *next; /* Next cell in bucket chain (same hash value) */
+};
+
+typedef struct hashlist HashList;
+
+struct hashtable {
+ int split; /* Next bucket to split when expanding */
+ int max; /* Max bucket of smaller table */
+ int mask1; /* Mask for doing the mod of h_1 (smaller table) */
+ int mask2; /* Mask for doing the mod of h_2 (larger table) */
+ int kcount; /* Number of keys */
+ int bcount; /* Number of buckets */
+ HashList **dir[HDIRSIZE]; /* Directory of segments */
+};
+
+/* -----------------------------------------------------------------------------
+ * Hash first using the smaller table. If the bucket is less than the
+ * next bucket to be split, re-hash using the larger table.
+ * -------------------------------------------------------------------------- */
+
+static int
+hash(HashTable *table, W_ key)
+{
+ int bucket;
+
+ /* Strip the boring zero bits */
+ key /= sizeof(StgWord);
+
+ /* Mod the size of the hash table (a power of 2) */
+ bucket = key & table->mask1;
+
+ if (bucket < table->split) {
+ /* Mod the size of the expanded hash table (also a power of 2) */
+ bucket = key & table->mask2;
+ }
+ return bucket;
+}
+
+/* -----------------------------------------------------------------------------
+ * Allocate a new segment of the dynamically growing hash table.
+ * -------------------------------------------------------------------------- */
+
+static void
+allocSegment(HashTable *table, int segment)
+{
+ table->dir[segment] = stgMallocBytes(HSEGSIZE * sizeof(HashList *),
+ "allocSegment");
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Expand the larger hash table by one bucket, and split one bucket
+ * from the smaller table into two parts. Only the bucket referenced
+ * by @table->split@ is affected by the expansion.
+ * -------------------------------------------------------------------------- */
+
+static void
+expand(HashTable *table)
+{
+ int oldsegment;
+ int oldindex;
+ int newbucket;
+ int newsegment;
+ int newindex;
+ HashList *hl;
+ HashList *next;
+ HashList *old, *new;
+
+ if (table->split + table->max >= HDIRSIZE * HSEGSIZE)
+ /* Wow! That's big. Too big, so don't expand. */
+ return;
+
+ /* Calculate indices of bucket to split */
+ oldsegment = table->split / HSEGSIZE;
+ oldindex = table->split % HSEGSIZE;
+
+ newbucket = table->max + table->split;
+
+ /* And the indices of the new bucket */
+ newsegment = newbucket / HSEGSIZE;
+ newindex = newbucket % HSEGSIZE;
+
+ if (newindex == 0)
+ allocSegment(table, newsegment);
+
+ if (++table->split == table->max) {
+ table->split = 0;
+ table->max *= 2;
+ table->mask1 = table->mask2;
+ table->mask2 = table->mask2 << 1 | 1;
+ }
+ table->bcount++;
+
+ /* Split the bucket, paying no attention to the original order */
+
+ old = new = NULL;
+ for (hl = table->dir[oldsegment][oldindex]; hl != NULL; hl = next) {
+ next = hl->next;
+ if (hash(table, hl->key) == newbucket) {
+ hl->next = new;
+ new = hl;
+ } else {
+ hl->next = old;
+ old = hl;
+ }
+ }
+ table->dir[oldsegment][oldindex] = old;
+ table->dir[newsegment][newindex] = new;
+
+ return;
+}
+
+void *
+lookupHashTable(HashTable *table, StgWord key)
+{
+ int bucket;
+ int segment;
+ int index;
+ HashList *hl;
+
+ bucket = hash(table, key);
+ segment = bucket / HSEGSIZE;
+ index = bucket % HSEGSIZE;
+
+ for (hl = table->dir[segment][index]; hl != NULL; hl = hl->next)
+ if (hl->key == key)
+ return hl->data;
+
+ /* It's not there */
+ return NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * We allocate the hashlist cells in large chunks to cut down on malloc
+ * overhead. Although we keep a free list of hashlist cells, we make
+ * no effort to actually return the space to the malloc arena.
+ * -------------------------------------------------------------------------- */
+
+static HashList *freeList = NULL;
+
+static HashList *
+allocHashList(void)
+{
+ HashList *hl, *p;
+
+ if ((hl = freeList) != NULL) {
+ freeList = hl->next;
+ } else {
+ hl = stgMallocBytes(HCHUNK * sizeof(HashList), "allocHashList");
+
+ freeList = hl + 1;
+ for (p = freeList; p < hl + HCHUNK - 1; p++)
+ p->next = p + 1;
+ p->next = NULL;
+ }
+ return hl;
+}
+
+static void
+freeHashList(HashList *hl)
+{
+ hl->next = freeList;
+ freeList = hl;
+}
+
+void
+insertHashTable(HashTable *table, StgWord key, void *data)
+{
+ int bucket;
+ int segment;
+ int index;
+ HashList *hl;
+
+ /* We want no duplicates */
+ ASSERT(lookupHashTable(table, key) == NULL);
+
+ /* When the average load gets too high, we expand the table */
+ if (++table->kcount >= HLOAD * table->bcount)
+ expand(table);
+
+ bucket = hash(table, key);
+ segment = bucket / HSEGSIZE;
+ index = bucket % HSEGSIZE;
+
+ hl = allocHashList();
+
+ hl->key = key;
+ hl->data = data;
+ hl->next = table->dir[segment][index];
+ table->dir[segment][index] = hl;
+
+}
+
+void *
+removeHashTable(HashTable *table, StgWord key, void *data)
+{
+ int bucket;
+ int segment;
+ int index;
+ HashList *hl;
+ HashList *prev = NULL;
+
+ bucket = hash(table, key);
+ segment = bucket / HSEGSIZE;
+ index = bucket % HSEGSIZE;
+
+ for (hl = table->dir[segment][index]; hl != NULL; hl = hl->next) {
+ if (hl->key == key && (data == NULL || hl->data == data)) {
+ if (prev == NULL)
+ table->dir[segment][index] = hl->next;
+ else
+ prev->next = hl->next;
+ table->kcount--;
+ return hl->data;
+ }
+ prev = hl;
+ }
+
+ /* It's not there */
+ ASSERT(data == NULL);
+ return NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * When we free a hash table, we are also good enough to free the
+ * data part of each (key, data) pair, as long as our caller can tell
+ * us how to do it.
+ * -------------------------------------------------------------------------- */
+
+void
+freeHashTable(HashTable *table, void (*freeDataFun)(void *) )
+{
+ long segment;
+ long index;
+ HashList *hl;
+ HashList *next;
+
+ /* The last bucket with something in it is table->max + table->split - 1 */
+ segment = (table->max + table->split - 1) / HSEGSIZE;
+ index = (table->max + table->split - 1) % HSEGSIZE;
+
+ while (segment >= 0) {
+ while (index >= 0) {
+ for (hl = table->dir[segment][index]; hl != NULL; hl = next) {
+ next = hl->next;
+ if (freeDataFun != NULL)
+ (*freeDataFun)(hl->data);
+ freeHashList(hl);
+ }
+ index--;
+ }
+ free(table->dir[segment]);
+ segment--;
+ index = HSEGSIZE - 1;
+ }
+ free(table);
+}
+
+/* -----------------------------------------------------------------------------
+ * When we initialize a hash table, we set up the first segment as well,
+ * initializing all of the first segment's hash buckets to NULL.
+ * -------------------------------------------------------------------------- */
+
+HashTable *
+allocHashTable(void)
+{
+ HashTable *table;
+ HashList **hb;
+
+ table = stgMallocBytes(sizeof(HashTable),"allocHashTable");
+
+ allocSegment(table, 0);
+
+ for (hb = table->dir[0]; hb < table->dir[0] + HSEGSIZE; hb++)
+ *hb = NULL;
+
+ table->split = 0;
+ table->max = HSEGSIZE;
+ table->mask1 = HSEGSIZE - 1;
+ table->mask2 = 2 * HSEGSIZE - 1;
+ table->kcount = 0;
+ table->bcount = HSEGSIZE;
+
+ return table;
+}
+#endif
diff --git a/rts/parallel/0Parallel.h b/rts/parallel/0Parallel.h
new file mode 100644
index 0000000000..d52bf00fc2
--- /dev/null
+++ b/rts/parallel/0Parallel.h
@@ -0,0 +1,414 @@
+/*
+ Time-stamp: <Mon Oct 04 1999 14:50:28 Stardate: [-30]3692.88 hwloidl>
+
+ Definitions for parallel machines.
+
+This section contains definitions applicable only to programs compiled
+to run on a parallel machine, i.e. on GUM. Some of these definitions
+are also used when simulating parallel execution, i.e. on GranSim.
+ */
+
+/*
+ ToDo: Check the PAR specific part of this file
+ Move stuff into Closures.h and ClosureMacros.h
+ Clean-up GRAN specific code
+ -- HWL
+ */
+
+#ifndef PARALLEL_H
+#define PARALLEL_H
+
+#if defined(PAR) || defined(GRAN) /* whole file */
+
+#include "Rts.h"
+#include "GranSim.h"
+//#include "ClosureTypes.h"
+
+//@menu
+//* Basic definitions::
+//* Externs and types::
+//* Dummy defs::
+//* Par specific fixed headers::
+//* Parallel only heap objects::
+//* Packing definitions::
+//* End of File::
+//@end menu
+//*/
+
+//@node Basic definitions, Externs and types
+//@section Basic definitions
+
+/* SET_PAR_HDR and SET_STATIC_PAR_HDR now live in ClosureMacros.h */
+
+/* Needed for dumping routines */
+#if defined(PAR)
+# define TIME ullong
+# define CURRENT_TIME msTime()
+# define TIME_ON_PROC(p) msTime()
+# define CURRENT_PROC thisPE
+# define BINARY_STATS RtsFlags.ParFlags.granSimStats_Binary
+#elif defined(GRAN)
+# define TIME rtsTime
+# define CURRENT_TIME CurrentTime[CurrentProc]
+# define TIME_ON_PROC(p) CurrentTime[p]
+# define CURRENT_PROC CurrentProc
+# define BINARY_STATS RtsFlags.GranFlags.granSimStats_Binary
+#endif
+
+#if defined(PAR)
+# define MAX_PES 256 /* Maximum number of processors */
+ /* MAX_PES is enforced by SysMan, which does not
+ allow more than this many "processors".
+ This is important because PackGA [GlobAddr.lc]
+ **assumes** that a PE# can fit in 8+ bits.
+ */
+#endif
+
+//@node Externs and types, Dummy defs, Basic definitions
+//@section Externs and types
+
+#if defined(PAR)
+/* GUM: one spark queue on each PE, and each PE sees only its own spark queue */
+extern rtsSparkQ pending_sparks_hd;
+extern rtsSparkQ pending_sparks_tl;
+#elif defined(GRAN)
+/* GranSim: a globally visible array of spark queues */
+extern rtsSparkQ pending_sparks_hds[];
+extern rtsSparkQ pending_sparks_tls[];
+#endif
+extern unsigned int /* nat */ spark_queue_len(PEs proc);
+
+extern StgInt SparksAvail; /* How many sparks are available */
+
+/* prototypes of spark routines */
+/* ToDo: check whether all have to be visible -- HWL */
+#if defined(GRAN)
+rtsSpark *newSpark(StgClosure *node, StgInt name, StgInt gran_info, StgInt size_info, StgInt par_info, StgInt local);
+void disposeSpark(rtsSpark *spark);
+void disposeSparkQ(rtsSparkQ spark);
+void add_to_spark_queue(rtsSpark *spark);
+void delete_from_spark_queue (rtsSpark *spark);
+#endif
+
+#define STATS_FILENAME_MAXLEN 128
+
+/* Where to write the log file */
+//extern FILE *gr_file;
+extern char gr_filename[STATS_FILENAME_MAXLEN];
+
+#if defined(GRAN)
+int init_gr_simulation(char *rts_argv[], int rts_argc, char *prog_argv[], int prog_argc);
+void end_gr_simulation(void);
+#endif
+
+#if defined(PAR)
+extern I_ do_sp_profile;
+
+extern P_ PendingFetches;
+extern GLOBAL_TASK_ID *PEs;
+
+extern rtsBool IAmMainThread, GlobalStopPending;
+extern rtsBool fishing;
+extern GLOBAL_TASK_ID SysManTask;
+extern int seed; /*pseudo-random-number generator seed:*/
+ /*Initialised in ParInit*/
+extern I_ threadId; /*Number of Threads that have existed on a PE*/
+extern GLOBAL_TASK_ID mytid;
+
+extern int nPEs;
+
+extern rtsBool InGlobalGC; /* Are we in the midst of performing global GC */
+
+extern HashTable *pGAtoGALAtable;
+extern HashTable *LAtoGALAtable;
+extern GALA *freeIndirections;
+extern GALA *liveIndirections;
+extern GALA *freeGALAList;
+extern GALA *liveRemoteGAs;
+extern int thisPE;
+
+void RunParallelSystem (StgPtr program_closure);
+void initParallelSystem();
+void SynchroniseSystem();
+
+void registerTask (GLOBAL_TASK_ID gtid);
+globalAddr *LAGAlookup (P_ addr);
+P_ GALAlookup (globalAddr *ga);
+globalAddr *MakeGlobal (P_ addr, rtsBool preferred);
+globalAddr *setRemoteGA (P_ addr, globalAddr *ga, rtsBool preferred);
+void splitWeight (globalAddr *to, globalAddr *from);
+globalAddr *addWeight (globalAddr *ga);
+void initGAtables();
+W_ taskIDtoPE (GLOBAL_TASK_ID gtid);
+void RebuildLAGAtable();
+
+void *lookupHashTable (HashTable *table, StgWord key);
+void insertHashTable (HashTable *table, StgWord key, void *data);
+void freeHashTable (HashTable *table, void (*freeDataFun) ((void *data)));
+HashTable *allocHashTable();
+void *removeHashTable (HashTable *table, StgWord key, void *data);
+#endif /* PAR */
+
+/* Interface for dumping routines (i.e. writing to log file) */
+void DumpGranEvent(GranEventType name, StgTSO *tso);
+void DumpRawGranEvent(PEs proc, PEs p, GranEventType name,
+ StgTSO *tso, StgClosure *node, StgInt sparkname, StgInt len);
+//void DumpEndEvent(PEs proc, StgTSO *tso, rtsBool mandatory_thread);
+
+//@node Dummy defs, Par specific fixed headers, Externs and types
+//@section Dummy defs
+
+/*
+Get this out of the way. These are all null definitions.
+*/
+
+
+//# define GA_HDR_SIZE 0
+//# define GA(closure) /*nothing */
+
+//# define SET_GA(closure,ga) /* nothing */
+//# define SET_STATIC_GA(closure) /* nothing */
+//# define SET_GRAN_HDR(closure,pe) /* nothing */
+//# define SET_STATIC_PROCS(closure) /* nothing */
+
+//# define SET_TASK_ACTIVITY(act) /* nothing */
+
+#if defined(GRAN)
+
+# define GA_HDR_SIZE 1
+
+# define PROCS_HDR_POSN PAR_HDR_POSN
+# define PROCS_HDR_SIZE 1
+
+/* Accessing components of the field */
+# define PROCS(closure) ((closure)->header.gran.procs)
+/* SET_PROCS is now SET_GRAN_HEADER in ClosureMacros.h. */
+#endif
+
+
+//@node Par specific fixed headers, Parallel only heap objects, Dummy defs
+//@section Par specific fixed headers
+
+/*
+Definitions relating to the entire parallel-only fixed-header field.
+
+On GUM, the global addresses for each local closure are stored in a separate
+hash table, rather than with the closure in the heap. We call @getGA@ to
+look up the global address associated with a local closure (0 is returned
+for local closures that have no global address), and @setGA@ to store a new
+global address for a local closure which did not previously have one.
+*/
+
+#if defined(PAR)
+
+# define GA_HDR_SIZE 0
+
+# define GA(closure) getGA(closure)
+
+# define SET_GA(closure, ga) setGA(closure,ga)
+# define SET_STATIC_GA(closure)
+# define SET_GRAN_HDR(closure,pe)
+# define SET_STATIC_PROCS(closure)
+
+# define MAX_GA_WEIGHT 0 /* Treat as 2^n */
+
+W_ PackGA ((W_, int));
+ /* There was a PACK_GA macro here; but we turned it into the PackGA
+ routine [GlobAddr.lc] (because it needs to do quite a bit of
+ paranoia checking. Phil & Will (95/08)
+ */
+
+/* At the moment, there is no activity profiling for GUM. This may change. */
+# define SET_TASK_ACTIVITY(act) /* nothing */
+#endif
+
+//@node Parallel only heap objects, Packing definitions, Par specific fixed headers
+//@section Parallel only heap objects
+
+// NB: The following definitions are BOTH for GUM and GrAnSim -- HWL
+
+/* All in Closures.h and CLosureMacros.h */
+
+//@node Packing definitions, End of File, Parallel only heap objects
+//@section Packing definitions
+
+//@menu
+//* GUM::
+//* GranSim::
+//@end menu
+//*/
+
+//@node GUM, GranSim, Packing definitions, Packing definitions
+//@subsection GUM
+
+#if defined(PAR)
+/*
+Symbolic constants for the packing code.
+
+This constant defines how many words of data we can pack into a single
+packet in the parallel (GUM) system.
+*/
+
+//@menu
+//* Externs::
+//* Prototypes::
+//* Macros::
+//@end menu
+//*/
+
+//@node Externs, Prototypes, GUM, GUM
+//@subsubsection Externs
+
+extern W_ *PackBuffer; /* size: can be set via option */
+extern long *buffer; /* HWL_ */
+extern W_ *freeBuffer; /* HWL_ */
+extern W_ *packBuffer; /* HWL_ */
+
+extern void InitPackBuffer(STG_NO_ARGS);
+extern void InitMoreBuffers(STG_NO_ARGS);
+extern void InitPendingGABuffer(W_ size);
+extern void AllocClosureQueue(W_ size);
+
+//@node Prototypes, Macros, Externs, GUM
+//@subsubsection Prototypes
+
+void InitPackBuffer();
+P_ PackTSO (P_ tso, W_ *size);
+P_ PackStkO (P_ stko, W_ *size);
+P_ AllocateHeap (W_ size); /* Doesn't belong */
+
+void InitClosureQueue ();
+P_ DeQueueClosure();
+void QueueClosure (P_ closure);
+rtsBool QueueEmpty();
+void PrintPacket (P_ buffer);
+
+P_ get_closure_info (P_ closure, W_ *size, W_ *ptrs, W_ *nonptrs, W_ *vhs, char *type);
+
+rtsBool isOffset (globalAddr *ga),
+ isFixed (globalAddr *ga);
+
+void doGlobalGC();
+
+P_ PackNearbyGraph (P_ closure,W_ *size);
+P_ UnpackGraph (W_ *buffer, globalAddr **gamap, W_ *nGAs);
+
+
+//@node Macros, , Prototypes, GUM
+//@subsubsection Macros
+
+# define PACK_HEAP_REQUIRED \
+ ((RtsFlags.ParFlags.packBufferSize - PACK_HDR_SIZE) / (PACK_GA_SIZE + _FHS) * (SPEC_HS + 2))
+
+# define MAX_GAS (RtsFlags.ParFlags.packBufferSize / PACK_GA_SIZE)
+
+
+# define PACK_GA_SIZE 3 /* Size of a packed GA in words */
+ /* Size of a packed fetch-me in words */
+# define PACK_FETCHME_SIZE (PACK_GA_SIZE + FIXED_HS)
+
+# define PACK_HDR_SIZE 1 /* Words of header in a packet */
+
+# define PACK_PLC_SIZE 2 /* Size of a packed PLC in words */
+
+#endif /* PAR */
+
+//@node GranSim, , GUM, Packing definitions
+//@subsection GranSim
+
+#if defined(GRAN)
+/* ToDo: Check which of the PAR routines are needed in GranSim -- HWL */
+
+//@menu
+//* Types::
+//* Prototypes::
+//* Macros::
+//@end menu
+//*/
+
+//@node Types, Prototypes, GranSim, GranSim
+//@subsubsection Types
+
+typedef struct rtsPackBuffer_ {
+ StgInt /* nat */ size;
+ StgInt /* nat */ unpacked_size;
+ StgTSO *tso;
+ StgClosure **buffer;
+} rtsPackBuffer;
+
+//@node Prototypes, Macros, Types, GranSim
+//@subsubsection Prototypes
+
+
+/* main packing functions */
+/*
+rtsPackBuffer *PackNearbyGraph(StgClosure* closure, StgTSO* tso, nat *packbuffersize);
+rtsPackBuffer *PackOneNode(StgClosure* closure, StgTSO* tso, nat *packbuffersize);
+void PrintPacket(rtsPackBuffer *buffer);
+StgClosure *UnpackGraph(rtsPackBuffer* buffer);
+*/
+/* important auxiliary functions */
+
+//StgInfoTable *get_closure_info(StgClosure* node, nat *size, nat *ptrs, nat *nonptrs, nat *vhs, char *info_hdr_ty);
+int IS_BLACK_HOLE(StgClosure* node);
+StgClosure *IS_INDIRECTION(StgClosure* node);
+int IS_THUNK(StgClosure* closure);
+char *display_info_type(StgClosure* closure, char *str);
+
+/*
+OLD CODE -- HWL
+void InitPackBuffer(void);
+P_ AllocateHeap (W_ size);
+P_ PackNearbyGraph (P_ closure, P_ tso, W_ *packbuffersize);
+P_ PackOneNode (P_ closure, P_ tso, W_ *packbuffersize);
+P_ UnpackGraph (P_ buffer);
+
+void InitClosureQueue (void);
+P_ DeQueueClosure(void);
+void QueueClosure (P_ closure);
+// rtsBool QueueEmpty();
+void PrintPacket (P_ buffer);
+*/
+
+// StgInfoTable *get_closure_info(StgClosure* node, unsigned int /* nat */ *size, unsigned int /* nat */ *ptrs, unsigned int /* nat */ *nonptrs, unsigned int /* nat */ *vhs, char *info_hdr_ty);
+// int /* rtsBool */ IS_BLACK_HOLE(StgClosure* node) ;
+
+//@node Macros, , Prototypes, GranSim
+//@subsubsection Macros
+
+/* These are needed in the packing code to get the size of the packet
+ right. The closures themselves are never built in GrAnSim. */
+# define FETCHME_VHS IND_VHS
+# define FETCHME_HS IND_HS
+
+# define FETCHME_GA_LOCN FETCHME_HS
+
+# define FETCHME_CLOSURE_SIZE(closure) IND_CLOSURE_SIZE(closure)
+# define FETCHME_CLOSURE_NoPTRS(closure) 0L
+# define FETCHME_CLOSURE_NoNONPTRS(closure) (IND_CLOSURE_SIZE(closure)-IND_VHS)
+
+# define MAX_GAS (RtsFlags.GranFlags.packBufferSize / PACK_GA_SIZE)
+# define PACK_GA_SIZE 3 /* Size of a packed GA in words */
+ /* Size of a packed fetch-me in words */
+# define PACK_FETCHME_SIZE (PACK_GA_SIZE + FIXED_HS)
+# define PACK_HDR_SIZE 4 /* Words of header in a packet */
+
+# define PACK_HEAP_REQUIRED \
+ (RtsFlags.GranFlags.packBufferSize * sizeofW(StgClosure*) + \
+ 2 * sizeofW(StgInt) + sizeofW(StgTSO*))
+
+# define PACK_FLAG_LOCN 0
+# define PACK_TSO_LOCN 1
+# define PACK_UNPACKED_SIZE_LOCN 2
+# define PACK_SIZE_LOCN 3
+# define MAGIC_PACK_FLAG 0xfabc
+
+#endif /* GRAN */
+
+//@node End of File, , Packing definitions
+//@section End of File
+
+#endif /* defined(PAR) || defined(GRAN) whole file */
+#endif /* Parallel_H */
+
+
diff --git a/rts/parallel/0Unpack.c b/rts/parallel/0Unpack.c
new file mode 100644
index 0000000000..fc4a8e50c3
--- /dev/null
+++ b/rts/parallel/0Unpack.c
@@ -0,0 +1,440 @@
+/*
+ Time-stamp: <Wed Jan 12 2000 13:29:08 Stardate: [-30]4193.85 hwloidl>
+
+ Unpacking closures which have been exported to remote processors
+
+ This module defines routines for unpacking closures in the parallel
+ runtime system (GUM).
+
+ In the case of GrAnSim, this module defines routines for *simulating* the
+ unpacking of closures as it is done in the parallel runtime system.
+*/
+
+/*
+ Code in this file has been merged with Pack.c
+*/
+
+#if 0
+
+//@node Unpacking closures, , ,
+//@section Unpacking closures
+
+//@menu
+//* Includes::
+//* Prototypes::
+//* GUM code::
+//* GranSim Code::
+//* Index::
+//@end menu
+//*/
+
+//@node Includes, Prototypes, Unpacking closures, Unpacking closures
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+#include "ParallelDebug.h"
+#include "FetchMe.h"
+#include "Storage.h"
+
+//@node Prototypes, GUM code, Includes, Unpacking closures
+//@subsection Prototypes
+
+void InitPacking(void);
+# if defined(PAR)
+void InitPackBuffer(void);
+# endif
+/* Interface for ADT of closure queues */
+void AllocClosureQueue(nat size);
+void InitClosureQueue(void);
+rtsBool QueueEmpty(void);
+void QueueClosure(StgClosure *closure);
+StgClosure *DeQueueClosure(void);
+
+StgPtr AllocateHeap(nat size);
+
+//@node GUM code, GranSim Code, Prototypes, Unpacking closures
+//@subsection GUM code
+
+#if defined(PAR)
+
+//@node Local Definitions, , GUM code, GUM code
+//@subsubsection Local Definitions
+
+//@cindex PendingGABuffer
+static globalAddr *PendingGABuffer;
+/* is initialised in main; */
+
+//@cindex InitPendingGABuffer
+void
+InitPendingGABuffer(size)
+nat size;
+{
+ PendingGABuffer = (globalAddr *)
+ stgMallocBytes((size-PACK_HDR_SIZE)*2*sizeof(globalAddr),
+ "InitPendingGABuffer");
+}
+
+/*
+ @CommonUp@ commons up two closures which we have discovered to be
+ variants of the same object. One is made an indirection to the other. */
+
+//@cindex CommonUp
+void
+CommonUp(StgClosure *src, StgClosure *dst)
+{
+ StgBlockingQueueElement *bqe;
+
+ ASSERT(src != dst);
+ switch (get_itbl(src)->type) {
+ case BLACKHOLE_BQ:
+ bqe = ((StgBlockingQueue *)src)->blocking_queue;
+ break;
+
+ case FETCH_ME_BQ:
+ bqe = ((StgFetchMeBlockingQueue *)src)->blocking_queue;
+ break;
+
+ case RBH:
+ bqe = ((StgRBH *)src)->blocking_queue;
+ break;
+
+ case BLACKHOLE:
+ case FETCH_ME:
+ bqe = END_BQ_QUEUE;
+ break;
+
+ default:
+ /* Don't common up anything else */
+ return;
+ }
+ /* We do not use UPD_IND because that would awaken the bq, too */
+ // UPD_IND(src, dst);
+ updateWithIndirection(get_itbl(src), src, dst);
+ //ASSERT(!IS_BIG_MOTHER(INFO_PTR(dst)));
+ if (bqe != END_BQ_QUEUE)
+ awaken_blocked_queue(bqe, src);
+}
+
+/*
+ @UnpackGraph@ unpacks the graph contained in a message buffer. It
+ returns a pointer to the new graph. The @gamap@ parameter is set to
+ point to an array of (oldGA,newGA) pairs which were created as a result
+ of unpacking the buffer; @nGAs@ is set to the number of GA pairs which
+ were created.
+
+ The format of graph in the pack buffer is as defined in @Pack.lc@. */
+
+//@cindex UnpackGraph
+StgClosure *
+UnpackGraph(packBuffer, gamap, nGAs)
+rtsPackBuffer *packBuffer;
+globalAddr **gamap;
+nat *nGAs;
+{
+ nat size, ptrs, nonptrs, vhs;
+ StgWord **buffer, **bufptr, **slotptr;
+ globalAddr ga, *gaga;
+ StgClosure *closure, *existing,
+ *graphroot, *graph, *parent;
+ StgInfoTable *ip, *oldip;
+ nat bufsize, i,
+ pptr = 0, pptrs = 0, pvhs;
+ char str[80];
+
+ InitPackBuffer(); /* in case it isn't already init'd */
+ graphroot = (StgClosure *)NULL;
+
+ gaga = PendingGABuffer;
+
+ InitClosureQueue();
+
+ /* Unpack the header */
+ bufsize = packBuffer->size;
+ buffer = packBuffer->buffer;
+ bufptr = buffer;
+
+ /* allocate heap */
+ if (bufsize > 0) {
+ graph = allocate(bufsize);
+ ASSERT(graph != NULL);
+ }
+
+ parent = (StgClosure *)NULL;
+
+ do {
+ /* This is where we will ultimately save the closure's address */
+ slotptr = bufptr;
+
+ /* First, unpack the next GA or PLC */
+ ga.weight = (rtsWeight) *bufptr++;
+
+ if (ga.weight > 0) {
+ ga.payload.gc.gtid = (GlobalTaskId) *bufptr++;
+ ga.payload.gc.slot = (int) *bufptr++;
+ } else
+ ga.payload.plc = (StgPtr) *bufptr++;
+
+ /* Now unpack the closure body, if there is one */
+ if (isFixed(&ga)) {
+ /* No more to unpack; just set closure to local address */
+ IF_PAR_DEBUG(pack,
+ belch("Unpacked PLC at %x", ga.payload.plc));
+ closure = ga.payload.plc;
+ } else if (isOffset(&ga)) {
+ /* No more to unpack; just set closure to cached address */
+ ASSERT(parent != (StgClosure *)NULL);
+ closure = (StgClosure *) buffer[ga.payload.gc.slot];
+ } else {
+ /* Now we have to build something. */
+
+ ASSERT(bufsize > 0);
+
+ /*
+ * Close your eyes. You don't want to see where we're looking. You
+ * can't get closure info until you've unpacked the variable header,
+ * but you don't know how big it is until you've got closure info.
+ * So...we trust that the closure in the buffer is organized the
+ * same way as they will be in the heap...at least up through the
+ * end of the variable header.
+ */
+ ip = get_closure_info(bufptr, &size, &ptrs, &nonptrs, &vhs, str);
+
+ /*
+ Remember, the generic closure layout is as follows:
+ +-------------------------------------------------+
+ | FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS |
+ +-------------------------------------------------+
+ */
+ /* Fill in the fixed header */
+ for (i = 0; i < FIXED_HS; i++)
+ ((StgPtr)graph)[i] = *bufptr++;
+
+ if (ip->type == FETCH_ME)
+ size = ptrs = nonptrs = vhs = 0;
+
+ /* Fill in the packed variable header */
+ for (i = 0; i < vhs; i++)
+ ((StgPtr)graph)[FIXED_HS + i] = *bufptr++;
+
+ /* Pointers will be filled in later */
+
+ /* Fill in the packed non-pointers */
+ for (i = 0; i < nonptrs; i++)
+ ((StgPtr)graph)[FIXED_HS + i + vhs + ptrs] = *bufptr++;
+
+ /* Indirections are never packed */
+ // ASSERT(INFO_PTR(graph) != (W_) Ind_info_TO_USE);
+
+ /* Add to queue for processing */
+ QueueClosure(graph);
+
+ /*
+ * Common up the new closure with any existing closure having the same
+ * GA
+ */
+
+ if ((existing = GALAlookup(&ga)) == NULL) {
+ globalAddr *newGA;
+ /* Just keep the new object */
+ IF_PAR_DEBUG(pack,
+ belch("Unpacking new (%x, %d, %x)\n",
+ ga.payload.gc.gtid, ga.payload.gc.slot, ga.weight));
+
+ closure = graph;
+ newGA = setRemoteGA(graph, &ga, rtsTrue);
+ if (ip->type == FETCH_ME)
+ // FETCHME_GA(closure) = newGA;
+ ((StgFetchMe *)closure)->ga = newGA;
+ } else {
+ /* Two closures, one global name. Someone loses */
+ oldip = get_itbl(existing);
+
+ if ((oldip->type == FETCH_ME || IS_BLACK_HOLE(existing)) &&
+ ip->type != FETCH_ME) {
+
+ /* What we had wasn't worth keeping */
+ closure = graph;
+ CommonUp(existing, graph);
+ } else {
+
+ /*
+ * Either we already had something worthwhile by this name or
+ * the new thing is just another FetchMe. However, the thing we
+ * just unpacked has to be left as-is, or the child unpacking
+ * code will fail. Remember that the way pointer words are
+ * filled in depends on the info pointers of the parents being
+ * the same as when they were packed.
+ */
+ IF_PAR_DEBUG(pack,
+ belch("Unpacking old (%x, %d, %x), keeping %#lx",
+ ga.payload.gc.gtid, ga.payload.gc.slot, ga.weight,
+ existing));
+
+ closure = existing;
+ }
+ /* Pool the total weight in the stored ga */
+ (void) addWeight(&ga);
+ }
+
+ /* Sort out the global address mapping */
+ if ((ip_THUNK(ip) && !ip_UNPOINTED(ip)) ||
+ (ip_MUTABLE(ip) && ip->type != FETCH_ME)) {
+ /* Make up new GAs for single-copy closures */
+ globalAddr *newGA = makeGlobal(closure, rtsTrue);
+
+ ASSERT(closure == graph);
+
+ /* Create an old GA to new GA mapping */
+ *gaga++ = ga;
+ splitWeight(gaga, newGA);
+ ASSERT(gaga->weight == 1L << (BITS_IN(unsigned) - 1));
+ gaga++;
+ }
+ graph += FIXED_HS + (size < MIN_UPD_SIZE ? MIN_UPD_SIZE : size);
+ }
+
+ /*
+ * Set parent pointer to point to chosen closure. If we're at the top of
+ * the graph (our parent is NULL), then we want to arrange to return the
+ * chosen closure to our caller (possibly in place of the allocated graph
+ * root.)
+ */
+ if (parent == NULL)
+ graphroot = closure;
+ else
+ ((StgPtr)parent)[FIXED_HS + pvhs + pptr] = (StgWord) closure;
+
+ /* Save closure pointer for resolving offsets */
+ *slotptr = (StgWord) closure;
+
+ /* Locate next parent pointer */
+ pptr++;
+ while (pptr + 1 > pptrs) {
+ parent = DeQueueClosure();
+
+ if (parent == NULL)
+ break;
+ else {
+ (void) get_closure_info(parent, &size, &pptrs, &nonptrs,
+ &pvhs, str);
+ pptr = 0;
+ }
+ }
+ } while (parent != NULL);
+
+ ASSERT(bufsize == 0 || graph - 1 <= SAVE_Hp);
+
+ *gamap = PendingGABuffer;
+ *nGAs = (gaga - PendingGABuffer) / 2;
+
+ /* ToDo: are we *certain* graphroot has been set??? WDP 95/07 */
+ ASSERT(graphroot!=NULL);
+ return (graphroot);
+}
+#endif /* PAR */
+
+//@node GranSim Code, Index, GUM code, Unpacking closures
+//@subsection GranSim Code
+
+/*
+ For GrAnSim: In general no actual unpacking should be necessary. We just
+ have to walk over the graph and set the bitmasks appropriately. -- HWL */
+
+//@node Unpacking, , GranSim Code, GranSim Code
+//@subsubsection Unpacking
+
+#if defined(GRAN)
+/* CommonUp: commoning-up of closures never happens in a GranSim
+   (simulation-only) build, so reaching this code is a bug -- abort. */
+void
+CommonUp(StgClosure *src, StgClosure *dst)
+{
+ barf("CommonUp: should never be entered in a GranSim setup");
+}
+
+/*
+  This code fakes the unpacking of a somewhat virtual buffer: under
+  GranSim the pack "buffer" already holds real heap pointers, so no data
+  is copied.  We walk the buffer and update each closure's processor
+  bitmask (header.gran.procs) to record availability on CurrentProc:
+    - an RBH is moved here (procs reset) and converted back to a normal
+      closure via convertFromRBH;
+    - a black hole is marked as additionally present here (copy);
+    - any other closure not yet on CurrentProc is copied (if in HNF) or
+      moved (if not).
+  The dynamically allocated pack buffer is freed; returns the graph root.
+*/
+StgClosure*
+UnpackGraph(buffer)
+rtsPackBuffer* buffer;
+{
+  nat size, ptrs, nonptrs, vhs,
+    bufptr = 0;
+  StgClosure *closure, *graphroot;
+  StgInfoTable *ip;
+  StgWord bufsize, unpackedsize;
+  StgTSO* tso;
+  char str[240], str1[80];
+
+  bufptr = 0;
+  graphroot = buffer->buffer[0];
+
+  tso = buffer->tso;
+
+  /* Unpack the header */
+  unpackedsize = buffer->unpacked_size;
+  bufsize = buffer->size;
+
+  IF_GRAN_DEBUG(pack,
+		belch("<<< Unpacking <<%d>> (buffer @ %p):\n    (root @ %p, PE %d,size=%d), demanded by TSO %d (%p)[PE %d]",
+		      buffer->id, buffer, graphroot, where_is(graphroot),
+		      bufsize, tso->id, tso,
+		      where_is((StgClosure *)tso)));
+
+  do {
+    closure = buffer->buffer[bufptr++]; /* that's all we need for GrAnSim -- HWL */
+
+    /* Actually only ip is needed; rest is useful for TESTING -- HWL */
+    ip = get_closure_info(closure,
+			  &size, &ptrs, &nonptrs, &vhs, str);
+
+    IF_GRAN_DEBUG(pack,
+		  sprintf(str, "**  (%p): Changing bitmask[%s]: 0x%x ",
+			  closure, (closure_HNF(closure) ? "NF" : "__"),
+			  PROCS(closure)));
+
+    if (ip->type == RBH) {
+      closure->header.gran.procs = PE_NUMBER(CurrentProc); /* Move node */
+
+      IF_GRAN_DEBUG(pack,
+		    strcat(str, " (converting RBH) "));
+
+      convertFromRBH(closure); /* In GUM that's done by convertToFetchMe */
+    } else if (IS_BLACK_HOLE(closure)) {
+      closure->header.gran.procs |= PE_NUMBER(CurrentProc); /* Copy node */
+    } else if ( (closure->header.gran.procs & PE_NUMBER(CurrentProc)) == 0 ) {
+      /* FIX: '==' binds tighter than '&' in C; the original
+	 unparenthesised test evaluated procs & (PE_NUMBER(..) == 0),
+	 which is almost always 0, making this branch unreachable. */
+      if (closure_HNF(closure))
+	closure->header.gran.procs |= PE_NUMBER(CurrentProc); /* Copy node */
+      else
+	closure->header.gran.procs = PE_NUMBER(CurrentProc);  /* Move node */
+    }
+
+    IF_GRAN_DEBUG(pack,
+		  sprintf(str1, "0x%x", PROCS(closure)); strcat(str, str1));
+    IF_GRAN_DEBUG(pack, belch(str));
+
+  } while (bufptr<buffer->size) ;
+
+  /* In GrAnSim we allocate pack buffers dynamically! -- HWL */
+  free(buffer->buffer);
+  free(buffer);
+
+  IF_GRAN_DEBUG(pack,
+		belch("PrintGraph of %p is:", graphroot); PrintGraph(graphroot,0));
+
+  return (graphroot);
+}
+#endif /* GRAN */
+#endif
+
+//@node Index, , GranSim Code, Unpacking closures
+//@subsection Index
+
+//@index
+//* CommonUp:: @cindex\s-+CommonUp
+//* InitPendingGABuffer:: @cindex\s-+InitPendingGABuffer
+//* PendingGABuffer:: @cindex\s-+PendingGABuffer
+//* UnpackGraph:: @cindex\s-+UnpackGraph
+//@end index
diff --git a/rts/parallel/Dist.c b/rts/parallel/Dist.c
new file mode 100644
index 0000000000..eeec780716
--- /dev/null
+++ b/rts/parallel/Dist.c
@@ -0,0 +1,117 @@
+#include "Dist.h"
+
+#ifdef DIST /* whole file */
+
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "ParallelRts.h"
+#include "Parallel.h" // nPEs,allPEs,mytid
+#include "HLC.h" //for sendReval
+#include "LLC.h" //for pvm stuff
+#include "FetchMe.h" // for BLOCKED_FETCH_info
+#include "Storage.h" // for recordMutable
+
+/* Return the number of PEs in the parallel system (the global nPEs);
+   expected to be > 0 in a running system. */
+StgWord32 cGetPECount(void)
+{ return nPEs;
+}
+
+/* Return the task ID of PE number n; n is 1-based (1..cGetPECount())
+   and n=1 is always the main PE.
+   NOTE(review): n is not range-checked, so n==0 or n > cGetPECount()
+   indexes allPEs out of bounds -- callers must pass a valid n. */
+StgPEId cGetPEId(StgWord32 n)
+{ return allPEs[n-1];
+}
+
+/* Return this PE's own task ID (the global mytid). */
+StgPEId cGetMyPEId(void)
+{ return mytid;
+}
+
+/* Return the task ID of the PE owning an MVar/TSO closure:
+- a local TSO or MVAR is owned by this PE (mytid);
+- MVAR/TSOs get converted to REMOTE_REFs when shipped, and
+  there is no mechanism for using these REMOTE_REFs
+  apart from this code: the owner is read from the LAGA table entry.
+  Any other closure type is a caller error and aborts via barf.
+*/
+
+StgPEId cGetCertainOwner(StgClosure *mv)
+{ globalAddr *ga;
+ switch(get_itbl(mv)->type)
+ { case TSO:
+ case MVAR:
+ return mytid; // must be local
+ case REMOTE_REF:
+ ga = LAGAlookup(mv);
+ ASSERT(ga);
+ return ga->payload.gc.gtid; // I know its global address
+ }
+ barf("Dist.c:cGetCertainOwner() wrong closure type %s",info_type(mv));
+}
+
+/* Return the task ID of the (single) PE running on the host named h
+   (h is a NUL-terminated C string), or 0 if either no host of that
+   name exists or no PE runs on it.  Queries the PVM host and task
+   tables via pvm_config()/pvm_tasks(). */
+StgPEId cGetHostOwner(StgByteArray h) //okay h is a C string
+{ int nArch,nHost,nTask,i;
+ StgPEId dtid;
+ struct pvmhostinfo *host;
+ struct pvmtaskinfo *task;
+
+ dtid=0;
+ pvm_config(&nHost,&nArch,&host);
+ /* find the PVM host whose name matches h */
+ for(i=0;i<nHost;i++)
+ if(strcmp(host[i].hi_name,h)==0)
+ { dtid=host[i].hi_tid;
+ break;
+ }
+ if(dtid==0) return 0; // no host of that name
+
+ /* scan all PEs for one whose task runs on that host */
+ for(i=0;i<nPEs;i++)
+ { pvm_tasks(allPEs[i],&nTask,&task);
+ ASSERT(nTask==1); //cause we lookup a single task
+ if(task[0].ti_host==dtid)
+ return allPEs[i];
+ }
+ return 0; // known host, but no PE on it
+}
+
+/* Ship the thunk held in MVar @job@ to PE @p@ for remote evaluation:
+   pack the nearby graph (which turns the local thunk into an RBH),
+   send a Reval message, then convert the RBH into a FETCH_ME_BQ so
+   any local thread entering it blocks until the result comes back.
+   Note: no ACK is expected, hence the FETCH_ME_BQ gets no GA. */
+void cRevalIO(StgClosure *job,StgPEId p)
+{ nat size;
+ rtsPackBuffer *buffer=NULL;
+
+ ASSERT(get_itbl(job)->type==MVAR);
+ job=((StgMVar*)job)->value; // extract the job from the MVar
+
+ ASSERT(closure_THUNK(job)); // must be a closure!!!!!
+ ASSERT(p!=mytid);
+
+ /* packing converts the local thunk into an RBH (asserted below) */
+ buffer = PackNearbyGraph(job, END_TSO_QUEUE, &size,p);
+ ASSERT(buffer != (rtsPackBuffer *)NULL);
+ ASSERT(get_itbl(job)->type==RBH);
+
+ IF_PAR_DEBUG(verbose,
+ belch("@;~) %x doing revalIO to %x\n",
+ mytid,p));
+
+ sendReval(p,size,buffer);
+
+ /* global statistics gathering */
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_reval_mess++;
+ }
+
+ /*
+ We turn job into a FETCHME_BQ so that the thread will block
+ when it enters it.
+
+ Note: it will not receive an ACK, thus no GA.
+ */
+
+ ASSERT(get_itbl(job)->type==RBH);
+
+ /* put closure on mutables list, while it is still a RBH */
+ recordMutable((StgMutClosure *)job);
+
+ /* actually turn it into a FETCH_ME_BQ */
+ SET_INFO(job, &FETCH_ME_BQ_info);
+ ((StgFetchMe *)job)->ga = 0; //hope this won't make anyone barf!!!
+ ((StgBlockingQueue*)job)->blocking_queue=END_BQ_QUEUE;
+}
+
+#endif
diff --git a/rts/parallel/Dist.h b/rts/parallel/Dist.h
new file mode 100644
index 0000000000..c67cce2748
--- /dev/null
+++ b/rts/parallel/Dist.h
@@ -0,0 +1,20 @@
+#ifndef __DIST_H
+#define __DIST_H
+
+#ifdef DIST
+
+#include "Rts.h"
+
+typedef StgWord32 StgPEId;
+
+// interface functions for Haskell Language calls
+StgWord32 cGetPECount(void);
+StgPEId cGetPEId(StgWord32 n);
+StgPEId cGetMyPEId(void);
+StgPEId cGetCertainOwner(StgClosure *mv);
+void cRevalIO(StgClosure *job,StgPEId p);
+StgPEId cGetHostOwner(StgByteArray h);
+
+#endif /* DIST */
+
+#endif /* __DIST_H */
diff --git a/rts/parallel/FetchMe.h b/rts/parallel/FetchMe.h
new file mode 100644
index 0000000000..be5cbf6b54
--- /dev/null
+++ b/rts/parallel/FetchMe.h
@@ -0,0 +1,24 @@
+/* -----------------------------------------------------------------------------
+ *
+ * Closure types for the parallel system.
+ *
+ * ---------------------------------------------------------------------------*/
+
+EI_(stg_FETCH_ME_info);
+EF_(stg_FETCH_ME_entry);
+
+EI_(stg_FETCH_ME_BQ_info);
+EF_(stg_FETCH_ME_BQ_entry);
+
+EI_(stg_BLOCKED_FETCH_info);
+EF_(stg_BLOCKED_FETCH_entry);
+
+EI_(stg_REMOTE_REF_info);
+EF_(stg_REMOTE_REF_entry);
+
+EI_(stg_RBH_Save_0_info);
+EF_(stg_RBH_Save_0_entry);
+EI_(stg_RBH_Save_1_info);
+EF_(stg_RBH_Save_1_entry);
+EI_(stg_RBH_Save_2_info);
+EF_(stg_RBH_Save_2_entry);
diff --git a/rts/parallel/FetchMe.hc b/rts/parallel/FetchMe.hc
new file mode 100644
index 0000000000..f142e9e514
--- /dev/null
+++ b/rts/parallel/FetchMe.hc
@@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------------
+ Time-stamp: <Tue Mar 06 2001 17:01:46 Stardate: [-30]6288.54 hwloidl>
+
+ Entry code for a FETCH_ME closure
+
+ This module defines routines for handling remote pointers (@FetchMe@s)
+ in GUM. It is threaded (@.hc@) because @FetchMe_entry@ will be
+ called during evaluation.
+
+ * --------------------------------------------------------------------------*/
+
+#ifdef PAR /* all of it */
+
+//@menu
+//* Includes::
+//* Info tables::
+//* Index::
+//@end menu
+
+//@node Includes, Info tables
+//@subsection Includes
+
+#include "Stg.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Storage.h"
+#include "GranSim.h"
+#include "GranSimRts.h"
+#include "Parallel.h"
+#include "ParallelRts.h"
+#include "FetchMe.h"
+#include "HLC.h"
+#include "StgRun.h" /* for StgReturn and register saving */
+
+/* --------------------------------------------------------------------------
+ FETCH_ME closures.
+
+ A FETCH_ME closure represents data that currently resides on
+ another PE. We issue a fetch message, and wait for the data to be
+ retrieved.
+
+ A word on the ptr/nonptr fields in the macros: they are unused at the
+ moment; all closures defined here have constant size (ie. no payload
+ that varies from closure to closure). Therefore, all routines that
+ need to know the size of these closures have to do a sizeofW(StgFetchMe)
+ etc to get the closure size. See get_closure_info(), evacuate() and
+ checkClosure() (using the same fcts for determining the size of the
+ closures would be a good idea; at least it would be a nice step towards
+ making this code bug free).
+ ------------------------------------------------------------------------ */
+
+//@node Info tables, Index, Includes
+//@subsection Info tables
+
+/* Entry code for a FETCH_ME: on first entry the closure is converted
+   into a FETCH_ME_BQ, the current TSO is enqueued on it, and the thread
+   blocks.  The actual fetch message is sent later by blockThread (called
+   from the scheduler), using the GA stashed in theGlobalFromGA. */
+//@cindex FETCH_ME_info
+INFO_TABLE(stg_FETCH_ME_info, stg_FETCH_ME_entry, 0,2, FETCH_ME,, EF_,"FETCH_ME","FETCH_ME");
+//@cindex FETCH_ME_entry
+STGFUN(stg_FETCH_ME_entry)
+{
+ FB_
+ TICK_ENT_BH();
+
+ /* a FETCH_ME must refer to data on a *remote* PE */
+ ASSERT(((StgFetchMe *)R1.p)->ga->payload.gc.gtid != mytid);
+
+ /* Turn the FETCH_ME into a FETCH_ME_BQ, and place the current thread
+ * on the blocking queue.
+ */
+ // ((StgFetchMeBlockingQueue *)R1.cl)->header.info = &FETCH_ME_BQ_info; // does the same as SET_INFO
+ SET_INFO((StgClosure *)R1.cl, &stg_FETCH_ME_BQ_info);
+
+ /* Remember GA as a global var (used in blockThread); NB: not thread safe! */
+ ASSERT(theGlobalFromGA.payload.gc.gtid == (GlobalTaskId)0);
+ theGlobalFromGA = *((StgFetchMe *)R1.p)->ga;
+
+ /* Put ourselves on the blocking queue for this black hole */
+ ASSERT(looks_like_ga(((StgFetchMe *)R1.p)->ga));
+ CurrentTSO->link = END_BQ_QUEUE;
+ ((StgFetchMeBlockingQueue *)R1.cl)->blocking_queue = (StgBlockingQueueElement *)CurrentTSO;
+
+ /* jot down why and on what closure we are blocked */
+ CurrentTSO->why_blocked = BlockedOnGA;
+ CurrentTSO->block_info.closure = R1.cl;
+ /* closure is mutable since something has just been added to its BQ */
+ //recordMutable((StgMutClosure *)R1.cl);
+
+ /* sendFetch etc is now done in blockThread, which is called from the
+ scheduler -- HWL */
+
+ BLOCK_NP(1);
+ FE_
+}
+
+/* ---------------------------------------------------------------------------
+ FETCH_ME_BQ
+
+ On the first entry of a FETCH_ME closure, we turn the closure into
+ a FETCH_ME_BQ, which behaves just like a BLACKHOLE_BQ. Any thread
+ entering the FETCH_ME_BQ will be placed in the blocking queue.
+ When the data arrives from the remote PE, all waiting threads are
+ woken up and the FETCH_ME_BQ is overwritten with the fetched data.
+
+ FETCH_ME_BQ_entry is almost identical to BLACKHOLE_BQ_entry -- HWL
+ ------------------------------------------------------------------------ */
+
+/* Entry code for a FETCH_ME_BQ: the fetch has already been issued, so a
+   thread entering just prepends itself to the blocking queue and blocks
+   (why_blocked = BlockedOnGA_NoSend: no further fetch message needed). */
+INFO_TABLE(stg_FETCH_ME_BQ_info, stg_FETCH_ME_BQ_entry,0,2,FETCH_ME_BQ,,EF_,"FETCH_ME_BQ","FETCH_ME_BQ");
+//@cindex FETCH_ME_BQ_info
+STGFUN(stg_FETCH_ME_BQ_entry)
+{
+ FB_
+ TICK_ENT_BH();
+
+ /* Put ourselves on the blocking queue for this node */
+ CurrentTSO->link = (StgTSO*)((StgBlockingQueue *)R1.p)->blocking_queue;
+ ((StgBlockingQueue *)R1.p)->blocking_queue = (StgBlockingQueueElement *)CurrentTSO;
+
+ /* jot down why and on what closure we are blocked */
+ CurrentTSO->why_blocked = BlockedOnGA_NoSend;
+ CurrentTSO->block_info.closure = R1.cl;
+
+ /* stg_gen_block is too heavyweight, use a specialised one */
+ BLOCK_NP(1);
+ FE_
+}
+
+/* ---------------------------------------------------------------------------
+ BLOCKED_FETCH_BQ
+
+ A BLOCKED_FETCH closure only ever exists in the blocking queue of a
+ globally visible closure i.e. one with a GA. A BLOCKED_FETCH closure
+ indicates that a TSO on another PE is waiting for the result of this
+ computation. Thus, when updating the closure, the result has to be sent
+ to that PE. The relevant routines handling that are awakenBlockedQueue
+ and blockFetch (for putting BLOCKED_FETCH closure into a BQ).
+ ------------------------------------------------------------------------ */
+
+/* BLOCKED_FETCH closures live only inside blocking queues and must never
+   be entered; entering one indicates heap corruption, so abort. */
+//@cindex BLOCKED_FETCH_info
+INFO_TABLE(stg_BLOCKED_FETCH_info, stg_BLOCKED_FETCH_entry,0,2,BLOCKED_FETCH,,EF_,"BLOCKED_FETCH","BLOCKED_FETCH");
+//@cindex BLOCKED_FETCH_entry
+STGFUN(stg_BLOCKED_FETCH_entry)
+{
+ FB_
+ /* see NON_ENTERABLE_ENTRY_CODE in StgMiscClosures.hc */
+ STGCALL2(fprintf,stderr,"BLOCKED_FETCH object entered!\n");
+ STGCALL1(shutdownHaskellAndExit, EXIT_FAILURE);
+ FE_
+}
+
+
+/* ---------------------------------------------------------------------------
+ REMOTE_REF
+
+ A REMOTE_REF closure is generated whenever we wish to refer to a sticky
+ object on another PE.
+ ------------------------------------------------------------------------ */
+
+/* REMOTE_REF names a sticky object on another PE; it is a reference,
+   not an evaluatable closure, so entering one is a fatal error. */
+//@cindex REMOTE_REF_info
+INFO_TABLE(stg_REMOTE_REF_info, stg_REMOTE_REF_entry,0,2,REMOTE_REF,,EF_,"REMOTE_REF","REMOTE_REF");
+//@cindex REMOTE_REF_entry
+STGFUN(stg_REMOTE_REF_entry)
+{
+ FB_
+ /* see NON_ENTERABLE_ENTRY_CODE in StgMiscClosures.hc */
+ STGCALL2(fprintf,stderr,"REMOTE REF object entered!\n");
+ STGCALL1(shutdownHaskellAndExit, EXIT_FAILURE);
+ FE_
+}
+
+#endif /* PAR */
+
+//@node Index, , Info tables
+//@subsection Index
+
+//@index
+//* BLOCKED_FETCH_entry:: @cindex\s-+BLOCKED_FETCH_entry
+//* BLOCKED_FETCH_info:: @cindex\s-+BLOCKED_FETCH_info
+//* FETCH_ME_BQ_info:: @cindex\s-+FETCH_ME_BQ_info
+//* FETCH_ME_entry:: @cindex\s-+FETCH_ME_entry
+//* FETCH_ME_info:: @cindex\s-+FETCH_ME_info
+//@end index
diff --git a/rts/parallel/Global.c b/rts/parallel/Global.c
new file mode 100644
index 0000000000..b2541357e1
--- /dev/null
+++ b/rts/parallel/Global.c
@@ -0,0 +1,1090 @@
+/* ---------------------------------------------------------------------------
+ Time-stamp: <Wed Mar 21 2001 16:32:23 Stardate: [-30]6363.44 hwloidl>
+
+ (c) The AQUA/Parade Projects, Glasgow University, 1995
+ The GdH/APART 624 Projects, Heriot-Watt University, Edinburgh, 1999
+
+ Global Address Manipulation.
+
+ The GALA and LAGA tables for mapping global addresses to local addresses
+ (i.e. heap pointers) are defined here. We use the generic hash tables
+ defined in Hash.c.
+ ------------------------------------------------------------------------- */
+
+#ifdef PAR /* whole file */
+
+//@menu
+//* Includes::
+//* Global tables and lists::
+//* Fcts on GALA tables::
+//* Interface to taskId-PE table::
+//* Interface to LAGA table::
+//* Interface to GALA table::
+//* GC functions for GALA tables::
+//* Index::
+//@end menu
+//*/
+
+//@node Includes, Global tables and lists, Global Address Manipulation, Global Address Manipulation
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Storage.h"
+#include "Hash.h"
+#include "HLC.h"
+#include "ParallelRts.h"
+#if defined(DEBUG)
+# include "Sanity.h"
+#include "ParallelDebug.h"
+#endif
+#if defined(DIST)
+# include "Dist.h"
+#endif
+
+/*
+ @globalAddr@ structures are allocated in chunks to reduce malloc overhead.
+*/
+
+//@node Global tables and lists, Fcts on GALA tables, Includes, Global Address Manipulation
+//@subsection Global tables and lists
+
+//@cindex thisPE
+nat thisPE;
+
+//@menu
+//* Free lists::
+//* Hash tables::
+//@end menu
+
+//@node Free lists, Hash tables, Global tables and lists, Global tables and lists
+//@subsubsection Free lists
+
+/* Free list of GALA entries */
+GALA *freeGALAList = NULL;
+
+/* Number of globalAddr cells to allocate in one go */
+#define GCHUNK (1024 * sizeof(StgWord) / sizeof(GALA))
+
+/* Free list of indirections */
+
+//@cindex nextIndirection
+static StgInt nextIndirection = 0;
+//@cindex freeIndirections
+GALA *freeIndirections = NULL;
+
+/* The list of live indirections has to be marked for GC (see makeGlobal) */
+//@cindex liveIndirections
+GALA *liveIndirections = NULL;
+
+/* The list of remote indirections has to be marked for GC (see setRemoteGA) */
+//@cindex liveRemoteGAs
+GALA *liveRemoteGAs = NULL;
+
+//@node Hash tables, , Free lists, Global tables and lists
+//@subsubsection Hash tables
+
+/* Mapping global task ids PEs */
+//@cindex taskIDtoPEtable
+HashTable *taskIDtoPEtable = NULL;
+
+static int nextPE = 0;
+
+/* LAGA table: StgClosure* -> globalAddr*
+ (Remember: globalAddr = (GlobalTaskId, Slot, Weight))
+ Mapping local to global addresses (see interface below)
+*/
+
+//@cindex LAtoGALAtable
+HashTable *LAtoGALAtable = NULL;
+
+/* GALA table: globalAddr* -> StgClosure*
+ (Remember: globalAddr = (GlobalTaskId, Slot, Weight))
+ Mapping global to local addresses (see interface below)
+*/
+
+//@cindex pGAtoGALAtable
+HashTable *pGAtoGALAtable = NULL;
+
+//@node Fcts on GALA tables, Interface to taskId-PE table, Global tables and lists, Global Address Manipulation
+//@subsection Fcts on GALA tables
+
+//@cindex allocGALA
+/* Return a fresh GALA entry: pop one off freeGALAList if available,
+   otherwise malloc a chunk of GCHUNK entries, thread entries 2..GCHUNK
+   onto the free list and hand out the first.  Under sanity checking,
+   free entries are poisoned with 0xdead0add/0xdead00aa and verified
+   on reuse to catch use of stale entries. */
+static GALA *
+allocGALA(void)
+{
+ GALA *gl, *p;
+
+ if ((gl = freeGALAList) != NULL) {
+ IF_DEBUG(sanity,
+ ASSERT(gl->ga.weight==0xdead0add);
+ ASSERT(gl->la==(StgPtr)0xdead00aa));
+ freeGALAList = gl->next;
+ } else {
+ gl = (GALA *) stgMallocBytes(GCHUNK * sizeof(GALA), "allocGALA");
+
+ /* chain the remaining GCHUNK-1 entries onto the free list */
+ freeGALAList = gl + 1;
+ for (p = freeGALAList; p < gl + GCHUNK - 1; p++) {
+ p->next = p + 1;
+ IF_DEBUG(sanity,
+ p->ga.weight=0xdead0add;
+ p->la=(StgPtr)0xdead00aa);
+ }
+ /* last elem in the new block has NULL pointer in link field */
+ p->next = NULL;
+ IF_DEBUG(sanity,
+ p->ga.weight=0xdead0add;
+ p->la=(StgPtr)0xdead00aa);
+ }
+ IF_DEBUG(sanity,
+ gl->ga.weight=0xdead0add;
+ gl->la=(StgPtr)0xdead00aa);
+ return gl;
+}
+
+//@node Interface to taskId-PE table, Interface to LAGA table, Fcts on GALA tables, Global Address Manipulation
+//@subsection Interface to taskId-PE table
+
+/*
+ We don't really like GLOBAL_TASK_ID, so we keep a table of TASK_ID to
+ PE mappings. The idea is that a PE identifier will fit in 16 bits, whereas
+ a TASK_ID may not.
+*/
+
+//@cindex taskIDtoPE
+/* Map a global task ID to its compact PE number via taskIDtoPEtable.
+   NOTE(review): an unregistered gtid yields whatever lookupHashTable
+   returns for a miss (presumably NULL, i.e. PE 0) -- confirm callers
+   only pass registered task IDs. */
+PEs
+taskIDtoPE(GlobalTaskId gtid)
+{
+ return ((PEs) lookupHashTable(taskIDtoPEtable, gtid));
+}
+
+//@cindex registerTask
+/* Assign the next PE number (counting from 1) to task @gtid@ and record
+   the mapping in taskIDtoPEtable; remembers our own PE number in thisPE. */
+void
+registerTask(GlobalTaskId gtid) {
+ nextPE++; //start counting from 1
+ if (gtid == mytid)
+ thisPE = nextPE;
+
+ insertHashTable(taskIDtoPEtable, gtid, (void *) (StgWord) nextPE);
+}
+
+//@node Interface to LAGA table, Interface to GALA table, Interface to taskId-PE table, Global Address Manipulation
+//@subsection Interface to LAGA table
+
+/*
+ The local address to global address mapping returns a globalAddr structure
+ (pe task id, slot, weight) for any closure in the local heap which has a
+ global identity. Such closures may be copies of normal form objects with
+ a remote `master' location, @FetchMe@ nodes referencing remote objects, or
+ globally visible objects in the local heap (for which we are the master).
+*/
+
+//@cindex LAGAlookup
+/* Look up the (preferred) global address for local closure @addr@ in
+   the LAtoGALAtable; returns NULL if the closure has no global identity. */
+globalAddr *
+LAGAlookup(addr)
+StgClosure *addr;
+{
+ GALA *gala;
+
+ /* We never look for GA's on indirections. -- unknown hacker
+ Well, in fact at the moment we do in the new RTS. -- HWL
+ ToDo: unwind INDs when entering them into the hash table
+
+ ASSERT(IS_INDIRECTION(addr) == NULL);
+ */
+ if ((gala = lookupHashTable(LAtoGALAtable, (StgWord) addr)) == NULL)
+ return NULL;
+ else
+ return &(gala->ga);
+}
+
+//@node Interface to GALA table, GC functions for GALA tables, Interface to LAGA table, Global Address Manipulation
+//@subsection Interface to GALA table
+
+/*
+ We also manage a mapping of global addresses to local addresses, so that
+ we can ``common up'' multiple references to the same object as they arrive
+ in data packets from remote PEs.
+
+ The global address to local address mapping is actually managed via a
+ ``packed global address'' to GALA hash table. The packed global
+ address takes the interesting part of the @globalAddr@ structure
+ (i.e. the pe and slot fields) and packs them into a single word
+ suitable for hashing.
+*/
+
+//@cindex GALAlookup
+/* Look up the local closure for global address @ga@ in pGAtoGALAtable
+   (keyed by the packed (PE, slot) word); returns NULL if unknown.
+   Indirections are unwound before returning, but the table entry itself
+   is left pointing at the indirection (see comment below for why). */
+StgClosure *
+GALAlookup(ga)
+globalAddr *ga;
+{
+ StgWord pga = PackGA(taskIDtoPE(ga->payload.gc.gtid), ga->payload.gc.slot);
+ GALA *gala;
+
+ if ((gala = (GALA *) lookupHashTable(pGAtoGALAtable, pga)) == NULL)
+ return NULL;
+ else {
+ /*
+ * Bypass any indirections when returning a local closure to
+ * the caller. Note that we do not short-circuit the entry in
+ * the GALA tables right now, because we would have to do a
+ * hash table delete and insert in the LAtoGALAtable to keep
+ * that table up-to-date for preferred GALA pairs. That's
+ * probably a bit expensive.
+ */
+ return UNWIND_IND((StgClosure *)(gala->la));
+ }
+}
+
+/* Mark the GALA entry for @ga@ as non-preferred (e.g. after CommonUp has
+   chosen another GA for the same local closure).  The entry must exist
+   and must currently be preferred -- both are asserted. */
+void
+GALAdeprecate(ga)
+globalAddr *ga;
+{
+ StgWord pga = PackGA(taskIDtoPE(ga->payload.gc.gtid), ga->payload.gc.slot);
+ GALA *gala;
+
+ gala = (GALA *) lookupHashTable(pGAtoGALAtable, pga);
+ ASSERT(gala!=NULL);
+ ASSERT(gala->preferred==rtsTrue);
+ gala->preferred = rtsFalse;
+}
+
+/*
+ External references to our globally-visible closures are managed through an
+ indirection table. The idea is that the closure may move about as the result
+ of local garbage collections, but its global identity is determined by its
+ slot in the indirection table, which never changes.
+
+ The indirection table is maintained implicitly as part of the global
+ address to local address table. We need only keep track of the
+ highest numbered indirection index allocated so far, along with a free
+ list of lower numbered indices no longer in use.
+*/
+
+/*
+ Allocate an indirection slot for the closure currently at address @addr@.
+*/
+
+//@cindex allocIndirection
+/*
+  Allocate an indirection-table slot naming @closure@ globally: reuse an
+  entry (and its slot number) from freeIndirections if possible, else
+  take a fresh GALA and assign it the next slot.  The returned entry is
+  owned by this PE (gtid == mytid) and starts with maximum weight.
+*/
+static GALA *
+allocIndirection(StgClosure *closure)
+{
+ GALA *gala;
+
+ if ((gala = freeIndirections) != NULL) {
+ /* reuse a released slot; sanity builds verify the poison pattern */
+ IF_DEBUG(sanity,
+ ASSERT(gala->ga.weight==0xdead0add);
+ ASSERT(gala->la==(StgPtr)0xdead00aa));
+ freeIndirections = gala->next;
+ } else {
+ gala = allocGALA();
+ IF_DEBUG(sanity,
+ ASSERT(gala->ga.weight==0xdead0add);
+ ASSERT(gala->la==(StgPtr)0xdead00aa));
+ gala->ga.payload.gc.gtid = mytid;
+ gala->ga.payload.gc.slot = nextIndirection++;
+ /* FIX: the barf format string used %d with no matching argument
+ (undefined behavior in a printf-style call); pass MAX_SLOTS. */
+ IF_DEBUG(sanity,
+ if (nextIndirection>=MAX_SLOTS)
+ barf("Cannot handle more than %d slots for GA in a sanity-checking setup (this is no error)", MAX_SLOTS));
+ }
+ gala->ga.weight = MAX_GA_WEIGHT;
+ gala->la = (StgPtr)closure;
+ IF_DEBUG(sanity,
+ gala->next=(struct gala *)0xcccccccc);
+ return gala;
+}
+
+/*
+ Highest indirection slot allocated so far; only used for sanity
+ checking (see LOOKS_LIKE_SLOT).
+*/
+StgInt
+highest_slot (void) { return nextIndirection; }
+
+/*
+ Make a local closure globally visible.
+
+ Called from: GlobaliseAndPackGA
+ Args:
+ closure ... closure to be made visible
+ preferred ... should the new GA become the preferred one (normally true)
+
+ Allocate a GALA structure and add it to the (logical) Indirections table:
+ it is always appended to the liveIndirections list and inserted into
+ pGAtoGALAtable; only if it is preferred does it also replace the old
+ entry in the LAtoGALAtable hash table.
+
+ We have to allocate an indirection slot for it, and update both the local
+ address to global address and global address to local address maps.
+*/
+
+//@cindex makeGlobal
+globalAddr *
+makeGlobal(closure, preferred)
+StgClosure *closure;
+rtsBool preferred;
+{
+ /* check whether we already have a GA for this local closure */
+ GALA *oldGALA = lookupHashTable(LAtoGALAtable, (StgWord) closure);
+ /* create an entry in the LAGA table */
+ GALA *newGALA = allocIndirection(closure);
+ StgWord pga = PackGA(thisPE, newGALA->ga.payload.gc.slot);
+
+ IF_DEBUG(sanity,
+ ASSERT(newGALA->next==(struct gala *)0xcccccccc););
+ // ASSERT(HEAP_ALLOCED(closure)); // check that closure might point into the heap; might be static, though
+ ASSERT(GALAlookup(&(newGALA->ga)) == NULL);
+
+ /* global statistics gathering */
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.local_alloc_GA++;
+ }
+
+ newGALA->la = (StgPtr)closure;
+ newGALA->preferred = preferred;
+
+ if (preferred) {
+ /* The new GA is now the preferred GA for the LA */
+ if (oldGALA != NULL) {
+ oldGALA->preferred = rtsFalse;
+ (void) removeHashTable(LAtoGALAtable, (StgWord) closure, (void *) oldGALA);
+ }
+ insertHashTable(LAtoGALAtable, (StgWord) closure, (void *) newGALA);
+ }
+
+ ASSERT(!isOnLiveIndTable(&(newGALA->ga)));
+ /* put the new GALA entry on the list of live indirections */
+ newGALA->next = liveIndirections;
+ liveIndirections = newGALA;
+
+ insertHashTable(pGAtoGALAtable, pga, (void *) newGALA);
+
+ return &(newGALA->ga);
+}
+
+/*
+ Assign an existing remote global address to an existing closure.
+
+ Called from: Unpack in Pack.c
+ Args:
+ local_closure ... a closure that has just been unpacked
+ remote_ga ... the GA that came with it, ie. the name under which the
+ closure is known while being transferred
+ preferred ... should the new GA become the preferred one (normally true)
+
+ Allocate a GALA structure and add it to the (logical) RemoteGA table:
+ it is always appended to the liveRemoteGAs list and inserted into
+ pGAtoGALAtable; only if it is preferred does it also replace the old
+ entry in the LAtoGALAtable hash table.
+
+ We do not retain the @globalAddr@ structure that's passed in as an argument,
+ so it can be a static in the calling routine.
+*/
+
+//@cindex setRemoteGA
+globalAddr *
+setRemoteGA(local_closure, remote_ga, preferred)
+StgClosure *local_closure;
+globalAddr *remote_ga;
+rtsBool preferred;
+{
+ /* old entry ie the one with the GA generated when sending off the closure */
+ GALA *oldGALA = lookupHashTable(LAtoGALAtable, (StgWord) local_closure);
+ /* alloc new entry and fill it with contents of the newly arrived GA */
+ GALA *newGALA = allocGALA();
+ StgWord pga = PackGA(taskIDtoPE(remote_ga->payload.gc.gtid),
+ remote_ga->payload.gc.slot);
+
+ ASSERT(remote_ga->payload.gc.gtid != mytid);
+ ASSERT(remote_ga->weight > 0);
+ ASSERT(GALAlookup(remote_ga) == NULL);
+
+ newGALA->ga = *remote_ga;
+ newGALA->la = (StgPtr)local_closure;
+ newGALA->preferred = preferred;
+
+ if (preferred) {
+ /* The new GA is now the preferred GA for the LA */
+ if (oldGALA != NULL) {
+ oldGALA->preferred = rtsFalse;
+ (void) removeHashTable(LAtoGALAtable, (StgWord) local_closure, (void *) oldGALA);
+ }
+ insertHashTable(LAtoGALAtable, (StgWord) local_closure, (void *) newGALA);
+ }
+
+ ASSERT(!isOnRemoteGATable(&(newGALA->ga)));
+ /* add new entry to the (logical) RemoteGA table */
+ newGALA->next = liveRemoteGAs;
+ liveRemoteGAs = newGALA;
+
+ insertHashTable(pGAtoGALAtable, pga, (void *) newGALA);
+
+ /*
+ The weight carried by the incoming closure is transferred to the newGALA
+ entry (via the structure assign above). Therefore, we have to give back
+ the weight to the GA on the other processor, because that indirection is
+ no longer needed.
+ */
+ remote_ga->weight = 0;
+ return &(newGALA->ga);
+}
+
+/*
+ Give me a bit of weight to give away on a new reference to a particular
+ global address. If we run down to nothing, we have to assign a new GA.
+ (Two variants below: the original GUM version is kept under #if 0 for
+ reference; the active one uses the Eden fix, which re-globalises once
+ the remaining weight would drop to 2 or less, since weight 1 is not
+ legal in packets.)
+*/
+
+//@cindex splitWeight
+#if 0
+void
+splitWeight(to, from)
+globalAddr *to, *from;
+{
+ /* Make sure we have enough weight to split */
+ if (from->weight!=MAX_GA_WEIGHT && from->weight<=3) // fixed by UK in Eden implementation
+ from = makeGlobal(GALAlookup(from), rtsTrue);
+
+ to->payload = from->payload;
+
+ if (from->weight == MAX_GA_WEIGHT)
+ to->weight = 1L << (BITS_IN(unsigned) - 1);
+ else
+ to->weight = from->weight / 2;
+
+ from->weight -= to->weight;
+}
+#else
+void
+splitWeight(to, from)
+globalAddr *to, *from;
+{
+ /* Make sure we have enough weight to split */
+ /* Splitting at 2 needed, as weight 1 is not legal in packets (UK+KH) */
+
+ if (from->weight / 2 <= 2) /* old: weight== 1 (UK) */
+ from = makeGlobal(GALAlookup(from), rtsTrue);
+
+ to->payload = from->payload;
+
+ if (from->weight <= 1) /* old == 0 (UK) */
+ to->weight = 1L << (BITS_IN(unsigned) - 1);
+ else
+ to->weight = from->weight / 2;
+
+ from->weight -= to->weight;
+}
+#endif
+/*
+ Here, I am returning a bit of weight that a remote PE no longer needs:
+ fold @ga@'s weight back into our stored GALA entry for the same global
+ address and zero the argument's weight.  Returns the stored GA.
+*/
+
+//@cindex addWeight
+globalAddr *
+addWeight(ga)
+globalAddr *ga;
+{
+ StgWord pga;
+ GALA *gala;
+
+ ASSERT(LOOKS_LIKE_GA(ga));
+
+ pga = PackGA(taskIDtoPE(ga->payload.gc.gtid), ga->payload.gc.slot);
+ gala = (GALA *) lookupHashTable(pGAtoGALAtable, pga);
+
+ IF_PAR_DEBUG(weight,
+ fprintf(stderr, "@* Adding weight %x to ", ga->weight);
+ printGA(&(gala->ga));
+ fputc('\n', stderr));
+
+ gala->ga.weight += ga->weight;
+ ga->weight = 0;
+
+ return &(gala->ga);
+}
+
+/*
+ Initialize all of the global address structures: the task ID to PE id
+ map, the local address to global address map, the global address to
+ local address map, and the indirection table (which is implicit in the
+ latter).  Must run before any GA is created or looked up.
+*/
+
+//@cindex initGAtables
+void
+initGAtables(void)
+{
+ taskIDtoPEtable = allocHashTable();
+ LAtoGALAtable = allocHashTable();
+ pGAtoGALAtable = allocHashTable();
+}
+
+//@cindex PackGA
+/* Pack a (PE, slot) pair into a single StgWord suitable for hashing:
+   the PE number goes in the top quarter of the word, the slot in the
+   lower three quarters.  Exits if the slot does not fit. */
+StgWord
+PackGA (pe, slot)
+StgWord pe;
+int slot;
+{
+ int pe_shift = (BITS_IN(StgWord)*3)/4;
+ int pe_bits = BITS_IN(StgWord) - pe_shift;
+
+ if ( pe_bits < 8 || slot >= (1L << pe_shift) ) { /* big trouble */
+ fflush(stdout);
+ fprintf(stderr, "PackGA: slot# too big (%d) or not enough pe_bits (%d)\n",
+ slot,pe_bits);
+ stg_exit(EXIT_FAILURE);
+ }
+
+ return((((StgWord)(pe)) << pe_shift) | ((StgWord)(slot)));
+
+ /* the idea is to use 3/4 of the bits (e.g., 24) for indirection-
+ table "slot", and 1/4 for the pe# (e.g., 8).
+
+ We check for too many bits in "slot", and double-check (at
+ compile-time?) that we have enough bits for "pe". We *don't*
+ check for too many bits in "pe", because SysMan enforces a
+ MAX_PEs limit at the very very beginning.
+
+ Phil & Will 95/08
+ */
+}
+
+//@node GC functions for GALA tables, Debugging routines, Interface to GALA table, Global Address Manipulation
+//@subsection GC functions for GALA tables
+
+/*
+ When we do a copying collection, we want to evacuate all of the local
+ entries in the GALA table for which there are outstanding remote
+ pointers (i.e. for which the weight is not MAX_GA_WEIGHT.)
+ This routine has to be run BEFORE doing the GC proper (it's a
+ ``mark roots'' thing).
+*/
+//@cindex markLocalGAs
+/*
+  GC root-marking pass over liveIndirections (run BEFORE the GC proper).
+  For each locally-owned GALA entry:
+   - if remote references remain (weight != MAX_GA_WEIGHT), evacuate the
+     closure and re-key the LAGA hash table from the old LA to the new;
+   - static closures (CAFs) are simply marked as roots;
+   - otherwise all weight has come home, so the entry is recycled onto
+     freeIndirections and dropped from both hash tables.
+  Side effect: the liveIndirections list comes out reversed.
+*/
+void
+markLocalGAs(rtsBool full)
+{
+  GALA *gala, *next, *prev = NULL;
+  StgPtr old_la, new_la;
+  nat n=0, m=0; // debugging only
+  double start_time_GA; // stats only  (NOTE(review): currently unused)
+
+  IF_PAR_DEBUG(tables,
+	       belch("@@%%%% markLocalGAs (full=%d): Marking LIVE INDIRECTIONS in GALA table starting with GALA at %p\n",
+		     full, liveIndirections);
+	       printLAGAtable());
+
+  PAR_TICKY_MARK_LOCAL_GAS_START();
+
+  for (gala = liveIndirections, m=0; gala != NULL; gala = next, m++) {
+    IF_PAR_DEBUG(tables,
+		 fputs("@@ ",stderr);
+		 printGA(&(gala->ga));
+		 fprintf(stderr, ";@ %d: LA: %p (%s) ",
+			 m, (void*)gala->la, info_type((StgClosure*)gala->la)));
+    next = gala->next;          /* gala may be relinked below; save successor */
+    old_la = gala->la;
+    ASSERT(gala->ga.payload.gc.gtid == mytid); /* it's supposed to be local */
+    if (gala->ga.weight != MAX_GA_WEIGHT) {
+      /* Remote references exist, so we must evacuate the local closure */
+      if (get_itbl((StgClosure *)old_la)->type == EVACUATED) {
+	/* somebody else already evacuated this closure */
+	new_la = (StgPtr)((StgEvacuated *)old_la)->evacuee;
+	IF_PAR_DEBUG(tables,
+		     belch(" already evacuated to %p", new_la));
+      } else {
+#if 1
+	/* unwind any indirections we find */
+	StgClosure *foo = UNWIND_IND((StgClosure *)old_la) ; // debugging only
+	//ASSERT(HEAP_ALLOCED(foo));
+	n++;
+
+	new_la = (StgPtr) MarkRoot(foo);
+	IF_PAR_DEBUG(tables,
+		     belch(" evacuated %p to %p", foo, new_la));
+	/* ToDo: is this the right assertion to check that new_la is in to-space?
+	ASSERT(!HEAP_ALLOCED(new_la) || Bdescr(new_la)->evacuated);
+	*/
+#else
+	new_la = MarkRoot(old_la); // or just evacuate(old_ga)
+	IF_PAR_DEBUG(tables,
+		     belch(" evacuated %p to %p", old_la, new_la));
+#endif
+      }
+
+      gala->la = new_la;
+      /* remove old LA and replace with new LA */
+      if (/* !full && */ gala->preferred && new_la != old_la) {
+	GALA *q;
+	ASSERT(lookupHashTable(LAtoGALAtable, (StgWord)old_la));
+	(void) removeHashTable(LAtoGALAtable, (StgWord) old_la, (void *) gala);
+	if ((q = lookupHashTable(LAtoGALAtable, (StgWord) new_la))!=NULL) {
+	  /* another preferred GA already claims new_la: deprecate it so
+	     each LA keeps exactly one preferred GA */
+	  if (q->preferred && gala->preferred) {
+	    q->preferred = rtsFalse;
+	    IF_PAR_DEBUG(tables,
+			 fprintf(stderr, "@@## found hash entry for closure %p (%s): deprecated GA ",
+				 new_la, info_type((StgClosure*)new_la));
+			 printGA(&(q->ga));
+			 fputc('\n', stderr));
+	  }
+	} else {
+	  insertHashTable(LAtoGALAtable, (StgWord) new_la, (void *) gala);
+	}
+	IF_PAR_DEBUG(tables,
+		     belch("__## Hash table update (%p --> %p): ",
+			   old_la, new_la));
+      }
+
+      gala->next = prev;        /* push onto the (reversed) surviving list */
+      prev = gala;
+    } else if(LOOKS_LIKE_STATIC_CLOSURE(gala->la)) {
+      /* to handle the CAFs, is this all?*/
+      MarkRoot(gala->la);
+      IF_PAR_DEBUG(tables,
+		   belch(" processed static closure"));
+      n++;
+      gala->next = prev;
+      prev = gala;
+    } else {
+      /* Since we have all of the weight, this GA is no longer needed */
+      StgWord pga = PackGA(thisPE, gala->ga.payload.gc.slot);
+
+      IF_PAR_DEBUG(free,
+		   belch("@@!! Freeing slot %d",
+			 gala->ga.payload.gc.slot));
+      /* put gala on free indirections list */
+      gala->next = freeIndirections;
+      freeIndirections = gala;
+      (void) removeHashTable(pGAtoGALAtable, pga, (void *) gala);
+      if (/* !full && */ gala->preferred)
+	(void) removeHashTable(LAtoGALAtable, (W_) gala->la, (void *) gala);
+
+      /* plant dead-markers checked by checkFreeIndirectionsList() */
+      IF_DEBUG(sanity,
+	       gala->ga.weight = 0xdead0add;
+	       gala->la = (StgPtr) 0xdead00aa);
+    }
+  } /* for gala ... */
+  liveIndirections = prev;  /* list has been reversed during the marking */
+
+
+  PAR_TICKY_MARK_LOCAL_GAS_END(n);
+
+  IF_PAR_DEBUG(tables,
+	       belch("@@%%%% markLocalGAs: %d of %d GALAs marked on PE %x",
+		     n, m, mytid));
+}
+
+/*
+ Traverse the GALA table: for every live remote GA check whether it has been
+ touched during GC; if not it is not needed locally and we can free the
+ closure (i.e. let go of its heap space and send a free message to the
+ PE holding its GA).
+ This routine has to be run AFTER doing the GC proper.
+*/
+/*
+  Post-GC pass over liveRemoteGAs (run AFTER the GC proper): a remote GA
+  whose closure was not evacuated is dead locally, so its entry is freed
+  and a FREE message returning the weight is queued to the owning PE.
+  Survivors have their LA updated to the evacuated copy.  The list comes
+  out reversed; finishes by flushing FREE messages and rebuilding the
+  LAGA table.
+*/
+void
+rebuildGAtables(rtsBool full)
+{
+  GALA *gala, *next, *prev;
+  StgClosure *closure;
+  nat n = 0, size_GA = 0; // stats only (no. of GAs, and their heap size in bytes)
+
+  IF_PAR_DEBUG(tables,
+	       belch("@@%%%% rebuildGAtables (full=%d): rebuilding LIVE REMOTE GAs in GALA table starting with GALA at %p\n",
+		     full, liveRemoteGAs));
+
+  PAR_TICKY_REBUILD_GA_TABLES_START();
+
+  prepareFreeMsgBuffers();
+
+  for (gala = liveRemoteGAs, prev = NULL; gala != NULL; gala = next) {
+    IF_PAR_DEBUG(tables,
+		 printGA(&(gala->ga)));
+    next = gala->next;          /* gala may be relinked below; save successor */
+    ASSERT(gala->ga.payload.gc.gtid != mytid); /* it's supposed to be remote */
+
+    closure = (StgClosure *) (gala->la);
+    IF_PAR_DEBUG(tables,
+		 fprintf(stderr, " %p (%s) ",
+			 (StgClosure *)closure, info_type(closure)));
+
+    if (/* !full && */ gala->preferred)
+      (void) removeHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala);
+
+    /* Follow indirection chains to the end, just in case */
+    // should conform with unwinding in markLocalGAs
+    closure = UNWIND_IND(closure);
+
+    /*
+      If closure has been evacuated it is live; otherwise it's dead and we
+      can nuke the GA attached to it in the LAGA table.
+      This approach also drops global aliases for PLCs.
+    */
+
+    //ASSERT(!HEAP_ALLOCED(closure) || !(Bdescr((StgPtr)closure)->evacuated));
+    if (get_itbl(closure)->type == EVACUATED) {
+      closure = ((StgEvacuated *)closure)->evacuee;
+      IF_PAR_DEBUG(tables,
+		   fprintf(stderr, " EVAC %p (%s)\n",
+			   closure, info_type(closure)));
+    } else {
+      /* closure is not alive any more, thus remove GA and send free msg */
+      int pe = taskIDtoPE(gala->ga.payload.gc.gtid);
+      StgWord pga = PackGA(pe, gala->ga.payload.gc.slot);
+
+      /* check that the block containing this closure is not in to-space */
+      IF_PAR_DEBUG(tables,
+		   fprintf(stderr, " !EVAC %p (%s); sending free to PE %d\n",
+			   closure, info_type(closure), pe));
+
+      (void) removeHashTable(pGAtoGALAtable, pga, (void *) gala);
+      freeRemoteGA(pe-1, &(gala->ga)); //-1 cause ids start at 1... not 0
+      gala->next = freeGALAList;
+      freeGALAList = gala;
+      /* plant dead-markers checked by checkFreeGALAList() */
+      IF_DEBUG(sanity,
+	       gala->ga.weight = 0xdead0add;
+	       gala->la = (StgPtr)0xdead00aa);
+      continue;
+    }
+    gala->la = (StgPtr)closure;
+    if (/* !full && */ gala->preferred) {
+      GALA *q;
+      if ((q = lookupHashTable(LAtoGALAtable, (StgWord) gala->la))!=NULL) {
+	/* two preferred GAs for one LA after IND short-cutting:
+	   deprecate the one already in the table */
+	if (q->preferred && gala->preferred) {
+	  q->preferred = rtsFalse;
+	  IF_PAR_DEBUG(tables,
+		       fprintf(stderr, "@@## found hash entry for closure %p (%s): deprecated GA ",
+			       gala->la, info_type((StgClosure*)gala->la));
+		       printGA(&(q->ga));
+		       fputc('\n', stderr));
+	}
+      } else {
+	insertHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala);
+      }
+    }
+    gala->next = prev;          /* push onto the (reversed) surviving list */
+    prev = gala;
+    /* Global statistics: count GAs and total size
+    if (RtsFlags.ParFlags.ParStats.Global &&
+	RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+      StgInfoTable *info;
+      nat size, ptrs, nonptrs, vhs, i;
+      char str[80];
+
+      info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+
+      size_GA += size ;
+      n++; // stats: count number of GAs we add to the new table
+    }
+    */
+  }
+  liveRemoteGAs = prev; /* list is reversed during marking */
+
+  /* If we have any remaining FREE messages to send off, do so now */
+  sendFreeMessages();
+
+  PAR_TICKY_CNT_FREE_GA();
+
+  IF_DEBUG(sanity,
+	   checkFreeGALAList();
+	   checkFreeIndirectionsList());
+
+  rebuildLAGAtable();
+
+#if defined(PAR_TICKY)
+  getLAGAtableSize(&n, &size_GA); // determine no of GAs and global heap
+  PAR_TICKY_REBUILD_GA_TABLES_END(n, size_GA); // record these values
+#endif
+
+  IF_PAR_DEBUG(tables,
+	       belch("@#%%%% rebuildGAtables: After ReBuilding GALA table starting with GALA at %p",
+		     liveRemoteGAs);
+	       printLAGAtable());
+}
+
+/*
+ Rebuild the LA->GA table, assuming that the addresses in the GALAs are
+ correct.
+ A word on the lookupHashTable check in both loops:
+ After GC we may end up with 2 preferred GAs for the same LA! For example,
+ if we received a closure whose GA already exists on this PE we CommonUp
+ both closures, making one an indirection to the other. Before GC everything
+ is fine: one preferred GA refers to the IND, the other preferred GA refers
+ to the closure it points to. After GC, however, we have short cutted the
+ IND and suddenly we have 2 preferred GAs for the same closure. We detect
+ this case in the loop below and deprecate one GA, so that we always just
+ have one preferred GA per LA.
+*/
+
+//@cindex rebuildLAGAtable
+/*
+  Rebuild the LA->GA hash table from scratch out of the (already
+  updated) liveIndirections and liveRemoteGAs lists.  If two preferred
+  GAs map to the same LA -- possible after GC short-cuts an IND, see the
+  comment above -- the one found in the table first is deprecated, so
+  each LA ends up with exactly one preferred GA.
+*/
+void
+rebuildLAGAtable(void)
+{
+  GALA *gala;
+  nat n=0, m=0; // debugging
+
+  /* The old LA->GA table is worthless */
+  freeHashTable(LAtoGALAtable, NULL);
+  LAtoGALAtable = allocHashTable();
+
+  IF_PAR_DEBUG(tables,
+	       belch("@@%%%% rebuildLAGAtable: new LAGA table at %p",
+		     LAtoGALAtable));
+
+  /* first pass: locally-owned entries */
+  for (gala = liveIndirections; gala != NULL; gala = gala->next) {
+    n++;
+    if (gala->preferred) {
+      GALA *q;
+      if ((q = lookupHashTable(LAtoGALAtable, (StgWord) gala->la))!=NULL) {
+	if (q->preferred && gala->preferred) {
+	  /* this deprecates q (see also GALAdeprecate) */
+	  q->preferred = rtsFalse;
+	  (void) removeHashTable(LAtoGALAtable, (StgWord) gala->la, (void *)q);
+	  IF_PAR_DEBUG(tables,
+		       fprintf(stderr, "@@## found hash entry for closure %p (%s): deprecated GA ",
+			       gala->la, info_type((StgClosure*)gala->la));
+		       printGA(&(q->ga));
+		       fputc('\n', stderr));
+	}
+      }
+      insertHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala);
+    }
+  }
+
+  /* second pass: remotely-owned entries, same deprecation policy */
+  for (gala = liveRemoteGAs; gala != NULL; gala = gala->next) {
+    m++;
+    if (gala->preferred) {
+      GALA *q;
+      if ((q = lookupHashTable(LAtoGALAtable, (StgWord) gala->la))!=NULL) {
+	if (q->preferred && gala->preferred) {
+	  /* this deprecates q (see also GALAdeprecate) */
+	  q->preferred = rtsFalse;
+	  (void) removeHashTable(LAtoGALAtable, (StgWord) gala->la, (void *)q);
+	  IF_PAR_DEBUG(tables,
+		       fprintf(stderr, "@@## found hash entry for closure %p (%s): deprecated GA ",
+			       (StgClosure*)gala->la, info_type((StgClosure*)gala->la));
+		       printGA(&(q->ga));
+		       fputc('\n', stderr));
+	}
+      }
+      insertHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala);
+    }
+  }
+
+  IF_PAR_DEBUG(tables,
+	       belch("@@%%%% rebuildLAGAtable: inserted %d entries from liveIndirections and %d entries from liveRemoteGAs",
+		     n,m));
+}
+
+/*
+ Determine the size of the LAGA and GALA tables.
+ Has to be done after rebuilding the tables.
+ Only used for global statistics gathering.
+*/
+
+//@cindex getLAGAtableSize
+/*
+  Compute the number of distinct global closures (*nP) and their total
+  heap size (*sizeP) across both the liveIndirections and liveRemoteGAs
+  lists.  A scratch hash table de-duplicates closures reachable from
+  both lists.  Must run after the GALA tables have been rebuilt; used
+  for global statistics gathering only.
+*/
+void
+getLAGAtableSize(nat *nP, nat *sizeP)
+{
+  GALA *gala;
+  StgClosure *closure;
+  nat size, ptrs, nonptrs, vhs;   /* unused `info' and `i' locals removed */
+  char str[80];
+  /* In order to avoid counting closures twice we maintain a hash table
+     of all closures seen so far.
+     ToDo: collect this data while rebuilding the GALA table and make use
+     of the existing hash tables;
+  */
+  HashTable *closureTable; // hash table for closures encountered already
+
+  closureTable = allocHashTable();
+
+  (*nP) = (*sizeP) = 0;
+  for (gala = liveIndirections; gala != NULL; gala = gala->next) {
+    closure = (StgClosure*) gala->la;
+    if (lookupHashTable(closureTable, (StgWord)closure)==NULL) { // not seen yet
+      insertHashTable(closureTable, (StgWord)closure, (void *)1);
+      /* only the size out-parameter is needed; info-table result dropped */
+      get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+      (*sizeP) += size ; // stats: measure total heap size of global closures
+      (*nP)++;           // stats: count number of GAs
+    }
+  }
+
+  for (gala = liveRemoteGAs; gala != NULL; gala = gala->next) {
+    closure = (StgClosure*) gala->la;
+    if (lookupHashTable(closureTable, (StgWord)closure)==NULL) { // not seen yet
+      insertHashTable(closureTable, (StgWord)closure, (void *)1);
+      get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+      (*sizeP) += size ; // stats: measure total heap size of global closures
+      (*nP)++;           // stats: count number of GAs
+    }
+  }
+
+  freeHashTable(closureTable, NULL);
+}
+
+//@node Debugging routines, Index, GC functions for GALA tables, Global Address Manipulation
+//@subsection Debugging routines
+
+//@cindex printGA
+/* Print a global address as ((gtid, slot, weight)) to stderr.
+   NOTE(review): %x/%d assume gtid, slot and weight fit an int/unsigned
+   int -- may truncate where the underlying fields are wider; confirm. */
+void
+printGA (globalAddr *ga)
+{
+  fprintf(stderr, "((%x, %d, %x))",
+	  ga->payload.gc.gtid,
+	  ga->payload.gc.slot,
+	  ga->weight);
+}
+
+//@cindex printGALA
+/* Print one GALA entry to stderr: the GA, its local address with the
+   closure's type name, and whether this is the preferred GA for the LA. */
+void
+printGALA (GALA *gala)
+{
+  printGA(&(gala->ga));
+  fprintf(stderr, " -> %p (%s)",
+	  (StgClosure*)gala->la, info_type((StgClosure*)gala->la));
+  fprintf(stderr, " %s",
+	  (gala->preferred) ? "PREF" : "____");
+}
+
+/*
+ Printing the LA->GA table.
+*/
+
+//@cindex printLiveIndTable
+/* Dump the liveIndirections list; each line is flagged with the state of
+   its LA in the LAGA hash table: "====" hashed to this entry, "####"
+   hashed to a different entry, "...." not hashed at all. */
+void
+printLiveIndTable(void)
+{
+  GALA *gala, *q;
+  nat n=0; // debugging
+
+  belch("@@%%%%:: logical LiveIndTable (%p) (liveIndirections=%p):",
+	LAtoGALAtable, liveIndirections);
+
+  for (gala = liveIndirections; gala != NULL; gala = gala->next) {
+    n++;
+    printGALA(gala);
+    /* check whether this gala->la is hashed into the LAGA table */
+    q = lookupHashTable(LAtoGALAtable, (StgWord)(gala->la));
+    fprintf(stderr, "\t%s\n", (q==NULL) ? "...." : (q==gala) ? "====" : "####");
+    //ASSERT(lookupHashTable(LAtoGALAtable, (StgWord)(gala->la)));
+  }
+  belch("@@%%%%:: %d live indirections",
+	n);
+}
+
+/* Dump the liveRemoteGAs list; each line is flagged with the state of
+   its LA in the LAGA hash table: "====" hashed to this entry, "####"
+   hashed to a different entry, "...." not hashed at all. */
+void
+printRemoteGATable(void)
+{
+  GALA *entry, *hashed;
+  nat count = 0; // debugging
+
+  belch("@@%%%%:: logical RemoteGATable (%p) (liveRemoteGAs=%p):",
+	LAtoGALAtable, liveRemoteGAs);
+
+  entry = liveRemoteGAs;
+  while (entry != NULL) {
+    count++;
+    printGALA(entry);
+    /* is this entry's LA hashed into the LAGA table, and to whom? */
+    hashed = lookupHashTable(LAtoGALAtable, (StgWord)(entry->la));
+    fprintf(stderr, "\t%s\n", (hashed==NULL) ? "...." : (hashed==entry) ? "====" : "####");
+    // ASSERT(lookupHashTable(LAtoGALAtable, (StgWord)(entry->la)));
+    entry = entry->next;
+  }
+  belch("@@%%%%:: %d remote GAs",
+	count);
+}
+
+//@cindex printLAGAtable
+/* Dump the whole logical LAGA table: a header line followed by the
+   live indirections and the live remote GAs. */
+void
+printLAGAtable(void)
+{
+  belch("@@%%: LAGAtable (%p) with liveIndirections=%p, liveRemoteGAs=%p:",
+	LAtoGALAtable, liveIndirections, liveRemoteGAs);
+
+  printLiveIndTable();
+  printRemoteGATable();
+}
+
+/*
+  rtsTrue iff a GA with identical weight, slot and task id already sits
+  on the liveIndirections list.
+*/
+rtsBool
+isOnLiveIndTable(globalAddr *ga)
+{
+  GALA *cursor = liveIndirections;
+
+  while (cursor != NULL) {
+    if (cursor->ga.weight == ga->weight &&
+	cursor->ga.payload.gc.slot == ga->payload.gc.slot &&
+	cursor->ga.payload.gc.gtid == ga->payload.gc.gtid)
+      return rtsTrue;
+    cursor = cursor->next;
+  }
+  return rtsFalse;
+}
+
+/* rtsTrue iff a GA with identical weight, slot and task id already sits
+   on the liveRemoteGAs list. */
+rtsBool
+isOnRemoteGATable(globalAddr *ga)
+{
+  GALA *cursor = liveRemoteGAs;
+
+  while (cursor != NULL) {
+    if (cursor->ga.weight == ga->weight &&
+	cursor->ga.payload.gc.slot == ga->payload.gc.slot &&
+	cursor->ga.payload.gc.gtid == ga->payload.gc.gtid)
+      return rtsTrue;
+    cursor = cursor->next;
+  }
+  return rtsFalse;
+}
+
+/*
+  Sanity check for free lists: every entry parked on freeGALAList must
+  still carry the "dead" marker values planted when it was freed.
+*/
+void
+checkFreeGALAList(void) {
+  GALA *cursor;
+
+  for (cursor = freeGALAList; cursor != NULL; cursor = cursor->next) {
+    ASSERT(cursor->ga.weight==0xdead0add);
+    ASSERT(cursor->la==(StgPtr)0xdead00aa);
+  }
+}
+
+/* Sanity check: every recycled entry on freeIndirections must still
+   carry the "dead" marker values planted when it was freed. */
+void
+checkFreeIndirectionsList(void) {
+  GALA *cursor;
+
+  for (cursor = freeIndirections; cursor != NULL; cursor = cursor->next) {
+    ASSERT(cursor->ga.weight==0xdead0add);
+    ASSERT(cursor->la==(StgPtr)0xdead00aa);
+  }
+}
+#endif /* PAR -- whole file */
+
+//@node Index, , Debugging routines, Global Address Manipulation
+//@subsection Index
+
+//@index
+//* DebugPrintLAGAtable:: @cindex\s-+DebugPrintLAGAtable
+//* GALAlookup:: @cindex\s-+GALAlookup
+//* LAGAlookup:: @cindex\s-+LAGAlookup
+//* LAtoGALAtable:: @cindex\s-+LAtoGALAtable
+//* PackGA:: @cindex\s-+PackGA
+//* addWeight:: @cindex\s-+addWeight
+//* allocGALA:: @cindex\s-+allocGALA
+//* allocIndirection:: @cindex\s-+allocIndirection
+//* freeIndirections:: @cindex\s-+freeIndirections
+//* initGAtables:: @cindex\s-+initGAtables
+//* liveIndirections:: @cindex\s-+liveIndirections
+//* liveRemoteGAs:: @cindex\s-+liveRemoteGAs
+//* makeGlobal:: @cindex\s-+makeGlobal
+//* markLocalGAs:: @cindex\s-+markLocalGAs
+//* nextIndirection:: @cindex\s-+nextIndirection
+//* pGAtoGALAtable:: @cindex\s-+pGAtoGALAtable
+//* printGA:: @cindex\s-+printGA
+//* printGALA:: @cindex\s-+printGALA
+//* rebuildLAGAtable:: @cindex\s-+rebuildLAGAtable
+//* registerTask:: @cindex\s-+registerTask
+//* setRemoteGA:: @cindex\s-+setRemoteGA
+//* splitWeight:: @cindex\s-+splitWeight
+//* taskIDtoPE:: @cindex\s-+taskIDtoPE
+//* taskIDtoPEtable:: @cindex\s-+taskIDtoPEtable
+//* thisPE:: @cindex\s-+thisPE
+//@end index
diff --git a/rts/parallel/GranSim.c b/rts/parallel/GranSim.c
new file mode 100644
index 0000000000..b1cc0962be
--- /dev/null
+++ b/rts/parallel/GranSim.c
@@ -0,0 +1,3015 @@
+/*
+ Time-stamp: <Tue Mar 06 2001 00:17:42 Stardate: [-30]6285.06 hwloidl>
+
+ Variables and functions specific to GranSim the parallelism simulator
+ for GPH.
+*/
+
+//@node GranSim specific code, , ,
+//@section GranSim specific code
+
+/*
+ Macros for dealing with the new and improved GA field for simulating
+ parallel execution. Based on @CONCURRENT@ package. The GA field now
+ contains a mask, where the n-th bit stands for the n-th processor, where
+ this data can be found. In case of multiple copies, several bits are
+ set. The total number of processors is bounded by @MAX_PROC@, which
+ should be <= the length of a word in bits. -- HWL
+*/
+
+//@menu
+//* Includes::
+//* Prototypes and externs::
+//* Constants and Variables::
+//* Initialisation::
+//* Global Address Operations::
+//* Global Event Queue::
+//* Spark queue functions::
+//* Scheduling functions::
+//* Thread Queue routines::
+//* GranSim functions::
+//* GranSimLight routines::
+//* Code for Fetching Nodes::
+//* Idle PEs::
+//* Routines directly called from Haskell world::
+//* Emiting profiling info for GrAnSim::
+//* Dumping routines::
+//* Index::
+//@end menu
+
+//@node Includes, Prototypes and externs, GranSim specific code, GranSim specific code
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "StgMiscClosures.h"
+#include "StgTypes.h"
+#include "Schedule.h"
+#include "SchedAPI.h" // for pushClosure
+#include "GranSimRts.h"
+#include "GranSim.h"
+#include "ParallelRts.h"
+#include "ParallelDebug.h"
+#include "Sparks.h"
+#include "Storage.h" // for recordMutable
+
+
+//@node Prototypes and externs, Constants and Variables, Includes, GranSim specific code
+//@subsection Prototypes and externs
+
+#if defined(GRAN)
+
+/* Prototypes */
+static inline PEs ga_to_proc(StgWord);
+static inline rtsBool any_idle(void);
+static inline nat idlers(void);
+ PEs where_is(StgClosure *node);
+
+static rtsBool stealSomething(PEs proc, rtsBool steal_spark, rtsBool steal_thread);
+static rtsBool stealSpark(PEs proc);
+static rtsBool stealThread(PEs proc);
+static rtsBool stealSparkMagic(PEs proc);
+static rtsBool stealThreadMagic(PEs proc);
+/* subsumed by stealSomething
+static void stealThread(PEs proc);
+static void stealSpark(PEs proc);
+*/
+static rtsTime sparkStealTime(void);
+static nat natRandom(nat from, nat to);
+static PEs findRandomPE(PEs proc);
+static void sortPEsByTime (PEs proc, PEs *pes_by_time,
+ nat *firstp, nat *np);
+
+void GetRoots(void);
+
+#endif /* GRAN */
+
+//@node Constants and Variables, Initialisation, Prototypes and externs, GranSim specific code
+//@subsection Constants and Variables
+
+#if defined(GRAN) || defined(PAR)
+/* See GranSim.h for the definition of the enum gran_event_types */
+char *gran_event_names[] = {
+ "START", "START(Q)",
+ "STEALING", "STOLEN", "STOLEN(Q)",
+ "FETCH", "REPLY", "BLOCK", "RESUME", "RESUME(Q)",
+ "SCHEDULE", "DESCHEDULE",
+ "END",
+ "SPARK", "SPARKAT", "USED", "PRUNED", "EXPORTED", "ACQUIRED",
+ "ALLOC",
+ "TERMINATE",
+ "SYSTEM_START", "SYSTEM_END", /* only for debugging */
+ "??"
+};
+#endif
+
+#if defined(GRAN) /* whole file */
+char *proc_status_names[] = {
+ "Idle", "Sparking", "Starting", "Fetching", "Fishing", "Busy",
+ "UnknownProcStatus"
+};
+
+/* For internal use (event statistics) only */
+char *event_names[] =
+ { "ContinueThread", "StartThread", "ResumeThread",
+ "MoveSpark", "MoveThread", "FindWork",
+ "FetchNode", "FetchReply",
+ "GlobalBlock", "UnblockThread"
+ };
+
+//@cindex CurrentProc
+PEs CurrentProc = 0;
+
+/*
+ ToDo: Create a structure for the processor status and put all the
+ arrays below into it.
+ -- HWL */
+
+//@cindex CurrentTime
+/* One clock for each PE */
+rtsTime CurrentTime[MAX_PROC];
+
+/* Useful to restrict communication; cf fishing model in GUM */
+nat OutstandingFetches[MAX_PROC], OutstandingFishes[MAX_PROC];
+
+/* Status of each PE (new since but independent of GranSim Light) */
+rtsProcStatus procStatus[MAX_PROC];
+
+# if defined(GRAN) && defined(GRAN_CHECK)
+/* To check if the RTS ever tries to run a thread that should be blocked
+ because of fetching remote data */
+StgTSO *BlockedOnFetch[MAX_PROC];
+# define FETCH_MASK_TSO 0x08000000 /* only bits 0, 1, 2 should be used */
+# endif
+
+nat SparksAvail = 0; /* How many sparks are available */
+nat SurplusThreads = 0; /* How many excess threads are there */
+
+/* Do we need to reschedule following a fetch? */
+rtsBool NeedToReSchedule = rtsFalse, IgnoreEvents = rtsFalse, IgnoreYields = rtsFalse;
+rtsTime TimeOfNextEvent, TimeOfLastEvent, EndOfTimeSlice; /* checked from the threaded world! */
+
+//@cindex spark queue
+/* GranSim: a globally visible array of spark queues */
+rtsSparkQ pending_sparks_hds[MAX_PROC];
+rtsSparkQ pending_sparks_tls[MAX_PROC];
+
+nat sparksIgnored = 0, sparksCreated = 0;
+
+GlobalGranStats globalGranStats;
+
+nat gran_arith_cost, gran_branch_cost, gran_load_cost,
+ gran_store_cost, gran_float_cost;
+
+/*
+Old comment from 0.29. ToDo: Check and update -- HWL
+
+The following variables control the behaviour of GrAnSim. In general, there
+is one RTS option for enabling each of these features. In getting the
+desired setup of GranSim the following questions have to be answered:
+\begin{itemize}
+\item {\em Which scheduling algorithm} to use (@RtsFlags.GranFlags.DoFairSchedule@)?
+ Currently only unfair scheduling is supported.
+\item What to do when remote data is fetched (@RtsFlags.GranFlags.DoAsyncFetch@)?
+ Either block and wait for the
+ data or reschedule and do some other work.
+ Thus, if this variable is true, asynchronous communication is
+ modelled. Block on fetch mainly makes sense for incremental fetching.
+
+ There is also a simplified fetch variant available
+ (@RtsFlags.GranFlags.SimplifiedFetch@). This variant does not use events to model
+ communication. It is faster but the results will be less accurate.
+\item How aggressive to be in getting work after a reschedule on fetch
+ (@RtsFlags.GranFlags.FetchStrategy@)?
+ This is determined by the so-called {\em fetching
+ strategy\/}. Currently, there are four possibilities:
+ \begin{enumerate}
+ \item Only run a runnable thread.
+ \item Turn a spark into a thread, if necessary.
+ \item Steal a remote spark, if necessary.
+ \item Steal a runnable thread from another processor, if necessary.
+ \end{itemize}
+ The variable @RtsFlags.GranFlags.FetchStrategy@ determines how far to go in this list
+ when rescheduling on a fetch.
+\item Should sparks or threads be stolen first when looking for work
+ (@RtsFlags.GranFlags.DoStealThreadsFirst@)?
+ The default is to steal sparks first (much cheaper).
+\item Should the RTS use a lazy thread creation scheme
+ (@RtsFlags.GranFlags.DoAlwaysCreateThreads@)? By default yes i.e.\ sparks are only
+ turned into threads when work is needed. Also note, that sparks
+ can be discarded by the RTS (this is done in the case of an overflow
+ of the spark pool). Setting @RtsFlags.GranFlags.DoAlwaysCreateThreads@ to @True@ forces
+ the creation of threads at the next possibility (i.e.\ when new work
+ is demanded the next time).
+\item Should data be fetched closure-by-closure or in packets
+ (@RtsFlags.GranFlags.DoBulkFetching@)? The default strategy is a GRIP-like incremental
+ (i.e.\ closure-by-closure) strategy. This makes sense in a
+ low-latency setting but is bad in a high-latency system. Setting
+ @RtsFlags.GranFlags.DoBulkFetching@ to @True@ enables bulk (packet) fetching. Other
+ parameters determine the size of the packets (@pack_buffer_size@) and the number of
+ thunks that should be put into one packet (@RtsFlags.GranFlags.ThunksToPack@).
+\item If there is no other possibility to find work, should runnable threads
+ be moved to an idle processor (@RtsFlags.GranFlags.DoThreadMigration@)? In any case, the
+ RTS tried to get sparks (either local or remote ones) first. Thread
+ migration is very expensive, since a whole TSO has to be transferred
+ and probably data locality becomes worse in the process. Note, that
+ the closure, which will be evaluated next by that TSO is not
+ transferred together with the TSO (that might block another thread).
+\item Should the RTS distinguish between sparks created by local nodes and
+ stolen sparks (@RtsFlags.GranFlags.PreferSparksOfLocalNodes@)? The idea is to improve
+ data locality by preferring sparks of local nodes (it is more likely
+ that the data for those sparks is already on the local processor).
+ However, such a distinction also imposes an overhead on the spark
+ queue management, and typically a large number of sparks are
+ generated during execution. By default this variable is set to @False@.
+\item Should the RTS use granularity control mechanisms? The idea of a
+ granularity control mechanism is to make use of granularity
+ information provided via annotation of the @par@ construct in order
+ to prefer bigger threads when either turning a spark into a thread or
+ when choosing the next thread to schedule. Currently, three such
+ mechanisms are implemented:
+ \begin{itemize}
+ \item Cut-off: The granularity information is interpreted as a
+ priority. If a threshold priority is given to the RTS, then
+ only those sparks with a higher priority than the threshold
+ are actually created. Other sparks are immediately discarded.
+ This is similar to a usual cut-off mechanism often used in
+ parallel programs, where parallelism is only created if the
+ input data is large enough. With this option, the choice is
+ hidden in the RTS and only the threshold value has to be
+ provided as a parameter to the runtime system.
+ \item Priority Sparking: This mechanism keeps priorities for sparks
+ and chooses the spark with the highest priority when turning
+ a spark into a thread. After that the priority information is
+ discarded. The overhead of this mechanism comes from
+ maintaining a sorted spark queue.
+ \item Priority Scheduling: This mechanism keeps the granularity
+ information for threads, too. Thus, on each reschedule the
+ largest thread is chosen. This mechanism has a higher
+ overhead, as the thread queue is sorted, too.
+ \end{itemize}
+\end{itemize}
+*/
+
+//@node Initialisation, Global Address Operations, Constants and Variables, GranSim specific code
+//@subsection Initialisation
+
+/*
+  Reset all global GranSim statistics to zero.  The single memset
+  suffices; the field-by-field version is kept (disabled) for reference.
+  NOTE(review): inside the #if 0 block several statements end in ','
+  rather than ';' (comma operator) -- harmless while dead, fix if ever
+  re-enabled; `i' is also undeclared there.
+*/
+void
+init_gr_stats (void) {
+  memset(&globalGranStats, '\0', sizeof(GlobalGranStats));
+#if 0
+  /* event stats */
+  globalGranStats.noOfEvents = 0;
+  for (i=0; i<MAX_EVENT; i++) globalGranStats.event_counts[i]=0;
+
+  /* communication stats */
+  globalGranStats.fetch_misses = 0;
+  globalGranStats.tot_low_pri_sparks = 0;
+
+  /* obscure stats */
+  globalGranStats.rs_sp_count = 0;
+  globalGranStats.rs_t_count = 0;
+  globalGranStats.ntimes_total = 0,
+  globalGranStats.fl_total = 0;
+  globalGranStats.no_of_steals = 0;
+
+  /* spark queue stats */
+  globalGranStats.tot_sq_len = 0,
+  globalGranStats.tot_sq_probes = 0;
+  globalGranStats.tot_sparks = 0;
+  globalGranStats.withered_sparks = 0;
+  globalGranStats.tot_add_threads = 0;
+  globalGranStats.tot_tq_len = 0;
+  globalGranStats.non_end_add_threads = 0;
+
+  /* thread stats */
+  globalGranStats.tot_threads_created = 0;
+  for (i=0; i<MAX_PROC; i++) globalGranStats.threads_created_on_PE[i]=0;
+#endif /* 0 */
+}
+
+//@node Global Address Operations, Global Event Queue, Initialisation, GranSim specific code
+//@subsection Global Address Operations
+/*
+ ----------------------------------------------------------------------
+ Global Address Operations
+
+ These functions perform operations on the global-address (ga) part of a
+ closure. The ga is the only new field (1 word) in a closure introduced by
+ GrAnSim. It serves as a bitmask, indicating on which processor the
+ closure is residing. Since threads are described by Thread State Object
+ (TSO), which is nothing but another kind of closure, this scheme
+ gives placement information about threads.
+
+ A ga is just a bitmask, so the operations on them are mainly bitmask
+ manipulating functions. Note, that there are important macros like PROCS,
+ IS_LOCAL_TO etc. They are defined in @GrAnSim.lh@.
+
+ NOTE: In GrAnSim-light we don't maintain placement information. This
+ allows us to simulate an arbitrary number of processors. The price we have
+ to pay is the lack of costing any communication properly. In short,
+ GrAnSim-light is meant to reveal the maximal parallelism in a program.
+ From an implementation point of view the important thing is: {\em
+ GrAnSim-light does not maintain global-addresses}. */
+
+/* ga_to_proc returns the first processor marked in the bitmask ga.
+ Normally only one bit in ga should be set. But for PLCs all bits
+ are set. That shouldn't hurt since we only need IS_LOCAL_TO for PLCs */
+
+//@cindex ga_to_proc
+
+/* Return the first PE whose bit is set in the bitmask ga (for PLCs,
+   where all bits are set, that is PE 0).  Asserts some bit is set. */
+static inline PEs
+ga_to_proc(StgWord ga)
+{
+  PEs i;
+  for (i = 0; i < RtsFlags.GranFlags.proc && !IS_LOCAL_TO(ga, i); i++);
+  ASSERT(i<RtsFlags.GranFlags.proc);
+  return (i);
+}
+
+/* NB: This takes a *node* rather than just a ga as input */
+//@cindex where_is
+/* PE on which `node' resides: the first bit set in its PROCS bitmask. */
+PEs
+where_is(StgClosure *node)
+{ return (ga_to_proc(PROCS(node))); }
+
+// debugging only
+//@cindex is_unique
+/*
+  rtsTrue iff node lives on exactly one PE.  The ASSERT fires if node
+  resides on no PE at all (the loop then falls through with
+  unique==rtsFalse instead of returning).
+*/
+rtsBool
+is_unique(StgClosure *node)
+{
+  PEs i;
+  rtsBool unique = rtsFalse;
+
+  /* braces added: the else previously relied on dangling-else binding */
+  for (i = 0; i < RtsFlags.GranFlags.proc ; i++) {
+    if (IS_LOCAL_TO(PROCS(node), i)) {
+      if (unique)          // exactly 1 instance found so far
+	return rtsFalse;   // found a 2nd instance => not unique
+      else
+	unique = rtsTrue;  // found 1st instance
+    }
+  }
+  ASSERT(unique);          // otherwise returned from within loop
+  return (unique);
+}
+
+//@cindex any_idle
+/*
+  rtsTrue iff at least one PE is Idle.
+  (Equivalent to: any (map (\ i -> procStatus[i] == Idle)) [0..proc-1])
+*/
+static inline rtsBool
+any_idle(void) {
+  PEs i;
+  rtsBool found = rtsFalse;
+
+  for (i = 0; !found && i < RtsFlags.GranFlags.proc; i++)
+    found = (procStatus[i] == Idle);
+
+  /* BUG FIX: the original computed the flag but fell off the end of a
+     non-void function without returning it (undefined behaviour). */
+  return found;
+}
+
+//@cindex idlers
+/* Count the PEs whose status is currently Idle. */
+static inline nat
+idlers(void) {
+  PEs pe;
+  nat count = 0;
+
+  for (pe = 0; pe < RtsFlags.GranFlags.proc; pe++) {
+    if (procStatus[pe] == Idle)
+      count++;
+  }
+  return count;
+}
+
+//@node Global Event Queue, Spark queue functions, Global Address Operations, GranSim specific code
+//@subsection Global Event Queue
+/*
+The following routines implement an ADT of an event-queue (FIFO).
+ToDo: Put that in an own file(?)
+*/
+
+/* Pointer to the global event queue; events are currently malloc'ed */
+rtsEventQ EventHd = NULL;
+
+//@cindex get_next_event
+/*
+  Pop and return the head of the global event queue.  The returned
+  event stays valid until the NEXT call, at which point it is freed
+  (the static `entry' remembers the previously handed-out event).
+  Barfs if the queue is empty -- typically a circular data dependency.
+*/
+rtsEvent *
+get_next_event(void)
+{
+  static rtsEventQ entry = NULL;
+
+  if (EventHd == NULL) {
+    barf("No next event. This may be caused by a circular data dependency in the program.");
+  }
+
+  /* free the event handed out on the previous call */
+  if (entry != NULL)
+    free((char *)entry);
+
+  if (RtsFlags.GranFlags.GranSimStats.Global) { /* count events */
+    globalGranStats.noOfEvents++;
+    globalGranStats.event_counts[EventHd->evttype]++;
+  }
+
+  entry = EventHd;
+
+  IF_GRAN_DEBUG(event_trace,
+		print_event(entry));
+
+  EventHd = EventHd->next;
+  return(entry);
+}
+
+/*
+  Time of the first pending event that is NOT a ContinueThread; returns
+  0 when only ContinueThread events (or nothing) are queued.  We skip
+  ContinueThread so the current time slice is not cut short unless
+  something important is pending.
+*/
+//@cindex get_time_of_next_event
+rtsTime
+get_time_of_next_event(void)
+{
+  rtsEventQ ev;
+
+  for (ev = EventHd; ev != NULL; ev = ev->next) {
+    if (ev->evttype != ContinueThread)
+      return (ev->time);
+  }
+  return ((rtsTime) 0);
+}
+
/* ToDo: replace malloc/free with a free list */
//@cindex insert_event
/* Insert newentry into the global event queue, keeping the queue sorted by
   event time.  Tie-breaking on equal times depends on the event type (see
   the switch below): FindWork goes after everything except ContinueThread,
   ContinueThread goes after everything, other events go after events of the
   same type.  Note the control flow inside the loop: `continue' advances the
   scan to the next queue element, while `break' leaves the switch and falls
   through to the insertion code (insert before the current element). */
void
insert_event(newentry)
rtsEvent *newentry;
{
  rtsEventType evttype = newentry->evttype;
  rtsEvent *event, **prev;

  /* if(evttype >= CONTINUETHREAD1) evttype = CONTINUETHREAD; */

  /* Search the queue and insert at the right point:
     FINDWORK before everything, CONTINUETHREAD after everything.

     This ensures that we find any available work after all threads have
     executed the current cycle.  This level of detail would normally be
     irrelevant, but matters for ridiculously low latencies...
  */

  /* Changed the ordering: Now FINDWORK comes after everything but
     CONTINUETHREAD. This makes sure that a MOVESPARK comes before a
     FINDWORK. This is important when a GranSimSparkAt happens and
     DoAlwaysCreateThreads is turned on. Also important if a GC occurs
     when trying to build a new thread (see much_spark) -- HWL 02/96 */

  if(EventHd == NULL)
    EventHd = newentry;
  else {
    /* prev always points at the link field that would have to be updated
       to insert before the current `event' */
    for (event = EventHd, prev=(rtsEvent**)&EventHd;
         event != NULL;
         prev = (rtsEvent**)&(event->next), event = event->next) {
      switch (evttype) {
        case FindWork: if ( event->time < newentry->time ||
                            ( (event->time == newentry->time) &&
                              (event->evttype != ContinueThread) ) )
                         continue;
                       else
                         break;
        case ContinueThread: if ( event->time <= newentry->time )
                               continue;
                             else
                               break;
        default: if ( event->time < newentry->time ||
                      ((event->time == newentry->time) &&
                       (event->evttype == newentry->evttype)) )
                   continue;
                 else
                   break;
      }
      /* Insert newentry here (i.e. before event) */
      *prev = newentry;
      newentry->next = event;
      break;
    }
    /* fell off the end of the queue: append */
    if (event == NULL)
      *prev = newentry;
  }
}
+
+//@cindex new_event
+void
+new_event(proc,creator,time,evttype,tso,node,spark)
+PEs proc, creator;
+rtsTime time;
+rtsEventType evttype;
+StgTSO *tso;
+StgClosure *node;
+rtsSpark *spark;
+{
+ rtsEvent *newentry = (rtsEvent *) stgMallocBytes(sizeof(rtsEvent), "new_event");
+
+ newentry->proc = proc;
+ newentry->creator = creator;
+ newentry->time = time;
+ newentry->evttype = evttype;
+ newentry->tso = tso;
+ newentry->node = node;
+ newentry->spark = spark;
+ newentry->gc_info = 0;
+ newentry->next = NULL;
+
+ insert_event(newentry);
+
+ IF_DEBUG(gran,
+ fprintf(stderr, "GRAN: new_event: \n");
+ print_event(newentry));
+}
+
+//@cindex prepend_event
+void
+prepend_event(event) /* put event at beginning of EventQueue */
+rtsEvent *event;
+{ /* only used for GC! */
+ event->next = EventHd;
+ EventHd = event;
+}
+
+//@cindex grab_event
+rtsEventQ
+grab_event(void) /* undo prepend_event i.e. get the event */
+{ /* at the head of EventQ but don't free anything */
+ rtsEventQ event = EventHd;
+
+ if (EventHd == NULL) {
+ barf("No next event (in grab_event). This may be caused by a circular data dependency in the program.");
+ }
+
+ EventHd = EventHd->next;
+ return (event);
+}
+
//@cindex traverse_eventq_for_gc
/* Walk the event queue before a GC and turn every FetchReply (which carries
   a malloc'ed pack buffer of closures) back into a FetchNode for the
   original closure.  The graph in flight is thus re-fetched after GC instead
   of being traced -- slightly inaccurate timing-wise, but safe for GC.
   Only needed for GUM-style bulk fetching (no-op otherwise). */
void
traverse_eventq_for_gc(void)
{
  rtsEventQ event = EventHd;
  StgWord bufsize;
  StgClosure *closurep;
  StgTSO *tsop;
  StgPtr buffer, bufptr;
  PEs proc, creator;

  /* Traverse eventq and replace every FETCHREPLY by a FETCHNODE for the
     orig closure (root of packed graph). This means that a graph, which is
     between processors at the time of GC is fetched again at the time when
     it would have arrived, had there been no GC. Slightly inaccurate but
     safe for GC.
     This is only needed for GUM style fetching. -- HWL */
  if (!RtsFlags.GranFlags.DoBulkFetching)
    return;

  for(event = EventHd; event!=NULL; event=event->next) {
    if (event->evttype==FetchReply) {
      buffer = stgCast(StgPtr,event->node);
      /* pack-buffer layout: assumes fixed word offsets PACK_FLAG_LOCN,
         PACK_SIZE_LOCN, PACK_TSO_LOCN in the header, payload starting at
         PACK_HDR_SIZE -- TODO confirm against the packing code */
      ASSERT(buffer[PACK_FLAG_LOCN]==MAGIC_PACK_FLAG);  /* It's a pack buffer */
      bufsize = buffer[PACK_SIZE_LOCN];
      closurep = stgCast(StgClosure*,buffer[PACK_HDR_SIZE]);
      tsop = stgCast(StgTSO*,buffer[PACK_TSO_LOCN]);
      proc = event->proc;
      creator = event->creator;  /* similar to unpacking */
      /* revert any revertible black holes (RBHs) contained in the buffer */
      for (bufptr=buffer+PACK_HDR_SIZE;
           bufptr<(buffer+bufsize);
           bufptr++) {
          // if ( (INFO_TYPE(INFO_PTR(*bufptr)) == INFO_SPEC_RBH_TYPE) ||
          //      (INFO_TYPE(INFO_PTR(*bufptr)) == INFO_GEN_RBH_TYPE) ) {
          if ( GET_INFO(stgCast(StgClosure*,bufptr)) ) {
            convertFromRBH(stgCast(StgClosure *,bufptr));
          }
      }
      free(buffer);
      /* turn the reply into a fresh request, swapping proc and creator */
      event->evttype = FetchNode;
      event->proc = creator;
      event->creator = proc;
      event->node = closurep;
      event->tso = tsop;
      event->gc_info = 0;
    }
  }
}
+
+void
+markEventQueue(void)
+{
+ StgClosure *MarkRoot(StgClosure *root); // prototype
+
+ rtsEventQ event = EventHd;
+ nat len;
+
+ /* iterate over eventq and register relevant fields in event as roots */
+ for(event = EventHd, len = 0; event!=NULL; event=event->next, len++) {
+ switch (event->evttype) {
+ case ContinueThread:
+ event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+ break;
+ case StartThread:
+ event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+ event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+ break;
+ case ResumeThread:
+ event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+ event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+ break;
+ case MoveSpark:
+ event->spark->node = (StgClosure *)MarkRoot((StgClosure *)event->spark->node);
+ break;
+ case MoveThread:
+ event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+ break;
+ case FindWork:
+ break;
+ case FetchNode:
+ event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+ event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+ break;
+ case FetchReply:
+ event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+ if (RtsFlags.GranFlags.DoBulkFetching)
+ // ToDo: traverse_eventw_for_gc if GUM-Fetching!!! HWL
+ belch("ghuH: packets in BulkFetching not marked as roots; mayb be fatal");
+ else
+ event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+ break;
+ case GlobalBlock:
+ event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+ event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+ break;
+ case UnblockThread:
+ event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+ event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+ break;
+ default:
+ barf("markEventQueue: trying to mark unknown event @ %p", event);
+ }}
+ IF_DEBUG(gc,
+ belch("GC: markEventQueue: %d events in queue", len));
+}
+
+/*
+ Prune all ContinueThread events related to tso or node in the eventq.
+ Currently used if a thread leaves STG land with ThreadBlocked status,
+ i.e. it blocked on a closure and has been put on its blocking queue. It
+ will be reawakended via a call to awakenBlockedQueue. Until then no
+ event effecting this tso should appear in the eventq. A bit of a hack,
+ because ideally we shouldn't generate such spurious ContinueThread events
+ in the first place.
+*/
+//@cindex prune_eventq
+void
+prune_eventq(tso, node)
+StgTSO *tso;
+StgClosure *node;
+{ rtsEventQ prev = (rtsEventQ)NULL, event = EventHd;
+
+ /* node unused for now */
+ ASSERT(node==NULL);
+ /* tso must be valid, then */
+ ASSERT(tso!=END_TSO_QUEUE);
+ while (event != NULL) {
+ if (event->evttype==ContinueThread &&
+ (event->tso==tso)) {
+ IF_GRAN_DEBUG(event_trace, // ToDo: use another debug flag
+ belch("prune_eventq: pruning ContinueThread event for TSO %d (%p) on PE %d @ %lx (%p)",
+ event->tso->id, event->tso, event->proc, event->time, event));
+ if (prev==(rtsEventQ)NULL) { // beginning of eventq
+ EventHd = event->next;
+ free(event);
+ event = EventHd;
+ } else {
+ prev->next = event->next;
+ free(event);
+ event = prev->next;
+ }
+ } else { // no pruning necessary; go to next event
+ prev = event;
+ event = event->next;
+ }
+ }
+}
+
+//@cindex print_event
+void
+print_event(event)
+rtsEvent *event;
+{
+ char str_tso[16], str_node[16];
+ StgThreadID tso_id;
+
+ if (event->tso==END_TSO_QUEUE) {
+ strcpy(str_tso, "______");
+ tso_id = 0;
+ } else {
+ sprintf(str_tso, "%p", event->tso);
+ tso_id = (event->tso==NULL) ? 0 : event->tso->id;
+ }
+ if (event->node==(StgClosure*)NULL) {
+ strcpy(str_node, "______");
+ } else {
+ sprintf(str_node, "%p", event->node);
+ }
+ // HWL: shouldn't be necessary; ToDo: nuke
+ //str_tso[6]='\0';
+ //str_node[6]='\0';
+
+ if (event==NULL)
+ fprintf(stderr,"Evt: NIL\n");
+ else
+ fprintf(stderr, "Evt: %s (%u), PE %u [%u], Time %lu, TSO %d (%s), Node %s\n", //"Evt: %s (%u), PE %u [%u], Time %u, TSO %s (%#l), Node %s\n",
+ event_names[event->evttype], event->evttype,
+ event->proc, event->creator, event->time,
+ tso_id, str_tso, str_node
+ /*, event->spark, event->next */ );
+
+}
+
+//@cindex print_eventq
+void
+print_eventq(hd)
+rtsEvent *hd;
+{
+ rtsEvent *x;
+
+ fprintf(stderr,"Event Queue with root at %p:\n", hd);
+ for (x=hd; x!=NULL; x=x->next) {
+ print_event(x);
+ }
+}
+
+/*
+ Spark queue functions are now all in Sparks.c!!
+*/
+//@node Scheduling functions, Thread Queue routines, Spark queue functions, GranSim specific code
+//@subsection Scheduling functions
+
+/*
+ These functions are variants of thread initialisation and therefore
+ related to initThread and friends in Schedule.c. However, they are
+ specific to a GranSim setup in storing more info in the TSO's statistics
+ buffer and sorting the thread queues etc.
+*/
+
+/*
+ A large portion of startThread deals with maintaining a sorted thread
+ queue, which is needed for the Priority Sparking option. Without that
+ complication the code boils down to FIFO handling.
+*/
+//@cindex insertThread
+void
+insertThread(tso, proc)
+StgTSO* tso;
+PEs proc;
+{
+ StgTSO *prev = NULL, *next = NULL;
+ nat count = 0;
+ rtsBool found = rtsFalse;
+
+ ASSERT(CurrentProc==proc);
+ ASSERT(!is_on_queue(tso,proc));
+ /* Idle proc: put the thread on the run queue
+ same for pri spark and basic version */
+ if (run_queue_hds[proc] == END_TSO_QUEUE)
+ {
+ /* too strong!
+ ASSERT((CurrentProc==MainProc &&
+ CurrentTime[MainProc]==0 &&
+ procStatus[MainProc]==Idle) ||
+ procStatus[proc]==Starting);
+ */
+ run_queue_hds[proc] = run_queue_tls[proc] = tso;
+
+ CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadqueuetime;
+
+ /* new_event of ContinueThread has been moved to do_the_startthread */
+
+ /* too strong!
+ ASSERT(procStatus[proc]==Idle ||
+ procStatus[proc]==Fishing ||
+ procStatus[proc]==Starting);
+ procStatus[proc] = Busy;
+ */
+ return;
+ }
+
+ if (RtsFlags.GranFlags.Light)
+ GranSimLight_insertThread(tso, proc);
+
+ /* Only for Pri Scheduling: find place where to insert tso into queue */
+ if (RtsFlags.GranFlags.DoPriorityScheduling && tso->gran.pri!=0)
+ /* {add_to_spark_queue}vo' jInIHta'; Qu' wa'DIch yIleghQo' */
+ for (prev = run_queue_hds[proc], next = run_queue_hds[proc]->link, count=0;
+ (next != END_TSO_QUEUE) &&
+ !(found = tso->gran.pri >= next->gran.pri);
+ prev = next, next = next->link, count++)
+ {
+ ASSERT((prev!=(StgTSO*)NULL || next==run_queue_hds[proc]) &&
+ (prev==(StgTSO*)NULL || prev->link==next));
+ }
+
+ ASSERT(!found || next != END_TSO_QUEUE);
+ ASSERT(procStatus[proc]!=Idle);
+
+ if (found) {
+ /* found can only be rtsTrue if pri scheduling enabled */
+ ASSERT(RtsFlags.GranFlags.DoPriorityScheduling);
+ if (RtsFlags.GranFlags.GranSimStats.Global)
+ globalGranStats.non_end_add_threads++;
+ /* Add tso to ThreadQueue between prev and next */
+ tso->link = next;
+ if ( next == (StgTSO*)END_TSO_QUEUE ) {
+ run_queue_tl = tso;
+ } else {
+ /* no back link for TSO chain */
+ }
+
+ if ( prev == (StgTSO*)END_TSO_QUEUE ) {
+ /* Never add TSO as first elem of thread queue; the first */
+ /* element should be the one that is currently running -- HWL */
+ IF_DEBUG(gran,
+ belch("GRAN: Qagh: NewThread (w/ PriorityScheduling): Trying to add TSO %p (PRI=%d) as first elem of threadQ (%p) on proc %u (@ %u)\n",
+ tso, tso->gran.pri, run_queue_hd, proc,
+ CurrentTime[proc]));
+ } else {
+ prev->link = tso;
+ }
+ } else { /* !found */ /* or not pri sparking! */
+ /* Add TSO to the end of the thread queue on that processor */
+ run_queue_tls[proc]->link = tso;
+ run_queue_tls[proc] = tso;
+ }
+ ASSERT(RtsFlags.GranFlags.DoPriorityScheduling || count==0);
+ CurrentTime[proc] += count * RtsFlags.GranFlags.Costs.pri_sched_overhead +
+ RtsFlags.GranFlags.Costs.threadqueuetime;
+
+ /* ToDo: check if this is still needed -- HWL
+ if (RtsFlags.GranFlags.DoThreadMigration)
+ ++SurplusThreads;
+
+ if (RtsFlags.GranFlags.GranSimStats.Full &&
+ !(( event_type == GR_START || event_type == GR_STARTQ) &&
+ RtsFlags.GranFlags.labelling) )
+ DumpRawGranEvent(proc, creator, event_type+1, tso, node,
+ tso->gran.sparkname, spark_queue_len(proc));
+ */
+
+# if defined(GRAN_CHECK)
+ /* Check if thread queue is sorted. Only for testing, really! HWL */
+ if ( RtsFlags.GranFlags.DoPriorityScheduling &&
+ (RtsFlags.GranFlags.Debug.sortedQ) ) {
+ rtsBool sorted = rtsTrue;
+ StgTSO *prev, *next;
+
+ if (run_queue_hds[proc]==END_TSO_QUEUE ||
+ run_queue_hds[proc]->link==END_TSO_QUEUE) {
+ /* just 1 elem => ok */
+ } else {
+ /* Qu' wa'DIch yIleghQo' (ignore first elem)! */
+ for (prev = run_queue_hds[proc]->link, next = prev->link;
+ (next != END_TSO_QUEUE) ;
+ prev = next, next = prev->link) {
+ ASSERT((prev!=(StgTSO*)NULL || next==run_queue_hds[proc]) &&
+ (prev==(StgTSO*)NULL || prev->link==next));
+ sorted = sorted &&
+ (prev->gran.pri >= next->gran.pri);
+ }
+ }
+ if (!sorted) {
+ fprintf(stderr,"Qagh: THREADQ on PE %d is not sorted:\n",
+ CurrentProc);
+ G_THREADQ(run_queue_hd,0x1);
+ }
+ }
+# endif
+}
+
+/*
+ insertThread, which is only used for GranSim Light, is similar to
+ startThread in that it adds a TSO to a thread queue. However, it assumes
+ that the thread queue is sorted by local clocks and it inserts the TSO at
+ the right place in the queue. Don't create any event, just insert.
+*/
+//@cindex GranSimLight_insertThread
+rtsBool
+GranSimLight_insertThread(tso, proc)
+StgTSO* tso;
+PEs proc;
+{
+ StgTSO *prev, *next;
+ nat count = 0;
+ rtsBool found = rtsFalse;
+
+ ASSERT(RtsFlags.GranFlags.Light);
+
+ /* In GrAnSim-Light we always have an idle `virtual' proc.
+ The semantics of the one-and-only thread queue is different here:
+ all threads in the queue are running (each on its own virtual processor);
+ the queue is only needed internally in the simulator to interleave the
+ reductions of the different processors.
+ The one-and-only thread queue is sorted by the local clocks of the TSOs.
+ */
+ ASSERT(run_queue_hds[proc] != END_TSO_QUEUE);
+ ASSERT(tso->link == END_TSO_QUEUE);
+
+ /* If only one thread in queue so far we emit DESCHEDULE in debug mode */
+ if (RtsFlags.GranFlags.GranSimStats.Full &&
+ (RtsFlags.GranFlags.Debug.checkLight) &&
+ (run_queue_hd->link == END_TSO_QUEUE)) {
+ DumpRawGranEvent(proc, proc, GR_DESCHEDULE,
+ run_queue_hds[proc], (StgClosure*)NULL,
+ tso->gran.sparkname, spark_queue_len(proc)); // ToDo: check spar_queue_len
+ // resched = rtsTrue;
+ }
+
+ /* this routine should only be used in a GrAnSim Light setup */
+ /* && CurrentProc must be 0 in GrAnSim Light setup */
+ ASSERT(RtsFlags.GranFlags.Light && CurrentProc==0);
+
+ /* Idle proc; same for pri spark and basic version */
+ if (run_queue_hd==END_TSO_QUEUE)
+ {
+ run_queue_hd = run_queue_tl = tso;
+ /* MAKE_BUSY(CurrentProc); */
+ return rtsTrue;
+ }
+
+ for (prev = run_queue_hds[proc], next = run_queue_hds[proc]->link, count = 0;
+ (next != END_TSO_QUEUE) &&
+ !(found = (tso->gran.clock < next->gran.clock));
+ prev = next, next = next->link, count++)
+ {
+ ASSERT((prev!=(StgTSO*)NULL || next==run_queue_hds[proc]) &&
+ (prev==(StgTSO*)NULL || prev->link==next));
+ }
+
+ /* found can only be rtsTrue if pri sparking enabled */
+ if (found) {
+ /* Add tso to ThreadQueue between prev and next */
+ tso->link = next;
+ if ( next == END_TSO_QUEUE ) {
+ run_queue_tls[proc] = tso;
+ } else {
+ /* no back link for TSO chain */
+ }
+
+ if ( prev == END_TSO_QUEUE ) {
+ run_queue_hds[proc] = tso;
+ } else {
+ prev->link = tso;
+ }
+ } else { /* !found */ /* or not pri sparking! */
+ /* Add TSO to the end of the thread queue on that processor */
+ run_queue_tls[proc]->link = tso;
+ run_queue_tls[proc] = tso;
+ }
+
+ if ( prev == END_TSO_QUEUE ) { /* new head of queue */
+ new_event(proc, proc, CurrentTime[proc],
+ ContinueThread,
+ tso, (StgClosure*)NULL, (rtsSpark*)NULL);
+ }
+ /*
+ if (RtsFlags.GranFlags.GranSimStats.Full &&
+ !(( event_type == GR_START || event_type == GR_STARTQ) &&
+ RtsFlags.GranFlags.labelling) )
+ DumpRawGranEvent(proc, creator, gr_evttype, tso, node,
+ tso->gran.sparkname, spark_queue_len(proc));
+ */
+ return rtsTrue;
+}
+
+/*
+ endThread is responsible for general clean-up after the thread tso has
+ finished. This includes emitting statistics into the profile etc.
+*/
+void
+endThread(StgTSO *tso, PEs proc)
+{
+ ASSERT(procStatus[proc]==Busy); // coming straight out of STG land
+ ASSERT(tso->what_next==ThreadComplete);
+ // ToDo: prune ContinueThreads for this TSO from event queue
+ DumpEndEvent(proc, tso, rtsFalse /* not mandatory */);
+
+ /* if this was the last thread on this PE then make it Idle */
+ if (run_queue_hds[proc]==END_TSO_QUEUE) {
+ procStatus[CurrentProc] = Idle;
+ }
+}
+
+//@node Thread Queue routines, GranSim functions, Scheduling functions, GranSim specific code
+//@subsection Thread Queue routines
+
+/*
+ Check whether given tso resides on the run queue of the current processor.
+ Only used for debugging.
+*/
+
+//@cindex is_on_queue
+rtsBool
+is_on_queue (StgTSO *tso, PEs proc)
+{
+ StgTSO *t;
+ rtsBool found;
+
+ for (t=run_queue_hds[proc], found=rtsFalse;
+ t!=END_TSO_QUEUE && !(found = t==tso);
+ t=t->link)
+ /* nothing */ ;
+
+ return found;
+}
+
+/* This routine is only used for keeping a statistics of thread queue
+ lengths to evaluate the impact of priority scheduling. -- HWL
+ {spark_queue_len}vo' jInIHta'
+*/
+//@cindex thread_queue_len
+nat
+thread_queue_len(PEs proc)
+{
+ StgTSO *prev, *next;
+ nat len;
+
+ for (len = 0, prev = END_TSO_QUEUE, next = run_queue_hds[proc];
+ next != END_TSO_QUEUE;
+ len++, prev = next, next = prev->link)
+ {}
+
+ return (len);
+}
+
+//@node GranSim functions, GranSimLight routines, Thread Queue routines, GranSim specific code
+//@subsection GranSim functions
+
+/* ----------------------------------------------------------------- */
+/* The main event handling functions; called from Schedule.c (schedule) */
+/* ----------------------------------------------------------------- */
+
//@cindex do_the_globalblock
/* Event handler for GlobalBlock: the thread tso on PE proc blocked on the
   remote closure node.  Delegates the actual blocking to blockFetch; only
   valid with GUM-style bulk fetching and never in GranSim Light mode. */
void
do_the_globalblock(rtsEvent* event)
{
  PEs proc = event->proc;            /* proc that requested node */
  StgTSO *tso = event->tso;          /* tso that requested node */
  StgClosure *node = event->node;    /* requested, remote node */

  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the GlobalBlock\n"));
  /* There should be no GLOBALBLOCKs in GrAnSim Light setup */
  ASSERT(!RtsFlags.GranFlags.Light);
  /* GlobalBlock events only valid with GUM fetching */
  ASSERT(RtsFlags.GranFlags.DoBulkFetching);

  IF_GRAN_DEBUG(bq, // globalBlock,
    if (IS_LOCAL_TO(PROCS(node),proc)) {
      belch("## Qagh: GlobalBlock: Blocking TSO %d (%p) on LOCAL node %p (PE %d).\n",
            tso->id, tso, node, proc);
    });

  /* CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.munpacktime; */
  if ( blockFetch(tso,proc,node) != 0 )
    return;  /* node has become local by now */

#if 0
 ToDo: check whether anything has to be done at all after blockFetch -- HWL

  if (!RtsFlags.GranFlags.DoAsyncFetch) { /* head of queue is next thread */
    StgTSO* tso = run_queue_hds[proc];       /* awaken next thread */
    if (tso != (StgTSO*)NULL) {
      new_event(proc, proc, CurrentTime[proc],
                ContinueThread,
                tso, (StgClosure*)NULL, (rtsSpark*)NULL);
      CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadcontextswitchtime;
      if(RtsFlags.GranFlags.GranSimStats.Full)
        DumpRawGranEvent(proc, CurrentProc, GR_SCHEDULE, tso,
                         (StgClosure*)NULL, tso->gran.sparkname, spark_queue_len(CurrentProc));  // ToDo: check sparkname and spar_queue_len
      procStatus[proc] = Busy;                  /* might have been fetching */
    } else {
      procStatus[proc] = Idle;                     /* no work on proc now */
    }
  } else {  /* RtsFlags.GranFlags.DoAsyncFetch i.e. block-on-fetch */
	      /* other thread is already running */
	      /* 'oH 'utbe' 'e' vIHar ; I think that's not needed -- HWL
	      new_event(proc,proc,CurrentTime[proc],
		       CONTINUETHREAD,EVENT_TSO(event),
		       (RtsFlags.GranFlags.DoBulkFetching ? closure :
		       EVENT_NODE(event)),NULL);
	      */
  }
#endif
}
+
//@cindex do_the_unblock
/* Event handler for UnblockThread: either a FetchReply has arrived or a
   blocking queue has been awakened.  In the synchronous (block-on-fetch)
   case the block time is charged to the TSO's statistics; in all cases a
   ResumeThread event is scheduled after the thread-queueing cost. */
void
do_the_unblock(rtsEvent* event)
{
  PEs proc = event->proc,            /* proc that requested node */
      creator = event->creator;      /* proc that requested node */
  StgTSO* tso = event->tso;          /* tso that requested node */
  StgClosure* node = event->node;    /* requested, remote node */

  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the UnBlock\n"))
  /* There should be no UNBLOCKs in GrAnSim Light setup */
  ASSERT(!RtsFlags.GranFlags.Light);
  /* UnblockThread means either FetchReply has arrived or
     a blocking queue has been awakened;
     ToDo: check with assertions
  ASSERT(procStatus[proc]==Fetching || IS_BLACK_HOLE(event->node));
  */
  if (!RtsFlags.GranFlags.DoAsyncFetch) { /* block-on-fetch */
    /* We count block-on-fetch as normal block time */
    tso->gran.blocktime += CurrentTime[proc] - tso->gran.blockedat;
    /* Dumping now done when processing the event
       No costs for contextswitch or thread queueing in this case
    if (RtsFlags.GranFlags.GranSimStats.Full)
      DumpRawGranEvent(proc, CurrentProc, GR_RESUME, tso,
                       (StgClosure*)NULL, tso->gran.sparkname, spark_queue_len(CurrentProc));
    */
    /* Maybe do this in FetchReply already
    if (procStatus[proc]==Fetching)
      procStatus[proc] = Busy;
    */
    /*
    new_event(proc, proc, CurrentTime[proc],
              ContinueThread,
              tso, node, (rtsSpark*)NULL);
    */
  } else {
    /* Asynchr comm causes additional costs here: */
    /* Bring the TSO from the blocked queue into the threadq */
  }
  /* In all cases, the UnblockThread causes a ResumeThread to be scheduled */
  new_event(proc, proc,
            CurrentTime[proc]+RtsFlags.GranFlags.Costs.threadqueuetime,
            ResumeThread,
            tso, node, (rtsSpark*)NULL);
}
+
//@cindex do_the_fetchnode
/* Event handler for FetchNode: PE proc holds the node requested by PE
   creator; hand the request to handleFetchRequest, retrying after a
   garbage collection if it runs out of heap while packing. */
void
do_the_fetchnode(rtsEvent* event)
{
  PEs proc = event->proc,          /* proc that holds the requested node */
      creator = event->creator;    /* proc that requested node */
  StgTSO* tso = event->tso;
  StgClosure* node = event->node;  /* requested, remote node */
  rtsFetchReturnCode rc;

  ASSERT(CurrentProc==proc);
  /* There should be no FETCHNODEs in GrAnSim Light setup */
  ASSERT(!RtsFlags.GranFlags.Light);

  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the FetchNode\n"));

  CurrentTime[proc] += RtsFlags.GranFlags.Costs.munpacktime;

  /* ToDo: check whether this is the right place for dumping the event */
  if (RtsFlags.GranFlags.GranSimStats.Full)
    DumpRawGranEvent(creator, proc, GR_FETCH, tso, node, (StgInt)0, 0);

  do {
    rc = handleFetchRequest(node, proc, creator, tso);
    if (rc == OutOfHeap) {                                   /* trigger GC */
# if defined(GRAN_CHECK)  && defined(GRAN)
      if (RtsFlags.GcFlags.giveStats)
	fprintf(RtsFlags.GcFlags.statsFile,"***** veQ boSwI'  PackNearbyGraph(node %p, tso %p (%d))\n",
		node, tso, tso->id);
# endif
      /* NB: barf aborts here; the GC retry code below is currently dead
         until the GC integration (see ToDo) is finished */
      barf("//// do_the_fetchnode: out of heap after handleFetchRequest; ToDo: call GarbageCollect()");
      prepend_event(event);
      GarbageCollect(GetRoots, rtsFalse);
      // HWL: ToDo: check whether a ContinueThread has to be issued
      // HWL old: ReallyPerformThreadGC(PACK_HEAP_REQUIRED, rtsFalse);
# if 0 && defined(GRAN_CHECK)  && defined(GRAN)
      if (RtsFlags.GcFlags.giveStats) {
	fprintf(RtsFlags.GcFlags.statsFile,"***** SAVE_Hp=%p, SAVE_HpLim=%p, PACK_HEAP_REQUIRED=%d\n",
		Hp, HpLim, 0) ; // PACK_HEAP_REQUIRED);  ???
	fprintf(stderr,"***** No. of packets so far: %d (total size: %d)\n",
		globalGranStats.tot_packets, globalGranStats.tot_packet_size);
      }
# endif
      event = grab_event();
      // Hp -= PACK_HEAP_REQUIRED; // ???

      /* GC knows that events are special and follows the pointer i.e. */
      /* events are valid even if they moved. An EXIT is triggered */
      /* if there is not enough heap after GC. */
    }
  } while (rc == OutOfHeap);
}
+
//@cindex do_the_fetchreply
/* Event handler for FetchReply: the data requested from PE creator has
   arrived at PE proc.  With bulk fetching the pack buffer is unpacked;
   with incremental fetching the single node is copied/moved, and a fetch
   miss (node moved on in the meantime) re-issues a FetchNode chasing it.
   On success a ResumeThread event is scheduled for the requesting TSO. */
void
do_the_fetchreply(rtsEvent* event)
{
  PEs proc = event->proc,          /* proc that requested node */
      creator = event->creator;    /* proc that holds the requested node */
  StgTSO* tso = event->tso;
  StgClosure* node = event->node;  /* requested, remote node */
  StgClosure* closure=(StgClosure*)NULL;

  ASSERT(CurrentProc==proc);
  ASSERT(RtsFlags.GranFlags.DoAsyncFetch || procStatus[proc]==Fetching);

  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the FetchReply\n"));
  /* There should be no FETCHREPLYs in GrAnSim Light setup */
  ASSERT(!RtsFlags.GranFlags.Light);

  /* assign message unpack costs *before* dumping the event */
  CurrentTime[proc] += RtsFlags.GranFlags.Costs.munpacktime;

  /* ToDo: check whether this is the right place for dumping the event */
  if (RtsFlags.GranFlags.GranSimStats.Full)
    DumpRawGranEvent(proc, creator, GR_REPLY, tso, node,
                      tso->gran.sparkname, spark_queue_len(proc));

  /* THIS SHOULD NEVER HAPPEN
     If tso is in the BQ of node this means that it actually entered the
     remote closure, due to a missing GranSimFetch at the beginning of the
     entry code; therefore, this is actually a faked fetch, triggered from
     within GranSimBlock;
     since tso is both in the EVQ and the BQ for node, we have to take it out
     of the BQ first before we can handle the FetchReply;
     ToDo: special cases in awakenBlockedQueue, since the BQ magically moved.
  */
  if (tso->block_info.closure!=(StgClosure*)NULL) {
    IF_GRAN_DEBUG(bq,
		  belch("## ghuH: TSO %d (%p) in FetchReply is blocked on node %p (shouldn't happen AFAIK)",
			tso->id, tso, node));
    // unlink_from_bq(tso, node);
  }

  if (RtsFlags.GranFlags.DoBulkFetching) {      /* bulk (packet) fetching */
    rtsPackBuffer *buffer = (rtsPackBuffer*)node;
    nat size = buffer->size;

    /* NB: Fetch misses can't occur with GUM fetching, as */
    /* updatable closure are turned into RBHs and therefore locked */
    /* for other processors that try to grab them. */

    closure = UnpackGraph(buffer);
    CurrentTime[proc] += size * RtsFlags.GranFlags.Costs.munpacktime;
  } else   // incremental fetching
    /* Copy or move node to CurrentProc */
    if (fetchNode(node, creator, proc)) {
      /* Fetch has failed i.e. node has been grabbed by another PE */
      PEs p = where_is(node);
      rtsTime fetchtime;

      if (RtsFlags.GranFlags.GranSimStats.Global)
	globalGranStats.fetch_misses++;

      IF_GRAN_DEBUG(thunkStealing,
		 belch("== Qu'vatlh! fetch miss @ %u: node %p is at proc %u (rather than proc %u)\n",
		       CurrentTime[proc],node,p,creator));

      CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mpacktime;

      /* Count fetch again !? */
      ++(tso->gran.fetchcount);
      tso->gran.fetchtime += RtsFlags.GranFlags.Costs.fetchtime;

      fetchtime = stg_max(CurrentTime[CurrentProc],CurrentTime[p]) +
                  RtsFlags.GranFlags.Costs.latency;

      /* Chase the grabbed node */
      new_event(p, proc, fetchtime,
                FetchNode,
                tso, node, (rtsSpark*)NULL);

# if 0 && defined(GRAN_CHECK) && defined(GRAN) /* Just for testing */
       IF_GRAN_DEBUG(blockOnFetch,
		     BlockedOnFetch[CurrentProc] = tso;) /*-rtsTrue;-*/

       IF_GRAN_DEBUG(blockOnFetch_sanity,
		     tso->type |= FETCH_MASK_TSO;)
# endif

      CurrentTime[proc] += RtsFlags.GranFlags.Costs.mtidytime;

      return; /* NB: no REPLy has been processed; tso still sleeping */
    }

    /* -- Qapla'! Fetch has been successful; node is here, now  */
    ++(event->tso->gran.fetchcount);
    event->tso->gran.fetchtime += RtsFlags.GranFlags.Costs.fetchtime;

    /* this is now done at the beginning of this routine
    if (RtsFlags.GranFlags.GranSimStats.Full)
       DumpRawGranEvent(proc,event->creator, GR_REPLY, event->tso,
			(RtsFlags.GranFlags.DoBulkFetching ?
			       closure :
			       event->node),
                        tso->gran.sparkname, spark_queue_len(proc));
    */

    ASSERT(OutstandingFetches[proc] > 0);
    --OutstandingFetches[proc];
    /* resume the requesting thread with the arrived data */
    new_event(proc, proc, CurrentTime[proc],
	      ResumeThread,
	      event->tso, (RtsFlags.GranFlags.DoBulkFetching ?
			   closure :
			   event->node),
	      (rtsSpark*)NULL);
}
+
//@cindex do_the_movethread
/* Event handler for MoveThread: a migrated TSO arrives at PE proc (stolen
   from PE creator).  Charges unpack costs, inserts the thread into proc's
   run queue and updates fishing status and migration statistics.
   Only valid with thread migration (-bM) enabled. */
void
do_the_movethread(rtsEvent* event) {
  PEs proc = event->proc,          /* proc that requested node */
      creator = event->creator;    /* proc that holds the requested node */
  StgTSO* tso = event->tso;

  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the MoveThread\n"));

  ASSERT(CurrentProc==proc);
  /* There should be no MOVETHREADs in GrAnSim Light setup */
  ASSERT(!RtsFlags.GranFlags.Light);
  /* MOVETHREAD events should never occur without -bM */
  ASSERT(RtsFlags.GranFlags.DoThreadMigration);
  /* Bitmask of moved thread should be 0 */
  ASSERT(PROCS(tso)==0);
  ASSERT(procStatus[proc] == Fishing ||
	 RtsFlags.GranFlags.DoAsyncFetch);
  ASSERT(OutstandingFishes[proc]>0);

  /* ToDo: exact costs for unpacking the whole TSO  */
  CurrentTime[proc] +=  5l * RtsFlags.GranFlags.Costs.munpacktime;

  /* ToDo: check whether this is the right place for dumping the event */
  if (RtsFlags.GranFlags.GranSimStats.Full)
    DumpRawGranEvent(proc, creator,
		     GR_STOLEN, tso, (StgClosure*)NULL, (StgInt)0, 0);

  // ToDo: check cost functions
  --OutstandingFishes[proc];
  SET_GRAN_HDR(tso, ThisPE);         // adjust the bitmask for the TSO
  insertThread(tso, proc);

  if (procStatus[proc]==Fishing)
    procStatus[proc] = Idle;

  if (RtsFlags.GranFlags.GranSimStats.Global)
    globalGranStats.tot_TSOs_migrated++;
}
+
//@cindex do_the_movespark
/* Event handler for MoveSpark: a stolen spark arrives at PE proc (from PE
   creator).  Charges unpack costs, records spark-profiling and global
   statistics, adds the spark to proc's spark queue and clears the PE's
   fishing status so handleIdlePEs can turn the spark into a thread. */
void
do_the_movespark(rtsEvent* event) {
  PEs proc = event->proc,          /* proc that requested spark */
      creator = event->creator;    /* proc that holds the requested spark */
  StgTSO* tso = event->tso;
  rtsSparkQ spark = event->spark;

  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the MoveSpark\n"))

  ASSERT(CurrentProc==proc);
  ASSERT(spark!=NULL);
  ASSERT(procStatus[proc] == Fishing ||
	 RtsFlags.GranFlags.DoAsyncFetch);
  ASSERT(OutstandingFishes[proc]>0);

  CurrentTime[proc] += RtsFlags.GranFlags.Costs.munpacktime;

  /* record movement of spark only if spark profiling is turned on */
  if (RtsFlags.GranFlags.GranSimStats.Sparks)
    DumpRawGranEvent(proc, creator,
		     SP_ACQUIRED,
		     tso, spark->node, spark->name, spark_queue_len(proc));

  /* global statistics */
  if ( RtsFlags.GranFlags.GranSimStats.Global &&
       !closure_SHOULD_SPARK(spark->node))
    globalGranStats.withered_sparks++;
    /* Not adding the spark to the spark queue would be the right */
    /* thing here, but it also would be cheating, as this info can't be */
    /* available in a real system. -- HWL */

  --OutstandingFishes[proc];

  add_to_spark_queue(spark);

  IF_GRAN_DEBUG(randomSteal, // ToDo: spark-distribution flag
		print_sparkq_stats());

  /* Should we treat stolen sparks specially? Currently, we don't. */

  if (procStatus[proc]==Fishing)
    procStatus[proc] = Idle;

  /* add_to_spark_queue will increase the time of the current proc. */
  /*
    If proc was fishing, it is Idle now with the new spark in its spark
    pool. This means that the next time handleIdlePEs is called, a local
    FindWork will be created on this PE to turn the spark into a thread. Of
    course another PE might steal the spark in the meantime (that's why we
    are using events rather than inlining all the operations in the first
    place). */
}
+
+/*
+ In the Constellation class version of GranSim the semantics of StarThread
+ events has changed. Now, StartThread has to perform 3 basic operations:
+ - create a new thread (previously this was done in ActivateSpark);
+ - insert the thread into the run queue of the current processor
+ - generate a new event for actually running the new thread
+ Note that the insertThread is called via createThread.
+*/
+
+//@cindex do_the_startthread
+
+void
+do_the_startthread(rtsEvent *event)
+{
+ PEs proc = event->proc; /* proc that requested node */
+ StgTSO *tso = event->tso; /* tso that requested node */
+ StgClosure *node = event->node; /* requested, remote node */
+ rtsSpark *spark = event->spark;
+ GranEventType gr_evttype;
+
+ ASSERT(CurrentProc==proc);
+ ASSERT(!RtsFlags.GranFlags.Light || CurrentProc==0);
+ ASSERT(event->evttype == ResumeThread || event->evttype == StartThread);
+ /* if this was called via StartThread: */
+ ASSERT(event->evttype!=StartThread || tso == END_TSO_QUEUE); // not yet created
+ // ToDo: check: ASSERT(event->evttype!=StartThread || procStatus[proc]==Starting);
+ /* if this was called via ResumeThread: */
+ ASSERT(event->evttype!=ResumeThread ||
+ RtsFlags.GranFlags.DoAsyncFetch ||!is_on_queue(tso,proc));
+
+ /* startThread may have been called from the main event handler upon
+ finding either a ResumeThread or a StartThread event; set the
+ gr_evttype (needed for writing to .gr file) accordingly */
+ // gr_evttype = (event->evttype == ResumeThread) ? GR_RESUME : GR_START;
+
+ if ( event->evttype == StartThread ) {
+ GranEventType gr_evttype = (run_queue_hds[proc]==END_TSO_QUEUE) ?
+ GR_START : GR_STARTQ;
+
+ tso = createThread(BLOCK_SIZE_W, spark->gran_info);// implicit insertThread!
+ pushClosure(tso, node);
+
+ // ToDo: fwd info on local/global spark to thread -- HWL
+ // tso->gran.exported = spark->exported;
+ // tso->gran.locked = !spark->global;
+ tso->gran.sparkname = spark->name;
+
+ ASSERT(CurrentProc==proc);
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ DumpGranEvent(gr_evttype,tso);
+
+ CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadcreatetime;
+ } else { // event->evttype == ResumeThread
+ GranEventType gr_evttype = (run_queue_hds[proc]==END_TSO_QUEUE) ?
+ GR_RESUME : GR_RESUMEQ;
+
+ insertThread(tso, proc);
+
+ ASSERT(CurrentProc==proc);
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ DumpGranEvent(gr_evttype,tso);
+ }
+
+ ASSERT(run_queue_hds[proc]!=END_TSO_QUEUE); // non-empty run queue
+ procStatus[proc] = Busy;
+ /* make sure that this thread is actually run */
+ new_event(proc, proc,
+ CurrentTime[proc],
+ ContinueThread,
+ tso, node, (rtsSpark*)NULL);
+
+ /* A wee bit of statistics gathering */
+ if (RtsFlags.GranFlags.GranSimStats.Global) {
+ globalGranStats.tot_add_threads++;
+ globalGranStats.tot_tq_len += thread_queue_len(CurrentProc);
+ }
+
+}
+
+//@cindex do_the_findwork
+
+/*
+   Handle a FindWork event on processor event->proc: search the local spark
+   pool for a runnable spark and, if one is found, activate it.  If nothing
+   is found, either turn the PE idle (local FindWork, creator==proc) or
+   forward the fish to another PE (GUM-style fishing, creator!=proc).
+*/
+void
+do_the_findwork(rtsEvent* event)
+{
+  PEs proc = event->proc,       /* proc to search for work */
+      creator = event->creator; /* proc that requested work */
+  rtsSparkQ spark = event->spark;
+  /* ToDo: check that this size is safe -- HWL */
+#if 0
+  ToDo: check available heap
+
+  nat req_heap = sizeofW(StgTSO) + MIN_STACK_WORDS;
+                 // add this? -- HWL:RtsFlags.ConcFlags.stkChunkSize;
+#endif
+
+  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the Findwork\n"));
+
+  /* If GUM style fishing is enabled, the contents of the spark field says
+     what to steal (spark(1) or thread(2)); */
+  ASSERT(!(RtsFlags.GranFlags.Fishing && event->spark==(rtsSpark*)0));
+
+  /* Make sure that we have enough heap for creating a new
+     thread. This is a conservative estimate of the required heap.
+     This eliminates special checks for GC around NewThread within
+     ActivateSpark.                                                 */
+
+#if 0
+  ToDo: check available heap
+
+  if (Hp + req_heap > HpLim ) {
+    IF_DEBUG(gc,
+             belch("GC: Doing GC from within Findwork handling (that's bloody dangerous if you ask me)");)
+      GarbageCollect(GetRoots);
+      // ReallyPerformThreadGC(req_heap, rtsFalse);   old -- HWL
+      Hp -= req_heap;
+      if (procStatus[CurrentProc]==Sparking)
+        procStatus[CurrentProc]=Idle;
+      return;
+  }
+#endif
+
+  /* only search for work if the PE may create a thread at all (always with
+     DoAlwaysCreateThreads or Fishing; otherwise only when Idle/Sparking
+     and not gagged by outstanding fetches under FetchStrategy < 2) */
+  if ( RtsFlags.GranFlags.DoAlwaysCreateThreads ||
+       RtsFlags.GranFlags.Fishing ||
+       ((procStatus[proc]==Idle || procStatus[proc]==Sparking) &&
+        (RtsFlags.GranFlags.FetchStrategy >= 2 ||
+         OutstandingFetches[proc] == 0)) )
+   {
+    rtsBool found;
+    rtsSparkQ prev, spark;   /* NB: this `spark' shadows the outer one; `prev' is unused */
+
+    /* ToDo: check */
+    ASSERT(procStatus[proc]==Sparking ||
+           RtsFlags.GranFlags.DoAlwaysCreateThreads ||
+           RtsFlags.GranFlags.Fishing);
+
+    /* SImmoHwI' yInej!  Search spark queue! */
+    /* gimme_spark (event, &found, &spark); */
+    findLocalSpark(event, &found, &spark);
+
+    if (!found) { /* pagh vumwI' */
+      /*
+        If no spark has been found this can mean 2 things:
+         1/ The FindWork was a fish (i.e. a message sent by another PE) and
+            the spark pool of the receiver is empty
+            --> the fish has to be forwarded to another PE
+         2/ The FindWork was local to this PE (i.e. no communication; in this
+            case creator==proc) and the spark pool of the PE is not empty
+            contains only sparks of closures that should not be sparked
+            (note: if the spark pool were empty, handleIdlePEs wouldn't have
+            generated a FindWork in the first place)
+            --> the PE has to be made idle to trigger stealing sparks the next
+                time handleIdlePEs is performed
+      */
+
+      ASSERT(pending_sparks_hds[proc]==(rtsSpark*)NULL);
+      if (creator==proc) {
+        /* local FindWork */
+        if (procStatus[proc]==Busy) {
+          belch("ghuH: PE %d in Busy state while processing local FindWork (spark pool is empty!) @ %lx",
+                proc, CurrentTime[proc]);
+          procStatus[proc] = Idle;
+        }
+      } else {
+        /* global FindWork i.e. a Fish */
+        ASSERT(RtsFlags.GranFlags.Fishing);
+        /* actually this generates another request from the originating PE */
+        ASSERT(OutstandingFishes[creator]>0);
+        OutstandingFishes[creator]--;
+        /* ToDo: assign costs for sending fish to proc not to creator */
+        stealSpark(creator); /* might steal from same PE; ToDo: fix */
+        ASSERT(RtsFlags.GranFlags.maxFishes!=1 || procStatus[creator] == Fishing);
+        /* any assertions on state of proc possible here? */
+      }
+    } else {
+      /* DaH chu' Qu' yIchen! Now create new work! */
+      IF_GRAN_DEBUG(findWork,
+                    belch("+-  munching spark %p; creating thread for node %p",
+                          spark, spark->node));
+      /* NOTE(review): spark is dereferenced by the debug belch and passed to
+         activateSpark BEFORE the non-NULL assertion below -- the ASSERT
+         probably belongs before these uses; confirm before reordering */
+      activateSpark (event, spark);
+      ASSERT(spark != (rtsSpark*)NULL);
+      spark = delete_from_sparkq (spark, proc, rtsTrue);
+    }
+
+    IF_GRAN_DEBUG(findWork,
+                  belch("+-  Contents of spark queues at the end of FindWork @ %lx",
+                        CurrentTime[proc]);
+                  print_sparkq_stats());
+
+    /* ToDo: check ; not valid if GC occurs in ActivateSpark */
+    ASSERT(!found ||
+            /* forward fish  or */
+            (proc!=creator ||
+            /* local spark  or */
+             (proc==creator && procStatus[proc]==Starting)) ||
+           //(!found && procStatus[proc]==Idle) ||
+           RtsFlags.GranFlags.DoAlwaysCreateThreads);
+  } else {
+    IF_GRAN_DEBUG(findWork,
+                  belch("+-  RTS refuses to findWork on PE %d @ %lx",
+                        proc, CurrentTime[proc]);
+                  belch("  procStatus[%d]=%s, fetch strategy=%d, outstanding fetches[%d]=%d",
+                        proc, proc_status_names[procStatus[proc]],
+                        RtsFlags.GranFlags.FetchStrategy,
+                        proc, OutstandingFetches[proc]));
+  }
+}
+
+//@node GranSimLight routines, Code for Fetching Nodes, GranSim functions, GranSim specific code
+//@subsection GranSimLight routines
+
+/*
+   This code is called from the central scheduler after having grabbed a
+   new event and is only needed for GranSim-Light. It mainly adjusts the
+   ActiveTSO so that all costs that have to be assigned from within the
+   scheduler are assigned to the right TSO. The choice of ActiveTSO depends
+   on the type of event that has been found.
+
+   event      -- the event just taken from the event queue
+   ActiveTSOp -- in/out: the scheduler's notion of the currently active TSO;
+                 updated to the TSO selected for this event
+*/
+
+void
+GranSimLight_enter_system(event, ActiveTSOp)
+rtsEvent *event;
+StgTSO **ActiveTSOp;
+{
+  StgTSO *ActiveTSO = *ActiveTSOp;
+
+  ASSERT (RtsFlags.GranFlags.Light);
+
+  /* Restore local clock of the  virtual processor attached to CurrentTSO.
+     All costs will be associated to the `virt. proc' on which the tso
+     is living. */
+  if (ActiveTSO != NULL) {                     /* already in system area */
+    ActiveTSO->gran.clock = CurrentTime[CurrentProc];
+    if (RtsFlags.GranFlags.DoFairSchedule)
+      {
+        if (RtsFlags.GranFlags.GranSimStats.Full &&
+            RtsFlags.GranFlags.Debug.checkLight)
+          DumpGranEvent(GR_SYSTEM_END,ActiveTSO);
+      }
+  }
+  /* pick the TSO to which all scheduler costs for this event are charged */
+  switch (event->evttype)
+    {
+    case ContinueThread:
+    case FindWork:       /* inaccurate this way */
+      ActiveTSO = run_queue_hd;
+      break;
+    case ResumeThread:
+    case StartThread:
+    case MoveSpark:      /* has tso of virt proc in tso field of event */
+      ActiveTSO = event->tso;
+      break;
+    default: barf("Illegal event type %s (%d) in GrAnSim Light setup\n",
+                  event_names[event->evttype],event->evttype);
+    }
+  CurrentTime[CurrentProc] = ActiveTSO->gran.clock;
+  if (RtsFlags.GranFlags.DoFairSchedule) {
+      if (RtsFlags.GranFlags.GranSimStats.Full &&
+          RtsFlags.GranFlags.Debug.checkLight)
+        DumpGranEvent(GR_SYSTEM_START,ActiveTSO);
+  }
+  /* Propagate the newly chosen TSO back to the caller.  The previous
+     version only updated a local copy, so the scheduler's ActiveTSO was
+     never changed -- defeating the purpose of the StgTSO** parameter. */
+  *ActiveTSOp = ActiveTSO;
+}
+
+/*
+   Counterpart to GranSimLight_enter_system: called when the scheduler
+   leaves the "system area" again.  Saves the system-area clock into the
+   previously active TSO, clears ActiveTSO, and restores the clock of the
+   `virt. proc' that CurrentTSO lives on.
+
+   event      -- the event being processed (currently unused here)
+   ActiveTSOp -- in/out: the scheduler's ActiveTSO; cleared to NULL
+*/
+void
+GranSimLight_leave_system(event, ActiveTSOp)
+rtsEvent *event;
+StgTSO **ActiveTSOp;
+{
+  StgTSO *ActiveTSO = *ActiveTSOp;
+
+  ASSERT(RtsFlags.GranFlags.Light);
+
+  /* Save time of `virt. proc' which was active since last getevent and
+     restore time of `virt. proc' where CurrentTSO is living on. */
+  if(RtsFlags.GranFlags.DoFairSchedule) {
+    if (RtsFlags.GranFlags.GranSimStats.Full &&
+        RtsFlags.GranFlags.Debug.checkLight) // ToDo: clean up flags
+      DumpGranEvent(GR_SYSTEM_END,ActiveTSO);
+  }
+  ActiveTSO->gran.clock = CurrentTime[CurrentProc];
+  ActiveTSO = (StgTSO*)NULL;
+  /* Clear the caller's ActiveTSO as well: we are leaving the system area.
+     The previous version only NULLed a local copy, leaving the scheduler
+     with a stale ActiveTSO -- defeating the StgTSO** parameter. */
+  *ActiveTSOp = ActiveTSO;
+  CurrentTime[CurrentProc] = CurrentTSO->gran.clock;
+  if (RtsFlags.GranFlags.DoFairSchedule /* && resched */ ) {
+    // resched = rtsFalse;
+    if (RtsFlags.GranFlags.GranSimStats.Full &&
+        RtsFlags.GranFlags.Debug.checkLight)
+      DumpGranEvent(GR_SCHEDULE,run_queue_hd);
+  }
+  /*
+    if (TSO_LINK(ThreadQueueHd)!=PrelBase_Z91Z93_closure &&
+        (TimeOfNextEvent == 0 ||
+         TSO_CLOCK(TSO_LINK(ThreadQueueHd))+1000<TimeOfNextEvent)) {
+      new_event(CurrentProc,CurrentProc,TSO_CLOCK(TSO_LINK(ThreadQueueHd))+1000,
+                CONTINUETHREAD,TSO_LINK(ThreadQueueHd),PrelBase_Z91Z93_closure,NULL);
+      TimeOfNextEvent = get_time_of_next_event();
+    }
+  */
+}
+
+//@node Code for Fetching Nodes, Idle PEs, GranSimLight routines, GranSim specific code
+//@subsection Code for Fetching Nodes
+
+/*
+ The following GrAnSim routines simulate the fetching of nodes from a
+ remote processor. We use a 1 word bitmask to indicate on which processor
+ a node is lying. Thus, moving or copying a node from one processor to
+ another just requires an appropriate change in this bitmask (using
+ @SET_GA@). Additionally, the clocks have to be updated.
+
+ A special case arises when the node that is needed by processor A has
+ been moved from a processor B to a processor C between sending out a
+ @FETCH@ (from A) and its arrival at B. In that case the @FETCH@ has to
+ be forwarded to C. This is simulated by issuing another FetchNode event
+ on processor C with A as creator.
+*/
+
+/* ngoqvam che' {GrAnSim}! */
+
+/* Fetch node "node" to processor "p" */
+
+//@cindex fetchNode
+
+/*
+   Simulate fetching "node" from PE "from" to PE "to" (incremental, i.e.
+   single-closure, fetching only).  If the node is no longer on either PE
+   the fetch must be forwarded, which the caller learns via NodeHasMoved.
+   A node in head normal form is copied (its location bitmask gains "to");
+   anything else is moved (bitmask replaced by "to").  Returns Ok on a
+   successful copy/move.
+*/
+rtsFetchReturnCode
+fetchNode(node,from,to)
+StgClosure* node;
+PEs from, to;
+{
+  /* In case of RtsFlags.GranFlags.DoBulkFetching this fct should never be
+     entered! Instead, UnpackGraph is used in ReSchedule */
+  /* (the unused local `closure' of the old version has been removed) */
+
+  ASSERT(to==CurrentProc);
+  /* Should never be entered  in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+  /* fetchNode should never be entered with DoBulkFetching */
+  ASSERT(!RtsFlags.GranFlags.DoBulkFetching);
+
+  /* Now fetch the node */
+  if (!IS_LOCAL_TO(PROCS(node),from) &&   /* source PE no longer has it ... */
+      !IS_LOCAL_TO(PROCS(node),to) )      /* ... and neither do we */
+    return NodeHasMoved;
+
+  if (closure_HNF(node))                /* node already in head normal form? */
+    node->header.gran.procs |= PE_NUMBER(to);           /* Copy node */
+  else
+    node->header.gran.procs = PE_NUMBER(to);            /* Move node */
+
+  return Ok;
+}
+
+/*
+   Process a fetch request.
+
+   Cost of sending a packet of size n = C + P*n
+   where C = packet construction constant, 
+         P = cost of packing one word into a packet
+   [Should also account for multiple packets].
+
+   Three outcomes (return codes):
+    - NodeIsLocal:   the node meanwhile became local on the demander "from";
+                     an immediate FetchReply is scheduled;
+    - Ok/NodeIsBH/OutOfHeap: the node is still on "to"; it is packed (bulk
+                     fetching: a whole sub-graph, possibly blocking on a
+                     black hole; otherwise the single closure) and a
+                     FetchReply is scheduled after the simulated latency;
+    - NodeHasMoved:  the node moved to a third PE; the FetchNode event is
+                     forwarded there.
+*/
+
+//@cindex handleFetchRequest
+
+rtsFetchReturnCode
+handleFetchRequest(node,to,from,tso)
+StgClosure* node;   // the node which is requested
+PEs to, from;       // fetch request: from -> to
+StgTSO* tso;        // the tso which needs the node
+{
+  ASSERT(!RtsFlags.GranFlags.Light);
+  /* ToDo: check assertion */
+  ASSERT(OutstandingFetches[from]>0);
+
+  /* probably wrong place; */
+  ASSERT(CurrentProc==to);
+
+  if (IS_LOCAL_TO(PROCS(node), from)) /* Somebody else moved node already => */
+    {                                 /* start tso */
+      IF_GRAN_DEBUG(thunkStealing,
+                    fprintf(stderr,"ghuH: handleFetchRequest entered with local node %p (%s) (PE %d)\n", 
+                            node, info_type(node), from));
+
+      if (RtsFlags.GranFlags.DoBulkFetching) {
+        nat size;
+        rtsPackBuffer *graph;
+
+        /* Create a 1-node-buffer and schedule a FETCHREPLY now */
+        graph = PackOneNode(node, tso, &size); 
+        new_event(from, to, CurrentTime[to],
+                  FetchReply,
+                  tso, (StgClosure *)graph, (rtsSpark*)NULL);
+      } else {
+        new_event(from, to, CurrentTime[to],
+                  FetchReply,
+                  tso, node, (rtsSpark*)NULL);
+      }
+      IF_GRAN_DEBUG(thunkStealing,
+                    belch("== majQa'! closure %p is local on PE %d already (this is a good thing)", node, from));
+      return (NodeIsLocal);
+    }
+  else if (IS_LOCAL_TO(PROCS(node), to) )   /* Is node still here? */
+    {
+      if (RtsFlags.GranFlags.DoBulkFetching) { /* {GUM}vo' ngoqvam vInIHta' */
+        nat size;                              /* (code from GUM) */
+        StgClosure* graph;
+
+        if (IS_BLACK_HOLE(node)) {   /* block on BH or RBH */
+          new_event(from, to, CurrentTime[to],
+                    GlobalBlock,
+                    tso, node, (rtsSpark*)NULL);
+          /* Note: blockFetch is done when handling GLOBALBLOCK event; 
+             make sure the TSO stays out of the run queue */
+          /* When this thread is reawoken it does the usual: it tries to 
+             enter the updated node and issues a fetch if it's remote.
+             It has forgotten that it has sent a fetch already (i.e. a
+             FETCHNODE is swallowed by a BH, leaving the thread in a BQ) */
+          --OutstandingFetches[from];
+
+          IF_GRAN_DEBUG(thunkStealing,
+                        belch("== majQa'! closure %p on PE %d is a BH (demander=PE %d); faking a FMBQ", 
+                              node, to, from));
+          if (RtsFlags.GranFlags.GranSimStats.Global) {
+            globalGranStats.tot_FMBQs++;
+          }
+          return (NodeIsBH);
+        }
+
+        /* The tso requesting the node is blocked and cannot be on a run queue */
+        ASSERT(!is_on_queue(tso, from));
+
+        // ToDo: check whether graph is ever used as an rtsPackBuffer!!
+        if ((graph = (StgClosure *)PackNearbyGraph(node, tso, &size, 0)) == NULL) 
+          return (OutOfHeap);  /* out of heap */
+
+        /* Actual moving/copying of node is done on arrival; see FETCHREPLY */
+        /* Send a reply to the originator */
+        /* ToDo: Replace that by software costs for doing graph packing! */
+        CurrentTime[to] += size * RtsFlags.GranFlags.Costs.mpacktime;
+
+        new_event(from, to,
+                  CurrentTime[to]+RtsFlags.GranFlags.Costs.latency,
+                  FetchReply,
+                  tso, (StgClosure *)graph, (rtsSpark*)NULL);
+        
+        CurrentTime[to] += RtsFlags.GranFlags.Costs.mtidytime;
+        return (Ok);
+      } else {                   /* incremental (single closure) fetching */
+        /* Actual moving/copying of node is done on arrival; see FETCHREPLY */
+        /* Send a reply to the originator */
+        CurrentTime[to] += RtsFlags.GranFlags.Costs.mpacktime;
+
+        new_event(from, to,
+                  CurrentTime[to]+RtsFlags.GranFlags.Costs.latency,
+                  FetchReply,
+                  tso, node, (rtsSpark*)NULL);
+      
+        CurrentTime[to] += RtsFlags.GranFlags.Costs.mtidytime;
+        return (Ok);
+      }
+    }
+  else       /* Qu'vatlh! node has been grabbed by another proc => forward */
+    {    
+      PEs node_loc = where_is(node);
+      rtsTime fetchtime;
+
+      IF_GRAN_DEBUG(thunkStealing,
+                    belch("== Qu'vatlh! node %p has been grabbed by PE %d from PE %d (demander=%d) @ %d\n",
+                          node,node_loc,to,from,CurrentTime[to]));
+      if (RtsFlags.GranFlags.GranSimStats.Global) {
+        globalGranStats.fetch_misses++;
+      }
+
+      /* Prepare FORWARD message to proc p_new */
+      CurrentTime[to] += RtsFlags.GranFlags.Costs.mpacktime;
+      
+      /* send the forwarded fetch no earlier than both PEs' clocks allow */
+      fetchtime = stg_max(CurrentTime[to], CurrentTime[node_loc]) +
+                  RtsFlags.GranFlags.Costs.latency;
+          
+      new_event(node_loc, from, fetchtime,
+                FetchNode,
+                tso, node, (rtsSpark*)NULL);
+
+      CurrentTime[to] += RtsFlags.GranFlags.Costs.mtidytime;
+
+      return (NodeHasMoved);
+    }
+}
+
+/*
+   blockFetch blocks a BlockedFetch node on some kind of black hole.
+
+   Taken from gum/HLComms.lc.   [find a better place for that ?] -- HWL  
+
+   {\bf Note:} In GranSim we don't have @FETCHME@ nodes and therefore don't
+   create @FMBQ@'s (FetchMe blocking queues) to cope with global
+   blocking. Instead, non-local TSO are put into the BQ in the same way as
+   local TSOs. However, we have to check if a TSO is local or global in
+   order to account for the latencies involved and for keeping track of the
+   number of fetches that are really going on.
+
+   tso  -- TSO to block; proc -- PE it was running on; bh -- closure to
+   block on.  If bh turns out not to be a black hole any more, the TSO is
+   immediately unblocked instead (NodeIsNoBH); otherwise the TSO is
+   enqueued on bh's blocking queue and Ok is returned.
+*/
+
+//@cindex blockFetch
+
+rtsFetchReturnCode
+blockFetch(tso, proc, bh)
+StgTSO* tso;                        /* TSO which gets blocked */
+PEs proc;                           /* PE where that tso was running */
+StgClosure* bh;                     /* closure to block on (BH, RBH, BQ) */
+{
+  StgInfoTable *info;
+
+  IF_GRAN_DEBUG(bq,
+                fprintf(stderr,"## blockFetch: blocking TSO %p (%d)[PE %d] on node %p (%s) [PE %d]. No graph is packed!\n", 
+                tso, tso->id, proc, bh, info_type(bh), where_is(bh)));
+
+  if (!IS_BLACK_HOLE(bh)) {            /* catches BHs and RBHs */
+    IF_GRAN_DEBUG(bq,
+                  fprintf(stderr,"## blockFetch: node %p (%s) is not a BH => awakening TSO %p (%d) [PE %u]\n", 
+                          bh, info_type(bh), tso, tso->id, proc));
+
+    /* No BH anymore => immediately unblock tso */
+    new_event(proc, proc, CurrentTime[proc],
+              UnblockThread,
+              tso, bh, (rtsSpark*)NULL);
+
+    /* Is this always a REPLY to a FETCH in the profile ? */
+    if (RtsFlags.GranFlags.GranSimStats.Full)
+      DumpRawGranEvent(proc, proc, GR_REPLY, tso, bh, (StgInt)0, 0);
+    return (NodeIsNoBH);
+  }
+
+  /* DaH {BQ}Daq Qu' Suq 'e' wISov!
+     Now we know that we have to put the tso into the BQ.
+     2 cases: If block-on-fetch, tso is at head of threadq => 
+              => take it out of threadq and into BQ
+              If reschedule-on-fetch, tso is only pointed to be event
+              => just put it into BQ
+
+     ngoq ngo'!!
+     if (!RtsFlags.GranFlags.DoAsyncFetch) {
+       GranSimBlock(tso, proc, bh);
+     } else {
+       if (RtsFlags.GranFlags.GranSimStats.Full)
+         DumpRawGranEvent(proc, where_is(bh), GR_BLOCK, tso, bh, (StgInt)0, 0);
+       ++(tso->gran.blockcount);
+       tso->gran.blockedat = CurrentTime[proc];
+     }
+  */
+
+  /* after scheduling the GlobalBlock event the TSO is not put into the
+     run queue again; it is only pointed to via the event we are
+     processing now; in GranSim 4.xx there is no difference between
+     synchr and asynchr comm here */
+  ASSERT(!is_on_queue(tso, proc));
+  ASSERT(tso->link == END_TSO_QUEUE);
+
+  GranSimBlock(tso, proc, bh);   /* GranSim statistics gathering */
+
+  /* Now, put tso into BQ (similar to blocking entry codes) */
+  info = get_itbl(bh);
+  switch (info -> type) {
+    case RBH:
+    case BLACKHOLE:
+    case CAF_BLACKHOLE: // ToDo: check whether this is a possibly ITBL here
+    case SE_BLACKHOLE:   // ToDo: check whether this is a possibly ITBL here
+    case SE_CAF_BLACKHOLE:// ToDo: check whether this is a possibly ITBL here
+      /* basically an inlined version of BLACKHOLE_entry -- HWL */
+      /* Change the BLACKHOLE into a BLACKHOLE_BQ */
+      ((StgBlockingQueue *)bh)->header.info = &BLACKHOLE_BQ_info;
+      /* Put ourselves on the blocking queue for this black hole */
+      // tso->link=END_TSO_QUEUE;   not necessary; see assertion above
+      ((StgBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)tso;
+      tso->block_info.closure = bh;
+      recordMutable((StgMutClosure *)bh);
+      break;
+
+    case BLACKHOLE_BQ:
+      /* basically an inlined version of BLACKHOLE_BQ_entry -- HWL */
+      /* new TSO becomes the head of the blocking queue */
+      tso->link = (StgTSO *) (((StgBlockingQueue*)bh)->blocking_queue); 
+      ((StgBlockingQueue*)bh)->blocking_queue = (StgBlockingQueueElement *)tso;
+      recordMutable((StgMutClosure *)bh);
+
+# if 0 && defined(GC_MUT_REQUIRED)
+      ToDo: check whether recordMutable is necessary -- HWL
+      /*
+       * If we modify a black hole in the old generation, we have to make 
+       * sure it goes on the mutables list
+       */
+
+      if (bh <= StorageMgrInfo.OldLim) {
+        MUT_LINK(bh) = (W_) StorageMgrInfo.OldMutables;
+        StorageMgrInfo.OldMutables = bh;
+      } else
+        MUT_LINK(bh) = MUT_NOT_LINKED;
+# endif
+      break;
+
+    case FETCH_ME_BQ:
+      barf("Qagh: FMBQ closure (%p) found in GrAnSim (TSO=%p (%d))\n",
+           bh, tso, tso->id);
+
+    default:
+      {
+        G_PRINT_NODE(bh);
+        barf("Qagh: thought %p was a black hole (IP %p (%s))",
+             bh, info, info_type(bh));
+      }
+    }
+  return (Ok);
+}
+
+
+//@node Idle PEs, Routines directly called from Haskell world, Code for Fetching Nodes, GranSim specific code
+//@subsection Idle PEs
+
+/*
+   Export work to idle PEs. This function is called from @ReSchedule@
+   before dispatching on the current event. @HandleIdlePEs@ iterates over
+   all PEs, trying to get work for idle PEs. Note, that this is a
+   simplification compared to GUM's fishing model. We try to compensate for
+   that by making the cost for stealing work dependent on the number of
+   idle processors and thereby on the probability with which a randomly
+   sent fish would find work.
+*/
+
+//@cindex handleIdlePEs
+
+/*
+   For every Idle PE: schedule a local FindWork event if its own spark pool
+   is non-empty; otherwise (and if it may still send out fishes) try to
+   steal a thread and/or a spark from another PE, subject to the
+   DoStealThreadsFirst and FetchStrategy flags.
+*/
+void
+handleIdlePEs(void)
+{
+  PEs p;
+
+  IF_DEBUG(gran, fprintf(stderr, "GRAN: handling Idle PEs\n"))
+
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+
+  /* Could check whether there are idle PEs if it's a cheap check */
+  for (p = 0; p < RtsFlags.GranFlags.proc; p++) 
+    if (procStatus[p]==Idle)  /*  && IS_SPARKING(p) && IS_STARTING(p) */
+      /* First look for local work i.e. examine local spark pool! */
+      if (pending_sparks_hds[p]!=(rtsSpark *)NULL) {
+        new_event(p, p, CurrentTime[p],
+                  FindWork,
+                  (StgTSO*)NULL, (StgClosure*)NULL, (rtsSpark*)NULL);
+        procStatus[p] = Sparking;
+      } else if ((RtsFlags.GranFlags.maxFishes==0 ||
+                  OutstandingFishes[p]<RtsFlags.GranFlags.maxFishes) ) {
+
+        /* If no local work then try to get remote work! 
+           Qu' Hopbe' pagh tu'lu'pu'chugh Qu' Hop yISuq ! */
+        if (RtsFlags.GranFlags.DoStealThreadsFirst && 
+            (RtsFlags.GranFlags.FetchStrategy >= 4 || OutstandingFetches[p] == 0))
+          {
+            if (SurplusThreads > 0l)                    /* Steal a thread */
+              stealThread(p);
+          
+            /* NOTE(review): this `break' exits the loop over ALL PEs as
+               soon as one PE leaves the Idle state; `continue' (move on to
+               the next PE) may have been intended -- confirm before
+               changing the simulator's behaviour */
+            if (procStatus[p]!=Idle)
+              break;
+          }
+
+        if (SparksAvail > 0 &&
+            (RtsFlags.GranFlags.FetchStrategy >= 3 || OutstandingFetches[p] == 0)) /* Steal a spark */
+          stealSpark(p);
+
+        if (SurplusThreads > 0 && 
+            (RtsFlags.GranFlags.FetchStrategy >= 4 || OutstandingFetches[p] == 0)) /* Steal a thread */
+          stealThread(p);
+      }
+}
+
+/*
+ Steal a spark and schedule moving it to proc. We want to look at PEs in
+ clock order -- most retarded first. Currently sparks are only stolen
+ from the @ADVISORY_POOL@ never from the @REQUIRED_POOL@. Eventually,
+ this should be changed to first steal from the former then from the
+ latter.
+
+ We model a sort of fishing mechanism by counting the number of sparks
+ and threads we are currently stealing. */
+
+/*
+   Return a pseudo-random nat drawn from the half-open interval [from, to).
+   (For from==to the result is from itself.)
+*/
+static nat
+natRandom(from, to)
+nat from, to;
+{
+  nat span, result;
+
+  ASSERT(from<=to);
+  span = to - from;
+  /* random() yields a value in [0, RAND_MAX]; scale it into the span */
+  result = (nat) ((float)from + ((float)random()*(float)span)/(float)RAND_MAX);
+  if (result==to)
+    result = from;
+  ASSERT(from<=result && (result<to || from==to));
+  return result;
+}
+
+/*
+   Pick the PE to which a fish is sent (used for GUM style fishing only).
+   With RandomSteal enabled the victim is drawn uniformly from the full
+   range of PEs (it may even be proc itself); otherwise PE 0 is chosen.
+*/
+static PEs
+findRandomPE (proc)
+PEs proc;
+{
+  nat victim;
+
+  ASSERT(RtsFlags.GranFlags.Fishing);
+  victim = RtsFlags.GranFlags.RandomSteal
+             ? natRandom(0,RtsFlags.GranFlags.proc)  /* full range of PEs */
+             : 0;
+
+  IF_GRAN_DEBUG(randomSteal,
+                belch("^^ RANDOM_STEAL (fishing): stealing from PE %d (current proc is %d)",
+                      victim, proc));
+
+  return (PEs)victim;
+}
+
+/*
+   Magic code for stealing sparks/threads makes use of global knowledge on
+   spark queues.
+
+   Collect in pes_by_time[] all PEs (other than proc) that have a non-empty
+   spark pool and whose local clock is not ahead of the current PE's, sorted
+   by ascending local clock.  On return *np is the number of collected PEs
+   and *firstp the index of the first PE whose clock is later than proc's.
+*/
+static void
+sortPEsByTime (proc, pes_by_time, firstp, np) 
+PEs proc;
+PEs *pes_by_time;
+nat *firstp, *np;
+{
+  PEs p, temp, n, i, j;
+  nat first, upb, r=0;
+
+  ASSERT(!RtsFlags.GranFlags.Fishing);
+
+#if 0
+  upb = RtsFlags.GranFlags.proc;            /* full range of PEs */
+
+  if (RtsFlags.GranFlags.RandomSteal) {
+    r = natRandom(0,RtsFlags.GranFlags.proc);  /* full range of PEs */
+  } else {
+    r = 0;
+  }
+#endif
+
+  /* pes_by_time shall contain processors from which we may steal sparks */ 
+  for(n=0, p=0; p < RtsFlags.GranFlags.proc; ++p)
+    if ((proc != p) &&                       // not the current proc
+        (pending_sparks_hds[p] != (rtsSpark *)NULL) && // non-empty spark pool
+        (CurrentTime[p] <= CurrentTime[CurrentProc]))
+      pes_by_time[n++] = p;
+
+  /* sort pes_by_time (insertion-style pairwise exchange) */
+  for(i=0; i < n; ++i)
+    for(j=i+1; j < n; ++j)
+      if (CurrentTime[pes_by_time[i]] > CurrentTime[pes_by_time[j]]) {
+        /* swap the two PE ids.  NB: the temporary must hold a PEs value;
+           the old code shadowed `temp' with an rtsTime, storing a PE id in
+           a time-typed variable. */
+        temp = pes_by_time[i];
+        pes_by_time[i] = pes_by_time[j];
+        pes_by_time[j] = temp;
+      }
+
+  /* Choose random processor to steal spark from; first look at processors */
+  /* that are earlier than the current one (i.e. proc) */
+  for(first=0;
+      (first < n) && (CurrentTime[pes_by_time[first]] <= CurrentTime[proc]);
+      ++first)
+    /* nothing */ ;
+
+  /* if the assertion below is true we can get rid of first */
+  /* ASSERT(first==n); */
+  /* ToDo: check if first is really needed; find cleaner solution */
+
+  *firstp = first;
+  *np = n;
+}
+
+/*
+   Steal a spark (piece of work) from any processor and bring it to proc.
+   Returns rtsTrue iff a spark was stolen (or, with Fishing, a fish was
+   sent out).
+*/
+//@cindex stealSpark
+static rtsBool
+stealSpark(PEs proc) { return stealSomething(proc, rtsTrue, rtsFalse); }
+
+/*
+   Steal a thread from any processor and bring it to proc i.e. thread migration.
+   Returns rtsTrue iff a thread was stolen (or, with Fishing, a fish was
+   sent out).
+*/
+//@cindex stealThread
+static rtsBool 
+stealThread(PEs proc) { return stealSomething(proc, rtsFalse, rtsTrue); }
+
+/*
+   Steal a spark or a thread and schedule moving it to proc.
+
+   Without GUM-style fishing this delegates to the "magic" routines that
+   exploit global knowledge of all spark/thread queues; with fishing it
+   merely sends a fish (a FindWork event) to a random PE and always
+   reports success.
+
+   proc         -- PE that needs work (the stealer)
+   steal_spark  -- try to steal a spark (preferred over threads)
+   steal_thread -- try to steal a thread (requires DoThreadMigration)
+
+   Returns rtsTrue iff work was stolen or a fish was sent out.
+*/
+//@cindex stealSomething
+static rtsBool
+stealSomething(proc, steal_spark, steal_thread)
+PEs proc;                            // PE that needs work (stealer)
+rtsBool steal_spark, steal_thread;   // should a spark and/or thread be stolen
+{
+  PEs p;
+  rtsTime fish_arrival_time;
+  /* (unused locals spark/prev/next/stolen of the old version removed) */
+
+  ASSERT(steal_spark || steal_thread);
+
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+  ASSERT(!steal_thread || RtsFlags.GranFlags.DoThreadMigration);
+
+  if (!RtsFlags.GranFlags.Fishing) {
+    // ToDo: check if stealing threads is prefered over stealing sparks
+    if (steal_spark) {
+      if (stealSparkMagic(proc))
+        return rtsTrue;
+      else                             // no spark found
+        if (steal_thread)
+          return stealThreadMagic(proc);
+        else                           // no thread found
+          return rtsFalse;             
+    } else {                           // ASSERT(steal_thread);
+      return stealThreadMagic(proc);
+    }
+    barf("stealSomething: never reached");   /* both branches above return */
+  }
+
+  /* The rest of this function does GUM style fishing */
+  
+  p = findRandomPE(proc); /* find a random PE other than proc */
+  
+  /* Message packing costs for sending a Fish; qeq jabbI'ID */
+  CurrentTime[proc] += RtsFlags.GranFlags.Costs.mpacktime;
+  
+  /* use another GranEvent for requesting a thread? */
+  if (steal_spark && RtsFlags.GranFlags.GranSimStats.Sparks)
+    DumpRawGranEvent(p, proc, SP_REQUESTED,
+                     (StgTSO*)NULL, (StgClosure *)NULL, (StgInt)0, 0);
+
+  /* time of the fish arrival on the remote PE */
+  fish_arrival_time = CurrentTime[proc] + RtsFlags.GranFlags.Costs.latency;
+  
+  /* Phps use an own Fish event for that? */
+  /* The contents of the spark component is a HACK:
+      1 means give me a spark;
+      2 means give me a thread
+      0 means give me nothing (this should never happen)
+  */
+  new_event(p, proc, fish_arrival_time,
+            FindWork,
+            (StgTSO*)NULL, (StgClosure*)NULL,
+            (steal_spark ? (rtsSpark*)1 : steal_thread ? (rtsSpark*)2 : (rtsSpark*)0));
+  
+  ++OutstandingFishes[proc];
+  /* only with Async fetching? */
+  if (procStatus[proc]==Idle)  
+    procStatus[proc]=Fishing;
+  
+  /* time needed to clean up buffers etc after sending a message */
+  CurrentTime[proc] += RtsFlags.GranFlags.Costs.mtidytime;
+  
+  /* If GUM style fishing stealing always succeeds because it only consists
+     of sending out a fish; of course, when the fish may return
+     empty-handed! */
+  return rtsTrue;
+}
+
+/*
+   This version of stealing a spark makes use of the global info on all
+   spark pools etc which is not available in a real parallel system.
+   This could be extended to test e.g. the impact of perfect load information.
+
+   Victim PEs are collected and sorted by sortPEsByTime; a random eligible
+   victim is chosen and its spark queue scanned for the first spark worth
+   stealing (non-stealable sparks are pruned on the way).  On success a
+   MoveSpark event is scheduled.  Returns rtsTrue iff a spark was stolen.
+*/
+//@cindex stealSparkMagic
+static rtsBool
+stealSparkMagic(proc)
+PEs proc;
+{
+  PEs p=0, i=0, j=0, n=0, first, upb;
+  rtsSpark *spark=NULL, *next;
+  PEs pes_by_time[MAX_PROC];
+  rtsBool stolen = rtsFalse;
+  rtsTime stealtime;
+
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+
+  sortPEsByTime(proc, pes_by_time, &first, &n);
+
+  while (!stolen && n>0) {
+    /* only consider PEs whose clock is behind proc's (indices < first),
+       unless there are none, in which case the whole list is eligible */
+    upb = (first==0) ? n : first;
+    i = natRandom(0,upb);                /* choose a random eligible PE */
+    p = pes_by_time[i];
+
+    IF_GRAN_DEBUG(randomSteal,
+                  belch("^^ stealSparkMagic (random_steal, not fishing): stealing spark from PE %d (current proc is %d)",
+                        p, proc));
+      
+    ASSERT(pending_sparks_hds[p]!=(rtsSpark *)NULL); /* non-empty spark pool */
+
+    /* Now go through rtsSparkQ and steal the first eligible spark */
+    
+    spark = pending_sparks_hds[p]; 
+    while (!stolen && spark != (rtsSpark*)NULL)
+      {
+        /* NB: no prev pointer is needed here because all sparks that are not 
+           chosen are pruned
+        */
+        if ((procStatus[p]==Idle || procStatus[p]==Sparking || procStatus[p] == Fishing) &&
+            spark->next==(rtsSpark*)NULL) 
+          {
+            /* Be social! Don't steal the only spark of an idle processor 
+               not {spark} neH yInIH !! */
+            break; /* next PE */
+          } 
+        else if (closure_SHOULD_SPARK(spark->node))
+          {
+            /* Don't Steal local sparks; 
+               ToDo: optionally prefer local over global sparks
+            if (!spark->global) {
+              prev=spark;
+              continue;                  next spark
+            }
+            */
+            /* found a spark! */
+
+            /* Prepare message for sending spark */
+            CurrentTime[p] += RtsFlags.GranFlags.Costs.mpacktime;
+
+            if (RtsFlags.GranFlags.GranSimStats.Sparks)
+              DumpRawGranEvent(p, (PEs)0, SP_EXPORTED,
+                               (StgTSO*)NULL, spark->node,
+                               spark->name, spark_queue_len(p));
+
+            /* the move cannot complete before both PEs have reached this
+               point in simulated time, plus the steal latency */
+            stealtime = (CurrentTime[p] > CurrentTime[proc] ? 
+                           CurrentTime[p] : 
+                           CurrentTime[proc])
+                        + sparkStealTime();
+      
+            new_event(proc, p /* CurrentProc */, stealtime,
+                      MoveSpark,
+                      (StgTSO*)NULL, spark->node, spark);
+      
+            stolen = rtsTrue;
+            ++OutstandingFishes[proc]; /* no. of sparks currently on the fly */
+            if (procStatus[proc]==Idle)
+              procStatus[proc] = Fishing;
+            ++(spark->global);         /* record that this is a global spark */
+            ASSERT(SparksAvail>0);
+            --SparksAvail;            /* on-the-fly sparks are not available */
+            /* NOTE(review): the returned successor is stored in `next' but
+               never used afterwards -- confirm whether that is intended */
+            next = delete_from_sparkq(spark, p, rtsFalse); // don't dispose!
+            CurrentTime[p] += RtsFlags.GranFlags.Costs.mtidytime;
+          }
+        else /* !(closure_SHOULD_SPARK(SPARK_NODE(spark))) */
+          {
+            IF_GRAN_DEBUG(checkSparkQ,
+                          belch("^^ pruning spark %p (node %p) in stealSparkMagic",
+                                spark, spark->node));
+
+            /* if the spark points to a node that should not be sparked,
+               prune the spark queue at this point */
+            if (RtsFlags.GranFlags.GranSimStats.Sparks)
+              DumpRawGranEvent(p, (PEs)0, SP_PRUNED,
+                               (StgTSO*)NULL, spark->node,
+                               spark->name, spark_queue_len(p));
+            if (RtsFlags.GranFlags.GranSimStats.Global)
+              globalGranStats.pruned_sparks++;
+            
+            ASSERT(SparksAvail>0);
+            --SparksAvail;
+            spark = delete_from_sparkq(spark, p, rtsTrue);
+          }
+        /* unlink spark (may have been freed!) from sparkq;
+        if (prev == NULL) // spark was head of spark queue
+          pending_sparks_hds[p] = spark->next;
+        else  
+          prev->next = spark->next;
+        if (spark->next == NULL)
+          pending_sparks_tls[p] = prev;
+        else  
+          next->prev = prev;
+        */
+      }                    /* while ...    iterating over sparkq */
+
+    /* ToDo: assert that PE p still has work left after stealing the spark */
+
+    if (!stolen && (n>0)) {  /* nothing stealable from proc p :( */
+      ASSERT(pes_by_time[i]==p);
+
+      /* remove p from the list (at pos i) */
+      for (j=i; j+1<n; j++)
+        pes_by_time[j] = pes_by_time[j+1];
+      n--;
+      
+      /* update index to first proc which is later (or equal) than proc */
+      for ( ;
+            (first>0) &&
+              (CurrentTime[pes_by_time[first-1]]>CurrentTime[proc]);
+            first--)
+        /* nothing */ ;
+    } 
+  }  /* while ... iterating over PEs in pes_by_time */
+
+  IF_GRAN_DEBUG(randomSteal,
+                if (stolen)
+                  belch("^^ stealSparkMagic: spark %p (node=%p) stolen by PE %d from PE %d (SparksAvail=%d; idlers=%d)",
+                        spark, spark->node, proc, p, 
+                        SparksAvail, idlers());
+                else  
+                  belch("^^ stealSparkMagic: nothing stolen by PE %d (sparkq len after pruning=%d)(SparksAvail=%d; idlers=%d)",
+                        proc, SparksAvail, idlers()));
+
+  if (RtsFlags.GranFlags.GranSimStats.Global &&
+      stolen && (i!=0)) { /* only for statistics */
+    globalGranStats.rs_sp_count++;
+    globalGranStats.ntimes_total += n;
+    globalGranStats.fl_total += first;
+    globalGranStats.no_of_steals++;
+  }
+
+  return stolen;
+}
+
+/*
+ The old stealThread code, which makes use of global info and does not
+ send out fishes.
+ NB: most of this is the same as in stealSparkMagic;
+ only the pieces specific to processing thread queues are different;
+ long live polymorphism!
+*/
+
+//@cindex stealThreadMagic
+/*
+  Steal a runnable thread from a randomly chosen victim PE and schedule
+  a MoveThread event that migrates it to PE proc.  This is the old
+  stealing code, which relies on global information rather than sending
+  out fish messages; it mirrors stealSparkMagic but operates on run
+  queues instead of spark queues.
+  Returns rtsTrue iff a thread was stolen.
+*/
+static rtsBool
+stealThreadMagic(proc)
+PEs proc;
+{
+  PEs p=0, i=0, j=0, n=0, first, upb;
+  StgTSO *tso=END_TSO_QUEUE;
+  PEs pes_by_time[MAX_PROC];
+  rtsBool stolen = rtsFalse;
+  rtsTime stealtime;
+
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+
+  /* pes_by_time[0..n-1] holds the candidate PEs sorted by their local
+     clocks; first is the index of the first PE whose clock is not
+     earlier than proc's */
+  sortPEsByTime(proc, pes_by_time, &first, &n);
+
+  while (!stolen && n>0) {
+    /* prefer victims whose clock is behind proc's (indices < first) */
+    upb = (first==0) ? n : first;
+    i = natRandom(0,upb);  /* choose a random eligible PE */
+    p = pes_by_time[i];
+
+    IF_GRAN_DEBUG(randomSteal,
+                  belch("^^ stealThreadMagic (random_steal, not fishing): stealing thread from PE %d (current proc is %d)",
+                        p, proc));
+
+    /* Steal the first exportable thread in the runnable queue but
+       never steal the first in the queue for social reasons;
+       not Qu' wa'DIch yInIH !!
+    */
+    /* Would be better to search through queue and have options which of
+       the threads to pick when stealing */
+    if (run_queue_hds[p] == END_TSO_QUEUE) {
+      IF_GRAN_DEBUG(randomSteal,
+                    belch("^^ stealThreadMagic: No thread to steal from PE %d (stealer=PE %d)",
+                          p, proc));
+    } else {
+      /* NOTE(review): only the empty queue is rejected above; if PE p
+         has exactly one runnable thread, tso below is END_TSO_QUEUE and
+         the subsequent link update dereferences the end-of-queue
+         marker -- presumably the candidate list guarantees more than
+         one thread; confirm in sortPEsByTime */
+      tso = run_queue_hds[p]->link;  /* tso is *2nd* thread in thread queue */
+      /* Found one */
+      stolen = rtsTrue;
+
+      /* unlink tso: the head keeps its place, the tail may move back */
+      run_queue_hds[p]->link = tso->link;
+      if (run_queue_tls[p] == tso)
+        run_queue_tls[p] = run_queue_hds[p];
+
+      /* ToDo: Turn magic constants into params */
+
+      /* cost of packing the thread into a message buffer on PE p */
+      CurrentTime[p] += 5l * RtsFlags.GranFlags.Costs.mpacktime;
+
+      /* arrival time at proc: later of the two PE clocks plus the
+         simulated communication costs */
+      stealtime = (CurrentTime[p] > CurrentTime[proc] ?
+                   CurrentTime[p] :
+                   CurrentTime[proc])
+                  + sparkStealTime()
+                  + 4l * RtsFlags.GranFlags.Costs.additional_latency
+                  + 5l * RtsFlags.GranFlags.Costs.munpacktime;
+
+      /* Move the thread; set bitmask to 0 while TSO is `on-the-fly' */
+      SET_GRAN_HDR(tso,Nowhere /* PE_NUMBER(proc) */);
+
+      /* Move from one queue to another */
+      new_event(proc, p, stealtime,
+                MoveThread,
+                tso, (StgClosure*)NULL, (rtsSpark*)NULL);
+
+      /* MAKE_BUSY(proc);  not yet; only when thread is in threadq */
+      ++OutstandingFishes[proc];
+      /* NOTE(review): guard looks suspicious -- it overwrites the
+         status with Fishing only when the status is non-zero (i.e. not
+         Idle); confirm against the analogous code in stealSparkMagic */
+      if (procStatus[proc])
+        procStatus[proc] = Fishing;
+      --SurplusThreads;
+
+      if(RtsFlags.GranFlags.GranSimStats.Full)
+        DumpRawGranEvent(p, proc,
+                         GR_STEALING,
+                         tso, (StgClosure*)NULL, (StgInt)0, 0);
+
+      /* costs for tidying up buffer after having sent it */
+      CurrentTime[p] += 5l * RtsFlags.GranFlags.Costs.mtidytime;
+    }
+
+    /* ToDo: assert that PE p still has work left after stealing the spark */
+
+    if (!stolen && (n>0)) {  /* nothing stealable from proc p :( */
+      ASSERT(pes_by_time[i]==p);
+
+      /* remove p from the list (at pos i) */
+      for (j=i; j+1<n; j++)
+        pes_by_time[j] = pes_by_time[j+1];
+      n--;
+
+      /* update index to first proc which is later (or equal) than proc */
+      for ( ;
+            (first>0) &&
+            (CurrentTime[pes_by_time[first-1]]>CurrentTime[proc]);
+            first--)
+        /* nothing */ ;
+    }
+  } /* while ... iterating over PEs in pes_by_time */
+
+  IF_GRAN_DEBUG(randomSteal,
+                if (stolen)
+                  belch("^^ stealThreadMagic: stolen TSO %d (%p) by PE %d from PE %d (SparksAvail=%d; idlers=%d)",
+                        tso->id, tso, proc, p,
+                        SparksAvail, idlers());
+                else
+                  belch("stealThreadMagic: nothing stolen by PE %d (SparksAvail=%d; idlers=%d)",
+                        proc, SparksAvail, idlers()));
+
+  if (RtsFlags.GranFlags.GranSimStats.Global &&
+      stolen && (i!=0)) { /* only for statistics */
+    /* ToDo: more statistics on avg thread queue length etc */
+    globalGranStats.rs_t_count++;
+    globalGranStats.no_of_migrates++;
+  }
+
+  return stolen;
+}
+
+//@cindex sparkStealTime
+/*
+  Communication delay charged for a steal, scaled by the number of
+  idle PEs: the more idlers, the smaller the delay (down towards one
+  latency unit when almost everyone is idle).
+  NOTE(review): with a 1-PE configuration RtsFlags.GranFlags.proc-1 is
+  zero, giving a division by zero -- presumably stealing can never be
+  attempted with a single PE; confirm at the call sites.
+*/
+static rtsTime
+sparkStealTime(void)
+{
+  double fishdelay, sparkdelay, latencydelay;
+  fishdelay =  (double)RtsFlags.GranFlags.proc/2;
+  sparkdelay = fishdelay -
+          ((fishdelay-1.0)/(double)(RtsFlags.GranFlags.proc-1))*((double)idlers());
+  latencydelay = sparkdelay*((double)RtsFlags.GranFlags.Costs.latency);
+
+  return((rtsTime)latencydelay);
+}
+
+//@node Routines directly called from Haskell world, Emiting profiling info for GrAnSim, Idle PEs, GranSim specific code
+//@subsection Routines directly called from Haskell world
+/*
+The @GranSim...@ routines in here are directly called via macros from the
+threaded world.
+
+First some auxiliary routines.
+*/
+
+/* Take the current thread off the thread queue and thereby activate the
+ next thread. It's assumed that the next ReSchedule after this uses
+ NEW_THREAD as param.
+ This fct is called from GranSimBlock and GranSimFetch
+*/
+
+//@cindex ActivateNextThread
+
+/*
+  Activate the next runnable thread on PE proc after the current one
+  has been taken off the queue: either mark the PE Idle (empty queue)
+  or charge the context-switch cost and dump a GR_SCHEDULE event for
+  the new queue head.
+*/
+void
+ActivateNextThread (proc)
+PEs proc;
+{
+  StgTSO *t;   /* NOTE(review): t is declared but never used */
+  /*
+    This routine is entered either via GranSimFetch or via GranSimBlock.
+    It has to prepare the CurrentTSO for being blocked and update the
+    run queue and other statistics on PE proc. The actual enqueuing to the
+    blocking queue (if coming from GranSimBlock) is done in the entry code
+    of the BLACKHOLE and BLACKHOLE_BQ closures (see StgMiscClosures.hc).
+  */
+  /* ToDo: add assertions here!! */
+  //ASSERT(run_queue_hds[proc]!=END_TSO_QUEUE);
+
+  // Only necessary if the running thread is at front of the queue
+  // run_queue_hds[proc] = run_queue_hds[proc]->link;
+  ASSERT(CurrentProc==proc);
+  ASSERT(!is_on_queue(CurrentTSO,proc));
+  if (run_queue_hds[proc]==END_TSO_QUEUE) {
+    /* NB: this routine is only entered with asynchr comm (see assertion) */
+    procStatus[proc] = Idle;
+  } else {
+    /* ToDo: check cost assignment */
+    CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadcontextswitchtime;
+    if (RtsFlags.GranFlags.GranSimStats.Full &&
+        (!RtsFlags.GranFlags.Light || RtsFlags.GranFlags.Debug.checkLight))
+      /* right flag !?? ^^^ */
+      DumpRawGranEvent(proc, 0, GR_SCHEDULE, run_queue_hds[proc],
+                       (StgClosure*)NULL, (StgInt)0, 0);
+  }
+}
+
+/*
+ The following GranSim fcts are stg-called from the threaded world.
+*/
+
+/* Called from HP_CHK and friends (see StgMacros.h):
+   account n words of heap allocation to the current thread, optionally
+   dump a GR_ALLOC event, and charge the allocation cost to both the
+   thread's exec time and the PE's clock.  Undone by GranSimUnallocate
+   when a heap check fails and has to be redone. */
+//@cindex GranSimAllocate
+void
+GranSimAllocate(n)
+StgInt n;
+{
+  CurrentTSO->gran.allocs += n;
+  ++(CurrentTSO->gran.basicblocks);
+
+  if (RtsFlags.GranFlags.GranSimStats.Heap) {
+    DumpRawGranEvent(CurrentProc, 0, GR_ALLOC, CurrentTSO,
+                     (StgClosure*)NULL, (StgInt)0, n);
+  }
+
+  CurrentTSO->gran.exectime += RtsFlags.GranFlags.Costs.heapalloc_cost;
+  CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.heapalloc_cost;
+}
+
+/*
+   Subtract the values added in GranSimAllocate, if a heap check fails
+   and so has to be redone.  Exactly inverts the bookkeeping above
+   (allocation words, basic-block count, and the allocation costs on
+   the thread and PE clocks).
+*/
+//@cindex GranSimUnallocate
+void
+GranSimUnallocate(n)
+StgInt n;
+{
+  CurrentTSO->gran.allocs -= n;
+  --(CurrentTSO->gran.basicblocks);
+
+  CurrentTSO->gran.exectime -= RtsFlags.GranFlags.Costs.heapalloc_cost;
+  CurrentTime[CurrentProc] -= RtsFlags.GranFlags.Costs.heapalloc_cost;
+}
+
+/* NB: We now inline this code via GRAN_EXEC rather than calling this fct */
+//@cindex GranSimExec
+/*
+  Charge the weighted cost of the given instruction counts (arithmetic
+  ops, branches, loads, stores, float ops) to the current thread's exec
+  time and the current PE's clock.
+*/
+void
+GranSimExec(ariths,branches,loads,stores,floats)
+StgWord ariths,branches,loads,stores,floats;
+{
+  StgWord cost = RtsFlags.GranFlags.Costs.arith_cost*ariths +
+                 RtsFlags.GranFlags.Costs.branch_cost*branches +
+                 RtsFlags.GranFlags.Costs.load_cost * loads +
+                 RtsFlags.GranFlags.Costs.store_cost*stores +
+                 RtsFlags.GranFlags.Costs.float_cost*floats;
+
+  CurrentTSO->gran.exectime += cost;
+  CurrentTime[CurrentProc] += cost;
+}
+
+/*
+   Fetch the node if it isn't local
+   -- result indicates whether fetch has been done.
+
+   This is GRIP-style single item fetching.
+*/
+
+//@cindex GranSimFetch
+/*
+  Returns 1 and sets NeedToReSchedule when an event had to be
+  scheduled (a delaying ContinueThread for a fake RBH, or a real
+  FetchNode); returns 0 when the node is already local or when running
+  in GrAnSim-Light mode (all data local by construction).
+*/
+StgInt
+GranSimFetch(node /* , liveness_mask */ )
+StgClosure *node;
+/* StgInt liveness_mask; */
+{
+  /* reset the return value (to be checked within STG land) */
+  NeedToReSchedule = rtsFalse;
+
+  if (RtsFlags.GranFlags.Light) {
+     /* Always reschedule in GrAnSim-Light to prevent one TSO from
+        running off too far
+     new_event(CurrentProc,CurrentProc,CurrentTime[CurrentProc],
+               ContinueThread,CurrentTSO,node,NULL);
+     */
+     return(0);
+  }
+
+  /* Faking an RBH closure:
+     If the bitmask of the closure is 0 then this node is a fake RBH;
+  */
+  if (node->header.gran.procs == Nowhere) {
+    IF_GRAN_DEBUG(bq,
+                  belch("## Found fake RBH (node %p); delaying TSO %d (%p)",
+                        node, CurrentTSO->id, CurrentTSO));
+
+    /* retry the same thread later (fixed 10000-cycle delay) */
+    new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc]+10000,
+              ContinueThread, CurrentTSO, node, (rtsSpark*)NULL);
+
+    /* Rescheduling (GranSim internal) is necessary */
+    NeedToReSchedule = rtsTrue;
+
+    return(1);
+  }
+
+  /* Note: once a node has been fetched, this test will be passed */
+  if (!IS_LOCAL_TO(PROCS(node),CurrentProc))
+    {
+      PEs p = where_is(node);
+      rtsTime fetchtime;
+
+      /* NOTE(review): debug message is missing a space before %u */
+      IF_GRAN_DEBUG(thunkStealing,
+                    if (p==CurrentProc)
+                      belch("GranSimFetch: Trying to fetch from own processor%u\n", p););
+
+      CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mpacktime;
+      /* NB: Fetch is counted on arrival (FetchReply) */
+
+      /* the request arrives at p no earlier than either clock, plus
+         one latency unit */
+      fetchtime = stg_max(CurrentTime[CurrentProc],CurrentTime[p]) +
+                  RtsFlags.GranFlags.Costs.latency;
+
+      new_event(p, CurrentProc, fetchtime,
+                FetchNode, CurrentTSO, node, (rtsSpark*)NULL);
+
+      if (fetchtime<TimeOfNextEvent)
+        TimeOfNextEvent = fetchtime;
+
+      /* About to block */
+      CurrentTSO->gran.blockedat = CurrentTime[CurrentProc];
+
+      ++OutstandingFetches[CurrentProc];
+
+      if (RtsFlags.GranFlags.DoAsyncFetch)
+        /* if asynchr comm is turned on, activate the next thread in the q */
+        ActivateNextThread(CurrentProc);
+      else
+        procStatus[CurrentProc] = Fetching;
+
+#if 0
+      /* ToDo: nuke the entire if (anything special for fair schedule?) */
+      if (RtsFlags.GranFlags.DoAsyncFetch)
+        {
+          /* Remove CurrentTSO from the queue -- assumes head of queue == CurrentTSO */
+          if(!RtsFlags.GranFlags.DoFairSchedule)
+            {
+              /* now done in do_the_fetchnode
+              if (RtsFlags.GranFlags.GranSimStats.Full)
+                DumpRawGranEvent(CurrentProc, p, GR_FETCH, CurrentTSO,
+                                 node, (StgInt)0, 0);
+              */
+              ActivateNextThread(CurrentProc);
+
+# if 0 && defined(GRAN_CHECK)
+              if (RtsFlags.GranFlags.Debug.blockOnFetch_sanity) {
+                if (TSO_TYPE(CurrentTSO) & FETCH_MASK_TSO) {
+                  fprintf(stderr,"FetchNode: TSO 0x%x has fetch-mask set @ %d\n",
+                          CurrentTSO,CurrentTime[CurrentProc]);
+                  stg_exit(EXIT_FAILURE);
+                } else {
+                  TSO_TYPE(CurrentTSO) |= FETCH_MASK_TSO;
+                }
+              }
+# endif
+              CurrentTSO->link = END_TSO_QUEUE;
+              /* CurrentTSO = END_TSO_QUEUE; */
+
+              /* CurrentTSO is pointed to by the FetchNode event; it is
+                 on no run queue any more */
+          } else {  /* fair scheduling currently not supported -- HWL */
+            barf("Asynchr communication is not yet compatible with fair scheduling\n");
+          }
+        } else {                /* !RtsFlags.GranFlags.DoAsyncFetch */
+          procStatus[CurrentProc] = Fetching; // ToDo: BlockedOnFetch;
+          /* now done in do_the_fetchnode
+          if (RtsFlags.GranFlags.GranSimStats.Full)
+            DumpRawGranEvent(CurrentProc, p,
+                             GR_FETCH, CurrentTSO, node, (StgInt)0, 0);
+          */
+          IF_GRAN_DEBUG(blockOnFetch,
+                        BlockedOnFetch[CurrentProc] = CurrentTSO;); /*- rtsTrue; -*/
+        }
+#endif /* 0 */
+
+      CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mtidytime;
+
+      /* Rescheduling (GranSim internal) is necessary */
+      NeedToReSchedule = rtsTrue;
+
+      return(1);
+    }
+  return(0);
+}
+
+//@cindex GranSimSpark
+/*
+  Record creation of a spark for node on the current PE: dump an
+  SP_SPARK event if spark profiling is on, optionally force the PE to
+  look for work immediately (DoAlwaysCreateThreads), and count the
+  spark as local or global for the current thread (local!=0 means the
+  sparked node lives on this PE).
+*/
+void
+GranSimSpark(local,node)
+StgInt local;
+StgClosure *node;
+{
+  /* ++SparksAvail;  Nope; do that in add_to_spark_queue */
+  if (RtsFlags.GranFlags.GranSimStats.Sparks)
+    DumpRawGranEvent(CurrentProc, (PEs)0, SP_SPARK,
+                     END_TSO_QUEUE, node, (StgInt)0, spark_queue_len(CurrentProc)-1);
+
+  /* Force the PE to take notice of the spark */
+  if(RtsFlags.GranFlags.DoAlwaysCreateThreads) {
+    new_event(CurrentProc,CurrentProc,CurrentTime[CurrentProc],
+              FindWork,
+              END_TSO_QUEUE, (StgClosure*)NULL, (rtsSpark*)NULL);
+    if (CurrentTime[CurrentProc]<TimeOfNextEvent)
+      TimeOfNextEvent = CurrentTime[CurrentProc];
+  }
+
+  if(local)
+    ++CurrentTSO->gran.localsparks;
+  else
+    ++CurrentTSO->gran.globalsparks;
+}
+
+//@cindex GranSimSparkAt
+/*
+  Place spark on whichever PE currently holds the closure `where';
+  thin wrapper that resolves the location and delegates to
+  GranSimSparkAtAbs.
+*/
+void
+GranSimSparkAt(spark,where,identifier)
+rtsSpark *spark;
+StgClosure *where;    /* This should be a node; alternatively could be a GA */
+StgInt identifier;
+{
+  PEs p = where_is(where);
+  GranSimSparkAtAbs(spark,p,identifier);
+}
+
+//@cindex GranSimSparkAtAbs
+/*
+  Export spark to the given PE: schedule a MoveSpark event at the
+  appropriate export time (including packing cost and latency when proc
+  is remote) and, with DoAlwaysCreateThreads, a FindWork event so the
+  target notices the new spark.  A NULL spark (killed by granularity
+  control) is silently ignored.
+  NOTE(review): identifier is unused in this function -- confirm
+  whether callers still need it.
+*/
+void
+GranSimSparkAtAbs(spark,proc,identifier)
+rtsSpark *spark;
+PEs proc;
+StgInt identifier;
+{
+  rtsTime exporttime;
+
+  if (spark == (rtsSpark *)NULL) /* Note: Granularity control might have */
+    return;                      /* turned a spark into a NULL. */
+
+  /* ++SparksAvail;  Nope; do that in add_to_spark_queue */
+  if(RtsFlags.GranFlags.GranSimStats.Sparks)
+    DumpRawGranEvent(proc,0,SP_SPARKAT,
+                     END_TSO_QUEUE, spark->node, (StgInt)0, spark_queue_len(proc));
+
+  if (proc!=CurrentProc) {
+    /* remote placement: charge packing cost and communication latency */
+    CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mpacktime;
+    exporttime = (CurrentTime[proc] > CurrentTime[CurrentProc]?
+                  CurrentTime[proc]: CurrentTime[CurrentProc])
+                 + RtsFlags.GranFlags.Costs.latency;
+  } else {
+    exporttime = CurrentTime[CurrentProc];
+  }
+
+  if ( RtsFlags.GranFlags.Light )
+    /* Need CurrentTSO in event field to associate costs with creating
+       spark even in a GrAnSim Light setup */
+    new_event(proc, CurrentProc, exporttime,
+              MoveSpark,
+              CurrentTSO, spark->node, spark);
+  else
+    new_event(proc, CurrentProc, exporttime,
+              MoveSpark, (StgTSO*)NULL, spark->node, spark);
+  /* Bit of a hack to treat placed sparks the same as stolen sparks */
+  ++OutstandingFishes[proc];
+
+  /* Force the PE to take notice of the spark (FINDWORK is put after a
+     MoveSpark into the sparkq!) */
+  if (RtsFlags.GranFlags.DoAlwaysCreateThreads) {
+    new_event(CurrentProc,CurrentProc,exporttime+1,
+              FindWork,
+              (StgTSO*)NULL, (StgClosure*)NULL, (rtsSpark*)NULL);
+  }
+
+  if (exporttime<TimeOfNextEvent)
+    TimeOfNextEvent = exporttime;
+
+  if (proc!=CurrentProc) {
+    /* cost of tidying the send buffer; the spark counts as global */
+    CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mtidytime;
+    ++CurrentTSO->gran.globalsparks;
+  } else {
+    ++CurrentTSO->gran.localsparks;
+  }
+}
+
+/*
+  This function handles local and global blocking.  It's called either
+  from threaded code (RBH_entry, BH_entry etc) or from blockFetch when
+  trying to fetch an BH or RBH
+*/
+
+//@cindex GranSimBlock
+/*
+  Record that tso blocks on node on PE proc: dump a GR_BLOCK event,
+  update the thread's blocking statistics, charge the queueing cost and
+  activate the next runnable thread.  The actual enqueueing on the
+  node's blocking queue happens in the closure's entry code
+  (see StgMiscClosures.hc), not here.
+*/
+void
+GranSimBlock(tso, proc, node)
+StgTSO *tso;
+PEs proc;
+StgClosure *node;
+{
+  PEs node_proc = where_is(node),
+      tso_proc = where_is((StgClosure *)tso);
+
+  ASSERT(tso_proc==CurrentProc);
+  // ASSERT(node_proc==CurrentProc);
+  IF_GRAN_DEBUG(bq,
+                if (node_proc!=CurrentProc)
+                  belch("## ghuH: TSO %d (%lx) [PE %d] blocks on non-local node %p [PE %d] (no simulation of FETCHMEs)",
+                        tso->id, tso, tso_proc, node, node_proc));
+  ASSERT(tso->link==END_TSO_QUEUE);
+  ASSERT(!is_on_queue(tso,proc)); // tso must not be on run queue already!
+  //ASSERT(tso==run_queue_hds[proc]);
+
+  IF_DEBUG(gran,
+           belch("GRAN: TSO %d (%p) [PE %d] blocks on closure %p @ %lx",
+                 tso->id, tso, proc, node, CurrentTime[proc]));
+
+
+  /* THIS SHOULD NEVER HAPPEN!
+     If tso tries to block on a remote node (i.e. node_proc!=CurrentProc)
+     we have missed a GranSimFetch before entering this closure;
+     we hack around it for now, faking a FetchNode;
+     because GranSimBlock is entered via a BLACKHOLE(_BQ) closure,
+     tso will be blocked on this closure until the FetchReply occurs.
+
+     ngoq Dogh!
+
+  if (node_proc!=CurrentProc) {
+    StgInt ret;
+    ret = GranSimFetch(node);
+    IF_GRAN_DEBUG(bq,
+                  if (ret)
+                    belch(".. GranSimBlock: faking a FetchNode of node %p from %d to %d",
+                          node, node_proc, CurrentProc););
+    return;
+  }
+  */
+
+  if (RtsFlags.GranFlags.GranSimStats.Full)
+    DumpRawGranEvent(proc,node_proc,GR_BLOCK,tso,node,(StgInt)0,0);
+
+  ++(tso->gran.blockcount);
+  /* Distinction between local and global block is made in blockFetch */
+  tso->gran.blockedat = CurrentTime[proc];
+
+  CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadqueuetime;
+  ActivateNextThread(proc);
+  /* tso->link = END_TSO_QUEUE;   not really necessary; only for testing */
+}
+
+#endif /* GRAN */
+
+//@node Index, , Dumping routines, GranSim specific code
+//@subsection Index
+
+//@index
+//* ActivateNextThread:: @cindex\s-+ActivateNextThread
+//* CurrentProc:: @cindex\s-+CurrentProc
+//* CurrentTime:: @cindex\s-+CurrentTime
+//* GranSimAllocate:: @cindex\s-+GranSimAllocate
+//* GranSimBlock:: @cindex\s-+GranSimBlock
+//* GranSimExec:: @cindex\s-+GranSimExec
+//* GranSimFetch:: @cindex\s-+GranSimFetch
+//* GranSimLight_insertThread:: @cindex\s-+GranSimLight_insertThread
+//* GranSimSpark:: @cindex\s-+GranSimSpark
+//* GranSimSparkAt:: @cindex\s-+GranSimSparkAt
+//* GranSimSparkAtAbs:: @cindex\s-+GranSimSparkAtAbs
+//* GranSimUnallocate:: @cindex\s-+GranSimUnallocate
+//* any_idle:: @cindex\s-+any_idle
+//* blockFetch:: @cindex\s-+blockFetch
+//* do_the_fetchnode:: @cindex\s-+do_the_fetchnode
+//* do_the_fetchreply:: @cindex\s-+do_the_fetchreply
+//* do_the_findwork:: @cindex\s-+do_the_findwork
+//* do_the_globalblock:: @cindex\s-+do_the_globalblock
+//* do_the_movespark:: @cindex\s-+do_the_movespark
+//* do_the_movethread:: @cindex\s-+do_the_movethread
+//* do_the_startthread:: @cindex\s-+do_the_startthread
+//* do_the_unblock:: @cindex\s-+do_the_unblock
+//* fetchNode:: @cindex\s-+fetchNode
+//* ga_to_proc:: @cindex\s-+ga_to_proc
+//* get_next_event:: @cindex\s-+get_next_event
+//* get_time_of_next_event:: @cindex\s-+get_time_of_next_event
+//* grab_event:: @cindex\s-+grab_event
+//* handleFetchRequest:: @cindex\s-+handleFetchRequest
+//* handleIdlePEs:: @cindex\s-+handleIdlePEs
+//* idlers:: @cindex\s-+idlers
+//* insertThread:: @cindex\s-+insertThread
+//* insert_event:: @cindex\s-+insert_event
+//* is_on_queue:: @cindex\s-+is_on_queue
+//* is_unique:: @cindex\s-+is_unique
+//* new_event:: @cindex\s-+new_event
+//* prepend_event:: @cindex\s-+prepend_event
+//* print_event:: @cindex\s-+print_event
+//* print_eventq:: @cindex\s-+print_eventq
+//* prune_eventq :: @cindex\s-+prune_eventq
+//* spark queue:: @cindex\s-+spark queue
+//* sparkStealTime:: @cindex\s-+sparkStealTime
+//* stealSomething:: @cindex\s-+stealSomething
+//* stealSpark:: @cindex\s-+stealSpark
+//* stealSparkMagic:: @cindex\s-+stealSparkMagic
+//* stealThread:: @cindex\s-+stealThread
+//* stealThreadMagic:: @cindex\s-+stealThreadMagic
+//* thread_queue_len:: @cindex\s-+thread_queue_len
+//* traverse_eventq_for_gc:: @cindex\s-+traverse_eventq_for_gc
+//* where_is:: @cindex\s-+where_is
+//@end index
diff --git a/rts/parallel/GranSimRts.h b/rts/parallel/GranSimRts.h
new file mode 100644
index 0000000000..fc31a1f0a6
--- /dev/null
+++ b/rts/parallel/GranSimRts.h
@@ -0,0 +1,268 @@
+/* --------------------------------------------------------------------------
+ Time-stamp: <Tue Mar 06 2001 00:18:30 Stardate: [-30]6285.06 hwloidl>
+
+ Variables and functions specific to GranSim.
+ ----------------------------------------------------------------------- */
+
+#ifndef GRANSIM_RTS_H
+#define GRANSIM_RTS_H
+
+//@node Headers for GranSim objs used only in the RTS internally, , ,
+//@section Headers for GranSim objs used only in the RTS internally
+
+//@menu
+//* Event queue::
+//* Spark handling routines::
+//* Processor related stuff::
+//* Local types::
+//* Statistics gathering::
+//* Prototypes::
+//@end menu
+//*/ fool highlight
+
+//@node Event queue, Spark handling routines, Headers for GranSim objs used only in the RTS internally, Headers for GranSim objs used only in the RTS internally
+//@subsection Event queue
+
+#if defined(GRAN) || defined(PAR)
+/* Granularity event types for output (see DumpGranEvent) */
+typedef enum GranEventType_ {
+ GR_START = 0, GR_STARTQ,
+ GR_STEALING, GR_STOLEN, GR_STOLENQ,
+ GR_FETCH, GR_REPLY, GR_BLOCK, GR_RESUME, GR_RESUMEQ,
+ GR_SCHEDULE, GR_DESCHEDULE,
+ GR_END,
+ SP_SPARK, SP_SPARKAT, SP_USED, SP_PRUNED, SP_EXPORTED, SP_ACQUIRED, SP_REQUESTED,
+ GR_ALLOC,
+ GR_TERMINATE,
+ GR_SYSTEM_START, GR_SYSTEM_END, /* only for debugging */
+ GR_EVENT_MAX
+} GranEventType;
+
+extern char *gran_event_names[];
+#endif
+
+#if defined(GRAN) /* whole file */
+
+/* Event Types (internal use only) */
+typedef enum rtsEventType_ {
+ ContinueThread = 0, /* Continue running the first thread in the queue */
+ StartThread, /* Start a newly created thread */
+ ResumeThread, /* Resume a previously running thread */
+ MoveSpark, /* Move a spark from one PE to another */
+ MoveThread, /* Move a thread from one PE to another */
+ FindWork, /* Search for work */
+ FetchNode, /* Fetch a node */
+ FetchReply, /* Receive a node */
+ GlobalBlock, /* Block a TSO on a remote node */
+ UnblockThread /* Make a TSO runnable */
+} rtsEventType;
+
+/* Number of last event type */
+#define MAX_EVENT 9
+
+typedef struct rtsEvent_ {
+ PEs proc; /* Processor id */
+ PEs creator; /* Processor id of PE that created the event */
+ rtsEventType evttype; /* rtsEvent type */
+ rtsTime time; /* Time at which event happened */
+ StgTSO *tso; /* Associated TSO, if relevant */
+ StgClosure *node; /* Associated node, if relevant */
+ rtsSpark *spark; /* Associated SPARK, if relevant */
+ StgInt gc_info; /* Counter of heap objects to mark (used in GC only)*/
+ struct rtsEvent_ *next;
+ } rtsEvent;
+
+typedef rtsEvent *rtsEventQ;
+
+extern rtsEventQ EventHd;
+
+/* Interface for ADT of Event Queue */
+rtsEvent *get_next_event(void);
+rtsTime get_time_of_next_event(void);
+void insert_event(rtsEvent *newentry);
+void new_event(PEs proc, PEs creator, rtsTime time,
+ rtsEventType evttype, StgTSO *tso,
+ StgClosure *node, rtsSpark *spark);
+void print_event(rtsEvent *event);
+void print_eventq(rtsEvent *hd);
+void prepend_event(rtsEvent *event);
+rtsEventQ grab_event(void);
+void prune_eventq(StgTSO *tso, StgClosure *node);
+
+void traverse_eventq_for_gc(void);
+void markEventQueue(void);
+
+//@node Spark handling routines, Processor related stuff, Event queue, Headers for GranSim objs used only in the RTS internally
+//@subsection Spark handling routines
+
+/* These functions are only used in the RTS internally; see GranSim.h for rest */
+void disposeSpark(rtsSpark *spark);
+void disposeSparkQ(rtsSparkQ spark);
+void print_spark(rtsSpark *spark);
+void print_sparkq(PEs proc);
+void print_sparkq_stats(void);
+nat spark_queue_len(PEs proc);
+rtsSpark *delete_from_sparkq (rtsSpark *spark, PEs p, rtsBool dispose_too);
+void markSparkQueue(void);
+
+//@node Processor related stuff, Local types, Spark handling routines, Headers for GranSim objs used only in the RTS internally
+//@subsection Processor related stuff
+
+typedef enum rtsProcStatus_ {
+ Idle = 0, /* empty threadq */
+ Sparking, /* non-empty sparkq; FINDWORK has been issued */
+  Starting,     /* STARTTHREAD has been issued */
+ Fetching, /* waiting for remote data (only if block-on-fetch) */
+ Fishing, /* waiting for remote spark/thread */
+ Busy /* non-empty threadq, with head of queue active */
+} rtsProcStatus;
+
+/*
+#define IS_IDLE(proc) (procStatus[proc] == Idle)
+#define IS_SPARKING(proc) (procStatus[proc] == Sparking)
+#define IS_STARTING(proc) (procStatus[proc] == Starting)
+#define IS_FETCHING(proc) (procStatus[proc] == Fetching)
+#define IS_FISHING(proc) (procStatus[proc] == Fishing)
+#define IS_BUSY(proc) (procStatus[proc] == Busy)
+#define ANY_IDLE (any_idle())
+#define MAKE_IDLE(proc) procStatus[proc] = Idle
+#define MAKE_SPARKING(proc) procStatus[proc] = Sparking
+#define MAKE_STARTING(proc) procStatus[proc] = Starting
+#define MAKE_FETCHING(proc) procStatus[proc] = Fetching
+#define MAKE_FISHING(proc) procStatus[proc] = Fishing
+#define MAKE_BUSY(proc) procStatus[proc] = Busy
+*/
+
+//@node Local types, Statistics gathering, Processor related stuff, Headers for GranSim objs used only in the RTS internally
+//@subsection Local types
+
+/* Return codes of HandleFetchRequest:
+ 0 ... ok (FETCHREPLY event with a buffer containing addresses of the
+ nearby graph has been scheduled)
+ 1 ... node is already local (fetched by somebody else; no event is
+ scheduled in here)
+     2 ... fetch request has been forwarded to the PE that now contains the
+ node
+ 3 ... node is a black hole (BH, BQ or RBH); no event is scheduled, and
+ the current TSO is put into the blocking queue of that node
+ 4 ... out of heap in PackNearbyGraph; GC should be triggered in calling
+ function to guarantee that the tso and node inputs are valid
+ (they may be moved during GC).
+ Return codes of blockFetch:
+ 0 ... ok; tso is now at beginning of BQ attached to the bh closure
+ 1 ... the bh closure is no BH any more; tso is immediately unblocked
+*/
+
+/* Return codes of fetchNode/HandleFetchRequest and blockFetch; the
+   comment block above spells out each case in detail.
+   (Trailing comma after the last enumerator removed: it is a C99-ism
+   rejected by strict C89 compilers, and the sibling enums in this
+   header omit it.) */
+typedef enum rtsFetchReturnCode_ {
+  Ok = 0,        /* FETCHREPLY event with nearby-graph buffer scheduled */
+  NodeIsLocal,   /* node already local; no event scheduled */
+  NodeHasMoved,  /* request forwarded to the PE now holding the node */
+  NodeIsBH,      /* black hole: TSO queued on the node's blocking queue */
+  NodeIsNoBH,    /* no longer a BH; TSO unblocked immediately */
+  OutOfHeap      /* PackNearbyGraph ran out of heap; caller should GC */
+} rtsFetchReturnCode;
+
+//@node Statistics gathering, Prototypes, Local types, Headers for GranSim objs used only in the RTS internally
+//@subsection Statistics gathering
+
+extern unsigned int /* nat */ OutstandingFetches[], OutstandingFishes[];
+extern rtsProcStatus procStatus[];
+extern StgTSO *BlockedOnFetch[];
+
+/* global structure for collecting statistics */
+typedef struct GlobalGranStats_ {
+ /* event stats */
+ nat noOfEvents;
+ nat event_counts[MAX_EVENT];
+
+ /* communication stats */
+ nat fetch_misses;
+ nat tot_fake_fetches; // GranSim internal; faked Fetches are a kludge!!
+ nat tot_low_pri_sparks;
+
+ /* load distribution statistics */
+ nat rs_sp_count, rs_t_count, ntimes_total, fl_total,
+ no_of_steals, no_of_migrates;
+
+ /* spark queue stats */
+ nat tot_sq_len, tot_sq_probes, tot_sparks;
+ nat tot_add_threads, tot_tq_len, non_end_add_threads;
+
+ /* packet statistics */
+ nat tot_packets, tot_packet_size, tot_cuts, tot_thunks;
+
+ /* thread stats */
+ nat tot_threads_created, threads_created_on_PE[MAX_PROC],
+ tot_TSOs_migrated;
+
+ /* spark stats */
+ nat pruned_sparks, withered_sparks;
+ nat tot_sparks_created, sparks_created_on_PE[MAX_PROC];
+
+ /* scheduling stats */
+ nat tot_yields, tot_stackover, tot_heapover;
+
+ /* blocking queue statistics */
+ rtsTime tot_bq_processing_time;
+ nat tot_bq_len, tot_bq_len_local, tot_awbq, tot_FMBQs;
+} GlobalGranStats;
+
+extern GlobalGranStats globalGranStats;
+
+//@node Prototypes, , Statistics gathering, Headers for GranSim objs used only in the RTS internally
+//@subsection Prototypes
+
+/* Generally useful fcts */
+PEs where_is(StgClosure *node);
+rtsBool is_unique(StgClosure *node);
+
+/* Prototypes of event handling functions; needed in Schedule.c:ReSchedule() */
+void do_the_globalblock (rtsEvent* event);
+void do_the_unblock (rtsEvent* event);
+void do_the_fetchnode (rtsEvent* event);
+void do_the_fetchreply (rtsEvent* event);
+void do_the_movethread (rtsEvent* event);
+void do_the_movespark (rtsEvent* event);
+void do_the_startthread(rtsEvent *event);
+void do_the_findwork(rtsEvent* event);
+void gimme_spark (rtsEvent *event, rtsBool *found_res, rtsSparkQ *spark_res);
+rtsBool munch_spark (rtsEvent *event, rtsSparkQ spark);
+
+/* GranSimLight routines */
+void GranSimLight_enter_system(rtsEvent *event, StgTSO **ActiveTSOp);
+void GranSimLight_leave_system(rtsEvent *event, StgTSO **ActiveTSOp);
+
+/* Communication related routines */
+rtsFetchReturnCode fetchNode(StgClosure* node, PEs from, PEs to);
+rtsFetchReturnCode handleFetchRequest(StgClosure* node, PEs curr_proc, PEs p, StgTSO* tso);
+void handleIdlePEs(void);
+
+long int random(void); /* used in stealSpark() and stealThread() in GranSim.c */
+
+/* Scheduling fcts defined in GranSim.c */
+void insertThread(StgTSO *tso, PEs proc);
+void endThread(StgTSO *tso, PEs proc);
+rtsBool GranSimLight_insertThread(StgTSO *tso, PEs proc);
+nat thread_queue_len(PEs proc);
+
+/* For debugging */
+rtsBool is_on_queue (StgTSO *tso, PEs proc);
+#endif
+
+#if defined(GRAN) || defined(PAR)
+/*
+ Interface for dumping routines (i.e. writing to log file).
+ These routines are shared with GUM (and could also be used for SMP).
+*/
+void DumpGranEvent(GranEventType name, StgTSO *tso);
+void DumpEndEvent(PEs proc, StgTSO *tso, rtsBool mandatory_thread);
+void DumpTSO(StgTSO *tso);
+void DumpRawGranEvent(PEs proc, PEs p, GranEventType name,
+ StgTSO *tso, StgClosure *node,
+ StgInt sparkname, StgInt len);
+void DumpVeryRawGranEvent(rtsTime time, PEs proc, PEs p, GranEventType name,
+ StgTSO *tso, StgClosure *node,
+ StgInt sparkname, StgInt len);
+#endif
+
+#endif /* GRANSIM_RTS_H */
diff --git a/rts/parallel/HLC.h b/rts/parallel/HLC.h
new file mode 100644
index 0000000000..793ac840f9
--- /dev/null
+++ b/rts/parallel/HLC.h
@@ -0,0 +1,63 @@
+/* --------------------------------------------------------------------------
+ Time-stamp: <Sun Mar 18 2001 20:16:14 Stardate: [-30]6349.22 hwloidl>
+
+ High Level Communications Header (HLC.h)
+
+ Contains the high-level definitions (i.e. communication
+ subsystem independent) used by GUM
+ Phil Trinder, Glasgow University, 12 December 1994
+ H-W. Loidl, Heriot-Watt, November 1999
+ ----------------------------------------------------------------------- */
+
+#ifndef __HLC_H
+#define __HLC_H
+
+#ifdef PAR
+
+#include "LLC.h"
+
+#define NEW_FISH_AGE 0
+#define NEW_FISH_HISTORY 0
+#define NEW_FISH_HUNGER 0
+#define FISH_LIFE_EXPECTANCY 10
+
+
+//@node GUM Message Sending and Unpacking Functions
+//@subsection GUM Message Sending and Unpacking Functions
+
+rtsBool initMoreBuffers(void);
+
+void sendFetch (globalAddr *ga, globalAddr *bqga, int load);
+void sendResume(globalAddr *rga, int nelem, rtsPackBuffer *packBuffer);
+void sendAck (GlobalTaskId task, int ngas, globalAddr *gagamap);
+void sendFish (GlobalTaskId destPE, GlobalTaskId origPE, int age, int history, int hunger);
+void sendFree (GlobalTaskId destPE, int nelem, P_ data);
+void sendSchedule(GlobalTaskId origPE, int nelem, rtsPackBuffer *packBuffer);
+void sendReval(GlobalTaskId origPE, int nelem, rtsPackBuffer *data);
+
+//@node Message-Processing Functions
+//@subsection Message-Processing Functions
+
+rtsBool processMessages(void);
+void processFetches(void);
+void processTheRealFetches(void);
+
+//@node Miscellaneous Functions
+//@subsection Miscellaneous Functions
+
+void prepareFreeMsgBuffers(void);
+void freeRemoteGA (int pe, globalAddr *ga);
+void sendFreeMessages(void);
+
+GlobalTaskId choosePE(void);
+StgClosure *createBlockedFetch (globalAddr ga, globalAddr rga);
+void waitForTermination(void);
+
+/* Message bouncing (startup and shutdown, mainly) */
+void bounceFish(void);
+void bounceReval(void);
+
+void DebugPrintGAGAMap (globalAddr *gagamap, int nGAs);
+
+#endif /* PAR */
+#endif /* __HLC_H */
diff --git a/rts/parallel/HLComms.c b/rts/parallel/HLComms.c
new file mode 100644
index 0000000000..b0982e441c
--- /dev/null
+++ b/rts/parallel/HLComms.c
@@ -0,0 +1,1810 @@
+/* ----------------------------------------------------------------------------
+ * Time-stamp: <Wed Mar 21 2001 16:34:41 Stardate: [-30]6363.45 hwloidl>
+ *
+ * High Level Communications Routines (HLComms.lc)
+ *
+ * Contains the high-level routines (i.e. communication
+ * subsystem independent) used by GUM
+ *
+ * GUM 0.2x: Phil Trinder, Glasgow University, 12 December 1994
+ * GUM 3.xx: Phil Trinder, Simon Marlow July 1998
+ * GUM 4.xx: H-W. Loidl, Heriot-Watt University, November 1999 -
+ *
+ * ------------------------------------------------------------------------- */
+
+#ifdef PAR /* whole file */
+
+//@node High Level Communications Routines, , ,
+//@section High Level Communications Routines
+
+//@menu
+//* Macros etc::
+//* Includes::
+//* GUM Message Sending and Unpacking Functions::
+//* Message-Processing Functions::
+//* GUM Message Processor::
+//* Miscellaneous Functions::
+//* Index::
+//@end menu
+
+//@node Macros etc, Includes, High Level Communications Routines, High Level Communications Routines
+//@subsection Macros etc
+
+/* Evidently not Posix */
+/* #include "PosixSource.h" */
+
+//@node Includes, GUM Message Sending and Unpacking Functions, Macros etc, High Level Communications Routines
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "Storage.h" // for recordMutable
+#include "HLC.h"
+#include "Parallel.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+#include "Sparks.h"
+#include "FetchMe.h" // for BLOCKED_FETCH_info etc
+#if defined(DEBUG)
+# include "ParallelDebug.h"
+#endif
+#include "StgMacros.h" // inlined IS_... fcts
+
+#ifdef DIST
+#include "SchedAPI.h" //for createIOThread
+extern unsigned int context_switch;
+#endif /* DIST */
+
+//@node GUM Message Sending and Unpacking Functions, Message-Processing Functions, Includes, High Level Communications Routines
+//@subsection GUM Message Sending and Unpacking Functions
+
+/*
+ * GUM Message Sending and Unpacking Functions
+ */
+
+/*
+ * Allocate space for message processing
+ */
+
+//@cindex gumPackBuffer
+static rtsPackBuffer *gumPackBuffer;
+
+//@cindex initMoreBuffers
+rtsBool
+initMoreBuffers(void)
+{
+ if ((gumPackBuffer = (rtsPackBuffer *)stgMallocWords(RtsFlags.ParFlags.packBufferSize,
+ "initMoreBuffers")) == NULL)
+ return rtsFalse;
+ return rtsTrue;
+}
+
+/*
+ * SendFetch packs the two global addresses and a load into a message +
+ * sends it.
+
+//@cindex FETCH
+
+ Structure of a FETCH message:
+
+ | GA 1 | GA 2 |
+ +------------------------------------+------+
+ | gtid | slot | weight | gtid | slot | load |
+ +------------------------------------+------+
+ */
+
+//@cindex sendFetch
/*
 * sendFetch: request remote data.  rga is the global address of the data
 * we want (owned by another PE); lga is the local GA of the blocking
 * queue to be resumed when the data arrives; load is a load figure
 * piggy-backed on the message.
 * The 6-word argument order below must match unpackFetch exactly.
 * NOTE(review): HLC.h declares the parameters as (ga, bqga).
 */
void
sendFetch(globalAddr *rga, globalAddr *lga, int load)
{
    ASSERT(rga->weight > 0 && lga->weight > 0);
    IF_PAR_DEBUG(fetch,
                 belch("~^** Sending Fetch for ((%x, %d, 0)); locally ((%x, %d, %x)), load = %d",
                       rga->payload.gc.gtid, rga->payload.gc.slot,
                       lga->payload.gc.gtid, lga->payload.gc.slot, lga->weight,
                       load));


    /* ToDo: Dump event
    DumpRawGranEvent(CURRENT_PROC, taskIDtoPE(rga->payload.gc.gtid),
		     GR_FETCH, CurrentTSO, (StgClosure *)(lga->payload.gc.slot),
		     0, spark_queue_len(ADVISORY_POOL));
    */

    /* send to the PE that owns the requested closure */
    sendOpV(PP_FETCH, rga->payload.gc.gtid, 6,
            (StgWord) rga->payload.gc.gtid, (StgWord) rga->payload.gc.slot,
            (StgWord) lga->weight, (StgWord) lga->payload.gc.gtid,
            (StgWord) lga->payload.gc.slot, (StgWord) load);
}
+
+/*
+ * unpackFetch unpacks a FETCH message into two Global addresses and a load
+ * figure.
+*/
+
+//@cindex unpackFetch
/*
 * unpackFetch: inverse of sendFetch.  Fills in lga (the GA of the local
 * closure being requested; weight fixed at 1 since it refers to our own
 * table), rga (the requester's blocking-queue GA, carrying the shipped
 * weight) and the load figure.
 */
static void
unpackFetch(globalAddr *lga, globalAddr *rga, int *load)
{
    long buf[6];

    GetArgs(buf, 6);

    IF_PAR_DEBUG(fetch,
                 belch("~^** Unpacking Fetch for ((%x, %d, 0)) to ((%x, %d, %x)), load = %d",
                       (GlobalTaskId) buf[0], (int) buf[1],
                       (GlobalTaskId) buf[3], (int) buf[4], buf[2], buf[5]));

    /* local GA: weight 1 because the slot is in our own GALA table */
    lga->weight = 1;
    lga->payload.gc.gtid = (GlobalTaskId) buf[0];
    lga->payload.gc.slot = (int) buf[1];

    /* remote GA: carries the weight shipped with the request */
    rga->weight = (unsigned) buf[2];
    rga->payload.gc.gtid = (GlobalTaskId) buf[3];
    rga->payload.gc.slot = (int) buf[4];

    *load = (int) buf[5];

    ASSERT(rga->weight > 0);
}
+
+/*
+ * SendResume packs the remote blocking queue's GA and data into a message
+ * and sends it.
+
+//@cindex RESUME
+
+ Structure of a RESUME message:
+
+ -------------------------------
+ | weight | slot | n | data ...
+ -------------------------------
+
+ data is a packed graph represented as an rtsPackBuffer
+ n is the size of the graph (as returned by PackNearbyGraph) + packet hdr size
+ */
+
+//@cindex sendResume
/*
 * sendResume: reply to a FETCH.  Ships the packed graph in packBuffer
 * (nelem elements, which must equal packBuffer->size) to the PE that
 * owns rga, together with rga's weight and slot so the receiver can
 * find the blocking queue to awaken.
 */
void
sendResume(globalAddr *rga, int nelem, rtsPackBuffer *packBuffer)
{
    IF_PAR_DEBUG(fetch,
                 belch("~^[] Sending Resume (packet <<%d>> with %d elems) for ((%x, %d, %x)) to [%x]",
                       packBuffer->id, nelem,
                       rga->payload.gc.gtid, rga->payload.gc.slot, rga->weight,
                       rga->payload.gc.gtid));
    IF_PAR_DEBUG(packet,
                 PrintPacket(packBuffer));

    ASSERT(nelem==packBuffer->size);
    /* check for magic end-of-buffer word */
    IF_DEBUG(sanity, ASSERT(*(packBuffer->buffer+nelem) == END_OF_BUFFER_MARKER));

    /* payload = whole pack buffer (header + data); 2 extra scalar args */
    sendOpNV(PP_RESUME, rga->payload.gc.gtid,
             nelem + PACK_BUFFER_HDR_SIZE + DEBUG_HEADROOM, (StgPtr)packBuffer,
             2, (rtsWeight) rga->weight, (StgWord) rga->payload.gc.slot);
}
+
+/*
+ * unpackResume unpacks a Resume message into two Global addresses and
+ * a data array.
+ */
+
+//@cindex unpackResume
/*
 * unpackResume: inverse of sendResume.  Fills in lga (our own blocking
 * queue, reconstructed from the shipped weight and slot), the element
 * count nelem (with pack-buffer header and debug headroom stripped),
 * and copies the packed graph into packBuffer.
 */
static void
unpackResume(globalAddr *lga, int *nelem, rtsPackBuffer *packBuffer)
{
    long buf[3];

    GetArgs(buf, 3);

    /*
      RESUME event is written in awaken_blocked_queue
      DumpRawGranEvent(CURRENT_PROC, taskIDtoPE(lga->payload.gc.gtid),
		       GR_RESUME, END_TSO_QUEUE, (StgClosure *)NULL, 0, 0);
    */

    lga->weight = (unsigned) buf[0];
    lga->payload.gc.gtid = mytid;  /* the resumed queue lives on this PE */
    lga->payload.gc.slot = (int) buf[1];

    /* buf[2] is the total size including header; read the rest in one go */
    *nelem = (int) buf[2] - PACK_BUFFER_HDR_SIZE - DEBUG_HEADROOM;
    GetArgs(packBuffer, *nelem + PACK_BUFFER_HDR_SIZE + DEBUG_HEADROOM);

    IF_PAR_DEBUG(fetch,
                 belch("~^[] Unpacking Resume (packet <<%d>> with %d elems) for ((%x, %d, %x))",
                       packBuffer->id, *nelem, mytid, (int) buf[1], (unsigned) buf[0]));

    /* check for magic end-of-buffer word */
    IF_DEBUG(sanity, ASSERT(*(packBuffer->buffer+*nelem) == END_OF_BUFFER_MARKER));
}
+
+/*
+ * SendAck packs the global address being acknowledged, together with
+ * an array of global addresses for any closures shipped and sends them.
+
+//@cindex ACK
+
+ Structure of an ACK message:
+
+ | GA 1 | GA 2 |
+ +---------------------------------------------+-------
+ | weight | gtid | slot | weight | gtid | slot | ..... ngas times
+ + --------------------------------------------+-------
+
+ */
+
+//@cindex sendAck
/*
 * sendAck: acknowledge receipt of a packed graph.  gagamap holds ngas
 * (oldGA, newGA) pairs, i.e. 2*ngas globalAddr entries; each pair is
 * flattened into 6 longs.  No message is sent when ngas is 0.
 */
void
sendAck(GlobalTaskId task, int ngas, globalAddr *gagamap)
{
    static long *buffer;
    long *p;
    int i;

    if(ngas==0)
        return; //don't send unnecessary messages!!

    /* reuse the shared pack buffer as scratch space */
    buffer = (long *) gumPackBuffer;

    for(i = 0, p = buffer; i < ngas; i++, p += 6) {
        /* gagamap[1] is the new GA of the current pair; it must carry weight */
        ASSERT(gagamap[1].weight > 0);
        p[0] = (long) gagamap->weight;
        p[1] = (long) gagamap->payload.gc.gtid;
        p[2] = (long) gagamap->payload.gc.slot;
        gagamap++;
        p[3] = (long) gagamap->weight;
        p[4] = (long) gagamap->payload.gc.gtid;
        p[5] = (long) gagamap->payload.gc.slot;
        gagamap++;
    }
    IF_PAR_DEBUG(schedule,
                 belch("~^,, Sending Ack (%d pairs) to [%x]\n",
                       ngas, task));

    /* p - buffer == 6*ngas words */
    sendOpN(PP_ACK, task, p - buffer, (StgPtr)buffer);
}
+
+/*
+ * unpackAck unpacks an Acknowledgement message into a Global address,
+ * a count of the number of global addresses following and a map of
+ * Global addresses
+ */
+
+//@cindex unpackAck
/*
 * unpackAck: inverse of sendAck.  Reads the word count, derives the
 * number of GA pairs (*ngas = words/6) and refills gagamap with the
 * (oldGA, newGA) pairs.  Caller must supply a gagamap large enough for
 * 2 * *ngas entries.
 */
static void
unpackAck(int *ngas, globalAddr *gagamap)
{
    long GAarraysize;
    long buf[6];

    GetArgs(&GAarraysize, 1);

    *ngas = GAarraysize / 6;  /* 6 words per (oldGA, newGA) pair */

    IF_PAR_DEBUG(schedule,
                 belch("~^,, Unpacking Ack (%d pairs) on [%x]\n",
                       *ngas, mytid));

    while (GAarraysize > 0) {
        GetArgs(buf, 6);
        gagamap->weight = (rtsWeight) buf[0];
        gagamap->payload.gc.gtid = (GlobalTaskId) buf[1];
        gagamap->payload.gc.slot = (int) buf[2];
        gagamap++;
        gagamap->weight = (rtsWeight) buf[3];
        gagamap->payload.gc.gtid = (GlobalTaskId) buf[4];
        gagamap->payload.gc.slot = (int) buf[5];
        ASSERT(gagamap->weight > 0);
        gagamap++;
        GAarraysize -= 6;
    }
}
+
+/*
+ * SendFish packs the global address being acknowledged, together with
+ * an array of global addresses for any closures shipped and sends them.
+
+//@cindex FISH
+
+ Structure of a FISH message:
+
+ +----------------------------------+
+ | orig PE | age | history | hunger |
+ +----------------------------------+
+ */
+
+//@cindex sendFish
/*
 * sendFish: send a work-stealing FISH message to destPE.  origPE is the
 * PE that originally spawned the fish; age/history/hunger travel with
 * it (history and hunger are currently unused, see unpackFish).  When
 * this PE launches its own fish (origPE == mytid) the outstanding-fish
 * count is bumped so the scheduler knows one is in flight.
 */
void
sendFish(GlobalTaskId destPE, GlobalTaskId origPE,
         int age, int history, int hunger)
{
    IF_PAR_DEBUG(fish,
                 belch("~^$$ Sending Fish to [%x] (%d outstanding fishes)",
                       destPE, outstandingFishes));

    sendOpV(PP_FISH, destPE, 4,
            (StgWord) origPE, (StgWord) age, (StgWord) history, (StgWord) hunger);

    if (origPE == mytid) {
        //fishing = rtsTrue;
        outstandingFishes++;
    }
}
+
+/*
+ * unpackFish unpacks a FISH message into the global task id of the
+ * originating PE and 3 data fields: the age, history and hunger of the
+ * fish. The history + hunger are not currently used.
+
+ */
+
+//@cindex unpackFish
+static void
+unpackFish(GlobalTaskId *origPE, int *age, int *history, int *hunger)
+{
+ long buf[4];
+
+ GetArgs(buf, 4);
+
+ IF_PAR_DEBUG(fish,
+ belch("~^$$ Unpacking Fish from [%x] (age=%d)",
+ (GlobalTaskId) buf[0], (int) buf[1]));
+
+ *origPE = (GlobalTaskId) buf[0];
+ *age = (int) buf[1];
+ *history = (int) buf[2];
+ *hunger = (int) buf[3];
+}
+
+/*
+ * SendFree sends (weight, slot) pairs for GAs that we no longer need
+ * references to.
+
+//@cindex FREE
+
+ Structure of a FREE message:
+
+ +-----------------------------
+ | n | weight_1 | slot_1 | ...
+ +-----------------------------
+ */
+//@cindex sendFree
+void
+sendFree(GlobalTaskId pe, int nelem, StgPtr data)
+{
+ IF_PAR_DEBUG(free,
+ belch("~^!! Sending Free (%d GAs) to [%x]",
+ nelem/2, pe));
+
+ sendOpN(PP_FREE, pe, nelem, data);
+}
+
+/*
+ * unpackFree unpacks a FREE message into the amount of data shipped and
+ * a data block.
+ */
+//@cindex unpackFree
+static void
+unpackFree(int *nelem, StgWord *data)
+{
+ long buf[1];
+
+ GetArgs(buf, 1);
+ *nelem = (int) buf[0];
+
+ IF_PAR_DEBUG(free,
+ belch("~^!! Unpacking Free (%d GAs)",
+ *nelem/2));
+
+ GetArgs(data, *nelem);
+}
+
+/*
+ * SendSchedule sends a closure to be evaluated in response to a Fish
+ * message. The message is directed to the PE that originated the Fish
+ * (origPE), and includes the packed closure (data) along with its size
+ * (nelem).
+
+//@cindex SCHEDULE
+
+ Structure of a SCHEDULE message:
+
+ +------------------------------------
+ | PE | n | pack buffer of a graph ...
+ +------------------------------------
+ */
+//@cindex sendSchedule
/*
 * sendSchedule: reply to a FISH with work.  Ships the packed closure in
 * packBuffer (nelem elements, which must equal packBuffer->size) to the
 * fish's originating PE for evaluation there.
 */
void
sendSchedule(GlobalTaskId origPE, int nelem, rtsPackBuffer *packBuffer)
{
    IF_PAR_DEBUG(schedule,
                 belch("~^-- Sending Schedule (packet <<%d>> with %d elems) to [%x]\n",
                       packBuffer->id, nelem, origPE));
    IF_PAR_DEBUG(packet,
                 PrintPacket(packBuffer));

    ASSERT(nelem==packBuffer->size);
    /* check for magic end-of-buffer word */
    IF_DEBUG(sanity, ASSERT(*(packBuffer->buffer+nelem) == END_OF_BUFFER_MARKER));

    /* payload = whole pack buffer (header + data + debug headroom) */
    sendOpN(PP_SCHEDULE, origPE,
            nelem + PACK_BUFFER_HDR_SIZE + DEBUG_HEADROOM, (StgPtr)packBuffer);
}
+
+/*
+ * unpackSchedule unpacks a SCHEDULE message into the Global address of
+ * the closure shipped, the amount of data shipped (nelem) and the data
+ * block (data).
+ */
+
+//@cindex unpackSchedule
/*
 * unpackSchedule: inverse of sendSchedule (also used for REVAL, which
 * shares the layout).  Reads the total size word, strips the header and
 * debug headroom to get *nelem, then copies the packed graph into
 * packBuffer.
 */
static void
unpackSchedule(int *nelem, rtsPackBuffer *packBuffer)
{
    long buf[1];

    /* first, just unpack 1 word containing the total size (including header) */
    GetArgs(buf, 1);
    /* no. of elems, not counting the header of the pack buffer */
    *nelem = (int) buf[0] - PACK_BUFFER_HDR_SIZE - DEBUG_HEADROOM;

    /* automatic cast of flat pvm-data to rtsPackBuffer */
    GetArgs(packBuffer, *nelem + PACK_BUFFER_HDR_SIZE + DEBUG_HEADROOM);

    IF_PAR_DEBUG(schedule,
                 belch("~^-- Unpacking Schedule (packet <<%d>> with %d elems) on [%x]\n",
                       packBuffer->id, *nelem, mytid));

    ASSERT(*nelem==packBuffer->size);
    /* check for magic end-of-buffer word */
    IF_DEBUG(sanity, ASSERT(*(packBuffer->buffer+*nelem) == END_OF_BUFFER_MARKER));
}
+
+#ifdef DIST
+/* sendReval is almost identical to the Schedule version, so we can unpack with unpackSchedule */
/* sendReval is almost identical to the Schedule version, so we can unpack with unpackSchedule */
/* Ships a packed graph to origPE for (re)evaluation there; nelem must
   equal packBuffer->size.  DIST builds only. */
void
sendReval(GlobalTaskId origPE, int nelem, rtsPackBuffer *packBuffer)
{
    IF_PAR_DEBUG(schedule,
                 belch("~^-- Sending Reval (packet <<%d>> with %d elems) to [%x]\n",
                       packBuffer->id, nelem, origPE));
    IF_PAR_DEBUG(packet,
                 PrintPacket(packBuffer));

    ASSERT(nelem==packBuffer->size);
    /* check for magic end-of-buffer word */
    IF_DEBUG(sanity, ASSERT(*(packBuffer->buffer+nelem) == END_OF_BUFFER_MARKER));

    sendOpN(PP_REVAL, origPE,
            nelem + PACK_BUFFER_HDR_SIZE + DEBUG_HEADROOM, (StgPtr)packBuffer);
}
+
/*
 * FinishReval: called when TSO t has finished a remote evaluation
 * (DIST builds).  Looks up the result via the (revalTid, revalSlot) GA
 * recorded in the TSO, packs the result graph and sends it back to the
 * requesting PE as a RESUME.
 */
void FinishReval(StgTSO *t)
{ StgClosure *res;
    globalAddr ga;
    nat size;
    rtsPackBuffer *buffer=NULL;

    /* reconstruct the GA the requester is waiting on */
    ga.payload.gc.slot = t->revalSlot;
    ga.payload.gc.gtid = t->revalTid;
    ga.weight = 0;

    //find where the reval result is
    res = GALAlookup(&ga);
    ASSERT(res);

    IF_PAR_DEBUG(schedule,
                 printGA(&ga);
                 belch(" needs the result %08x\n",res));

    //send off the result
    buffer = PackNearbyGraph(res, END_TSO_QUEUE, &size,ga.payload.gc.gtid);
    ASSERT(buffer != (rtsPackBuffer *)NULL);
    sendResume(&ga, size, buffer);

    IF_PAR_DEBUG(schedule,
                 belch("@;~) Reval Finished"));
}
+
+#endif /* DIST */
+
+//@node Message-Processing Functions, GUM Message Processor, GUM Message Sending and Unpacking Functions, High Level Communications Routines
+//@subsection Message-Processing Functions
+
+/*
+ * Message-Processing Functions
+ *
+ * The following routines process incoming GUM messages. Often reissuing
+ * messages in response.
+ *
+ * processFish unpacks a fish message, reissuing it if it's our own,
+ * sending work if we have it or sending it onwards otherwise.
+ */
+
+/*
+ * processFetches constructs and sends resume messages for every
+ * BlockedFetch which is ready to be awakened.
+ * awaken_blocked_queue (in Schedule.c) is responsible for moving
+ * BlockedFetches from a blocking queue to the PendingFetches queue.
+ */
+void GetRoots(void);
+extern StgBlockedFetch *PendingFetches;
+
+nat
+pending_fetches_len(void)
+{
+ StgBlockedFetch *bf;
+ nat n;
+
+ for (n=0, bf=PendingFetches; bf != END_BF_QUEUE; n++, bf = (StgBlockedFetch *)(bf->link)) {
+ ASSERT(get_itbl(bf)->type==BLOCKED_FETCH);
+ }
+ return n;
+}
+
+//@cindex processFetches
/*
 * processFetches: for every BlockedFetch on the PendingFetches queue,
 * either forward the fetch (target is itself a FETCH_ME), re-block it
 * (target is a black hole), or pack the local graph and send a RESUME.
 * The queue is emptied on exit.
 */
void
processFetches(void) {
    StgBlockedFetch *bf, *next;
    StgClosure *closure;
    StgInfoTable *ip;
    globalAddr rga;
    static rtsPackBuffer *packBuffer;

    IF_PAR_DEBUG(verbose,
                 belch("____ processFetches: %d pending fetches (root @ %p)",
                       pending_fetches_len(), PendingFetches));

    for (bf = PendingFetches;
         bf != END_BF_QUEUE;
         bf=next) {
        /* the PendingFetches list contains only BLOCKED_FETCH closures */
        ASSERT(get_itbl(bf)->type==BLOCKED_FETCH);
        /* store link (we might overwrite it via blockFetch later on */
        next = (StgBlockedFetch *)(bf->link);

        /*
         * Find the target at the end of the indirection chain, and
         * process it in much the same fashion as the original target
         * of the fetch.  Though we hope to find graph here, we could
         * find a black hole (of any flavor) or even a FetchMe.
         */
        closure = bf->node;
        /*
          We evacuate BQs and update the node fields where necessary in GC.c
          So, if we find an EVACUATED closure, something has gone Very Wrong
          (and therefore we let the RTS crash most ungracefully).
        */
        ASSERT(get_itbl(closure)->type != EVACUATED);
        //  closure = ((StgEvacuated *)closure)->evacuee;

        closure = UNWIND_IND(closure);
        //while ((ind = IS_INDIRECTION(closure)) != NULL) { closure = ind; }

        ip = get_itbl(closure);
        if (ip->type == FETCH_ME) {
            /* Forward the Fetch to someone else; requester info comes from
               the blocked fetch's own GA */
            rga.payload.gc.gtid = bf->ga.payload.gc.gtid;
            rga.payload.gc.slot = bf->ga.payload.gc.slot;
            rga.weight = bf->ga.weight;

            sendFetch(((StgFetchMe *)closure)->ga, &rga, 0 /* load */);

            // Global statistics: count no. of fetches
            if (RtsFlags.ParFlags.ParStats.Global &&
                RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
                globalParStats.tot_fetch_mess++;
            }

            IF_PAR_DEBUG(fetch,
                         belch("__-> processFetches: Forwarding fetch from %lx to %lx",
                               mytid, rga.payload.gc.gtid));

        } else if (IS_BLACK_HOLE(closure)) {
            IF_PAR_DEBUG(verbose,
                         belch("__++ processFetches: trying to send a BLACK_HOLE => doing a blockFetch on closure %p (%s)",
                               closure, info_type(closure)));
            bf->node = closure;
            blockFetch(bf, closure);
        } else {
            /* We now have some local graph to send back */
            nat size;

            packBuffer = gumPackBuffer;
            IF_PAR_DEBUG(verbose,
                         belch("__*> processFetches: PackNearbyGraph of closure %p (%s)",
                               closure, info_type(closure)));

            if ((packBuffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size, bf->ga.payload.gc.gtid)) == NULL) {
                // Put current BF back on list
                bf->link = (StgBlockingQueueElement *)PendingFetches;
                PendingFetches = (StgBlockedFetch *)bf;
                // ToDo: check that nothing more has to be done to prepare for GC!
                /* NB: barf aborts; the GC/retry code below is currently dead */
                barf("processFetches: out of heap while packing graph; ToDo: call GC here");
                GarbageCollect(GetRoots, rtsFalse);
                bf = PendingFetches;
                PendingFetches = (StgBlockedFetch *)(bf->link);
                closure = bf->node;
                packBuffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size, bf->ga.payload.gc.gtid);
                ASSERT(packBuffer != (rtsPackBuffer *)NULL);
            }
            rga.payload.gc.gtid = bf->ga.payload.gc.gtid;
            rga.payload.gc.slot = bf->ga.payload.gc.slot;
            rga.weight = bf->ga.weight;

            sendResume(&rga, size, packBuffer);

            // Global statistics: count no. of fetches
            if (RtsFlags.ParFlags.ParStats.Global &&
                RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
                globalParStats.tot_resume_mess++;
            }
        }
    }
    PendingFetches = END_BF_QUEUE;
}
+
+#if 0
+/*
+ Alternatively to sending fetch messages directly from the FETCH_ME_entry
+ code we could just store the data about the remote data in a global
+ variable and send the fetch request from the main scheduling loop (similar
+ to processFetches above). This would save an expensive STGCALL in the entry
+ code because we have to go back to the scheduler anyway.
+*/
+//@cindex processFetches
+void
+processTheRealFetches(void) {
+ StgBlockedFetch *bf;
+ StgClosure *closure, *next;
+
+ IF_PAR_DEBUG(verbose,
+ belch("__ processTheRealFetches: ");
+ printGA(&theGlobalFromGA);
+ printGA(&theGlobalToGA));
+
+ ASSERT(theGlobalFromGA.payload.gc.gtid != 0 &&
+ theGlobalToGA.payload.gc.gtid != 0);
+
+ /* the old version did this in the FETCH_ME entry code */
+ sendFetch(&theGlobalFromGA, &theGlobalToGA, 0/*load*/);
+
+}
+#endif
+
+
+/*
+ Way of dealing with unwanted fish.
+ Used during startup/shutdown, or from unknown PEs
+*/
/*
  Way of dealing with unwanted fish.
  Used during startup/shutdown, or from unknown PEs
*/
void
bounceFish(void) {
    GlobalTaskId origPE;
    int age, history, hunger;

    /* IF_PAR_DEBUG(verbose, */
    /* NB: this belch is unconditional -- the debug guard is commented out */
    belch(".... [%x] Bouncing unwanted FISH",mytid);

    unpackFish(&origPE, &age, &history, &hunger);

    if (origPE == mytid) {
        //fishing = rtsFalse;                   // fish has come home
        outstandingFishes--;
        last_fish_arrived_at = CURRENT_TIME;  // remember time (see schedule fct)
        return;                               // that's all
    }

    /* otherwise, send it home to die */
    sendFish(origPE, origPE, (age + 1), NEW_FISH_HISTORY, NEW_FISH_HUNGER);
    // Global statistics: count no. of fetches
    if (RtsFlags.ParFlags.ParStats.Global &&
        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
        globalParStats.tot_fish_mess++;
    }
}
+
+/*
+ * processFish unpacks a fish message, reissuing it if it's our own,
+ * sending work if we have it or sending it onwards otherwise.
+ */
+//@cindex processFish
/*
 * processFish: handle an incoming FISH.  If it is our own fish come
 * home, just record that.  Otherwise try to find an exportable spark:
 * on success pack it and reply with a SCHEDULE; on failure pass the
 * fish on to another PE (young fish) or send it home to die (old fish).
 */
static void
processFish(void)
{
    GlobalTaskId origPE;
    int age, history, hunger;
    rtsSpark spark;
    static rtsPackBuffer *packBuffer;

    unpackFish(&origPE, &age, &history, &hunger);

    if (origPE == mytid) {
        //fishing = rtsFalse;                   // fish has come home
        outstandingFishes--;
        last_fish_arrived_at = CURRENT_TIME;  // remember time (see schedule fct)
        return;                               // that's all
    }

    ASSERT(origPE != mytid);
    IF_PAR_DEBUG(fish,
                 belch("$$__ processing fish; %d sparks available",
                       spark_queue_len(&(MainRegTable.rSparks))));
    while ((spark = findSpark(rtsTrue/*for_export*/)) != NULL) {
        nat size;
        // StgClosure *graph;

        packBuffer = gumPackBuffer;
        ASSERT(closure_SHOULD_SPARK((StgClosure *)spark));
        if ((packBuffer = PackNearbyGraph(spark, END_TSO_QUEUE, &size,origPE)) == NULL) {
            IF_PAR_DEBUG(fish,
                         belch("$$ GC while trying to satisfy FISH via PackNearbyGraph of node %p",
                               (StgClosure *)spark));
            /* NB: barf aborts; the GC/retry path is currently dead */
            barf("processFish: out of heap while packing graph; ToDo: call GC here");
            GarbageCollect(GetRoots, rtsFalse);
            /* Now go back and try again */
        } else {
            IF_PAR_DEBUG(verbose,
                         if (RtsFlags.ParFlags.ParStats.Sparks)
                         belch("==== STEALING spark %x; sending to %x", spark, origPE));

            IF_PAR_DEBUG(fish,
                         belch("$$-- Replying to FISH from %x by sending graph @ %p (%s)",
                               origPE,
                               (StgClosure *)spark, info_type((StgClosure *)spark)));
            sendSchedule(origPE, size, packBuffer);
            disposeSpark(spark);
            // Global statistics: count no. of fetches
            if (RtsFlags.ParFlags.ParStats.Global &&
                RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
                globalParStats.tot_schedule_mess++;
            }

            break;  /* one spark per fish */
        }
    }
    if (spark == (rtsSpark)NULL) {
        IF_PAR_DEBUG(fish,
                     belch("$$^^ No sparks available for FISH from %x",
                           origPE));
        /* We have no sparks to give */
        if (age < FISH_LIFE_EXPECTANCY) {
            /* and the fish is atill young, send it to another PE to look for work */
            sendFish(choosePE(), origPE,
                     (age + 1), NEW_FISH_HISTORY, NEW_FISH_HUNGER);

            // Global statistics: count no. of fetches
            if (RtsFlags.ParFlags.ParStats.Global &&
                RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
                globalParStats.tot_fish_mess++;
            }
        } else { /* otherwise, send it home to die */
            sendFish(origPE, origPE, (age + 1), NEW_FISH_HISTORY, NEW_FISH_HUNGER);
            // Global statistics: count no. of fetches
            if (RtsFlags.ParFlags.ParStats.Global &&
                RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
                globalParStats.tot_fish_mess++;
            }
        }
    }
}  /* processFish */
+
+/*
+ * processFetch either returns the requested data (if available)
+ * or blocks the remote blocking queue on a black hole (if not).
+ */
+
+//@cindex processFetch
/*
 * processFetch: handle an incoming FETCH for a closure we own.
 * Four cases: forward (target is itself a FETCH_ME), a fetch of our
 * own that came back around, block on a black hole, or pack the local
 * graph and reply with a RESUME.
 */
static void
processFetch(void)
{
    globalAddr ga, rga;
    int load;
    StgClosure *closure;
    StgInfoTable *ip;

    unpackFetch(&ga, &rga, &load);
    IF_PAR_DEBUG(fetch,
                 belch("%%%%__ Rcvd Fetch for ((%x, %d, 0)), Resume ((%x, %d, %x)) (load %d) from %x",
                       ga.payload.gc.gtid, ga.payload.gc.slot,
                       rga.payload.gc.gtid, rga.payload.gc.slot, rga.weight, load,
                       rga.payload.gc.gtid));

    closure = GALAlookup(&ga);
    ASSERT(closure != (StgClosure *)NULL);
    ip = get_itbl(closure);
    if (ip->type == FETCH_ME) {
        /* Forward the Fetch to someone else */
        sendFetch(((StgFetchMe *)closure)->ga, &rga, load);

        // Global statistics: count no. of fetches
        if (RtsFlags.ParFlags.ParStats.Global &&
            RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
            globalParStats.tot_fetch_mess++;
        }
    } else if (rga.payload.gc.gtid == mytid) {
        /* Our own FETCH forwarded back around to us */
        StgFetchMeBlockingQueue *fmbq = (StgFetchMeBlockingQueue *)GALAlookup(&rga);

        IF_PAR_DEBUG(fetch,
                     belch("%%%%== Fetch returned to sending PE; closure=%p (%s); receiver=%p (%s)",
                           closure, info_type(closure), fmbq, info_type((StgClosure*)fmbq)));
        /* We may have already discovered that the fetch target is our own. */
        if ((StgClosure *)fmbq != closure)
            CommonUp((StgClosure *)fmbq, closure);
        (void) addWeight(&rga);  /* return the shipped weight */
    } else if (IS_BLACK_HOLE(closure)) {
        /* This includes RBH's and FMBQ's */
        StgBlockedFetch *bf;

        /* Can we assert something on the remote GA? */
        ASSERT(GALAlookup(&rga) == NULL);

        /* If we're hitting a BH or RBH or FMBQ we have to put a BLOCKED_FETCH
           closure into the BQ in order to denote that when updating this node
           the result should be sent to the originator of this fetch message. */
        bf = (StgBlockedFetch *)createBlockedFetch(ga, rga);
        IF_PAR_DEBUG(fetch,
                     belch("%%++ Blocking Fetch ((%x, %d, %x)) on %p (%s)",
                           rga.payload.gc.gtid, rga.payload.gc.slot, rga.weight,
                           closure, info_type(closure)));
        blockFetch(bf, closure);
    } else {
        /* The target of the FetchMe is some local graph */
        nat size;
        // StgClosure *graph;
        rtsPackBuffer *buffer = (rtsPackBuffer *)NULL;

        if ((buffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size, rga.payload.gc.gtid)) == NULL) {
            /* NB: barf aborts; the GC/retry path is currently dead */
            barf("processFetch: out of heap while packing graph; ToDo: call GC here");
            GarbageCollect(GetRoots, rtsFalse);
            closure = GALAlookup(&ga);
            buffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size, rga.payload.gc.gtid);
            ASSERT(buffer != (rtsPackBuffer *)NULL);
        }
        sendResume(&rga, size, buffer);

        // Global statistics: count no. of fetches
        if (RtsFlags.ParFlags.ParStats.Global &&
            RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
            globalParStats.tot_resume_mess++;
        }
    }
}
+
+/*
+ The list of pending fetches must be a root-list for GC.
+ This routine is called from GC.c (same as marking GAs etc).
+*/
+void
+markPendingFetches(rtsBool major_gc) {
+
+ /* No need to traverse the list; this is done via the scavenge code
+ for a BLOCKED_FETCH closure, which evacuates the link field */
+
+ if (PendingFetches != END_BF_QUEUE ) {
+ IF_PAR_DEBUG(tables,
+ fprintf(stderr, "@@@@ PendingFetches is root; evaced from %p to",
+ PendingFetches));
+
+ PendingFetches = MarkRoot((StgClosure*)PendingFetches);
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, " %p\n", PendingFetches));
+
+ } else {
+ IF_PAR_DEBUG(tables,
+ fprintf(stderr, "@@@@ PendingFetches is empty; no need to mark it\n"));
+ }
+}
+
+/*
+ * processFree unpacks a FREE message and adds the weights to our GAs.
+ */
+//@cindex processFree
/*
 * processFree: handle an incoming FREE message -- a list of
 * (weight, slot) pairs for GAs of ours that the sender has dropped.
 * The returned weight of each pair is added back to our GA table.
 */
static void
processFree(void)
{
    int nelem;
    static StgWord *buffer;
    int i;
    globalAddr ga;

    /* reuse the shared pack buffer as scratch space */
    buffer = (StgWord *)gumPackBuffer;
    unpackFree(&nelem, buffer);
    IF_PAR_DEBUG(free,
                 belch("!!__ Rcvd Free (%d GAs)", nelem / 2));

    ga.payload.gc.gtid = mytid;  /* all freed GAs are ours */
    for (i = 0; i < nelem;) {
        ga.weight = (rtsWeight) buffer[i++];
        ga.payload.gc.slot = (int) buffer[i++];
        IF_PAR_DEBUG(free,
                     fprintf(stderr, "!!-- Processing free ");
                     printGA(&ga);
                     fputc('\n', stderr);
                     );
        (void) addWeight(&ga);
    }
}
+
+/*
+ * processResume unpacks a RESUME message into the graph, filling in
+ * the LA -> GA, and GA -> LA tables. Threads blocked on the original
+ * FetchMe (now a blocking queue) are awakened, and the blocking queue
+ * is converted into an indirection. Finally it sends an ACK in response
+ * which contains any newly allocated GAs.
+ */
+
+//@cindex processResume
/*
 * processResume: handle an incoming RESUME carrying the graph we asked
 * for with an earlier FETCH.  Unpacks the graph (building LA<->GA table
 * entries), commons the old FETCH_ME_BQ/RBH up with the new graph so
 * blocked threads can be awakened, and replies with an ACK listing any
 * newly allocated GAs.
 */
static void
processResume(GlobalTaskId sender)
{
    int nelem;
    nat nGAs;
    static rtsPackBuffer *packBuffer;
    StgClosure *newGraph, *old;
    globalAddr lga;
    globalAddr *gagamap;

    packBuffer = (rtsPackBuffer *)gumPackBuffer;
    unpackResume(&lga, &nelem, packBuffer);

    IF_PAR_DEBUG(fetch,
                 fprintf(stderr, "[]__ Rcvd Resume for ");
                 printGA(&lga);
                 fputc('\n', stderr));
    IF_PAR_DEBUG(packet,
                 PrintPacket((rtsPackBuffer *)packBuffer));

    /*
     * We always unpack the incoming graph, even if we've received the
     * requested node in some other data packet (and already awakened
     * the blocking queue).
     if (SAVE_Hp + packBuffer[0] >= SAVE_HpLim) {
       ReallyPerformThreadGC(packBuffer[0], rtsFalse);
       SAVE_Hp -= packBuffer[0];
     }
    */

    // ToDo: Check for GC here !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    /* Do this *after* GC; we don't want to release the object early! */

    if (lga.weight > 0)
        (void) addWeight(&lga);

    old = GALAlookup(&lga);

    /* ToDo: The closure that requested this graph must be one of these two?*/
    ASSERT(get_itbl(old)->type == FETCH_ME_BQ ||
           get_itbl(old)->type == RBH);

    if (RtsFlags.ParFlags.ParStats.Full) {
        StgBlockingQueueElement *bqe, *last_bqe;

        IF_PAR_DEBUG(fetch,
                     belch("[]-- Resume is REPLY to closure %lx", old));

        /* Write REPLY events to the log file, indicating that the remote
           data has arrived
           NB: we emit a REPLY only for the *last* elem in the queue; this is
           the one that triggered the fetch message; all other entries
           have just added themselves to the queue, waiting for the data
           they know that has been requested (see entry code for FETCH_ME_BQ)
        */
        if ((get_itbl(old)->type == FETCH_ME_BQ ||
             get_itbl(old)->type == RBH)) {
            /* walk to the last TSO/BLOCKED_FETCH entry in the queue */
            for (bqe = ((StgFetchMeBlockingQueue *)old)->blocking_queue,
                 last_bqe = END_BQ_QUEUE;
                 get_itbl(bqe)->type==TSO ||
                 get_itbl(bqe)->type==BLOCKED_FETCH;
                 last_bqe = bqe, bqe = bqe->link) { /* nothing */ }

            ASSERT(last_bqe==END_BQ_QUEUE ||
                   get_itbl((StgClosure *)last_bqe)->type == TSO);

            /* last_bqe now points to the TSO that triggered the FETCH */
            if (get_itbl((StgClosure *)last_bqe)->type == TSO)
                DumpRawGranEvent(CURRENT_PROC, taskIDtoPE(sender),
                                 GR_REPLY, ((StgTSO *)last_bqe), ((StgTSO *)last_bqe)->block_info.closure,
                                 0, spark_queue_len(&(MainRegTable.rSparks)));
        }
    }

    newGraph = UnpackGraph(packBuffer, &gagamap, &nGAs);
    ASSERT(newGraph != NULL);

    /*
     * Sometimes, unpacking will common up the resumee with the
     * incoming graph, but if it hasn't, we'd better do so now.
     */

    if (get_itbl(old)->type == FETCH_ME_BQ)
        CommonUp(old, newGraph);

    IF_PAR_DEBUG(fetch,
                 belch("[]-- Ready to resume unpacked graph at %p (%s)",
                       newGraph, info_type(newGraph)));

    IF_PAR_DEBUG(tables,
                 DebugPrintGAGAMap(gagamap, nGAs));

    sendAck(sender, nGAs, gagamap);
}
+
+/*
+ * processSchedule unpacks a SCHEDULE message into the graph, filling
+ * in the LA -> GA, and GA -> LA tables. The root of the graph is added to
+ * the local spark queue. Finally it sends an ACK in response
+ * which contains any newly allocated GAs.
+ */
+//@cindex processSchedule
/*
 * processSchedule: handle an incoming SCHEDULE -- work sent in reply to
 * one of our fishes.  Unpacks the graph, adds its root to the local
 * spark pool, replies with an ACK of any new GAs, and retires the
 * outstanding fish.
 */
static void
processSchedule(GlobalTaskId sender)
{
    nat nelem, nGAs;
    rtsBool success;
    static rtsPackBuffer *packBuffer;
    StgClosure *newGraph;
    globalAddr *gagamap;

    packBuffer = gumPackBuffer; /* HWL */
    unpackSchedule(&nelem, packBuffer);

    IF_PAR_DEBUG(schedule,
                 belch("--__ Rcvd Schedule (%d elems)", nelem));
    IF_PAR_DEBUG(packet,
                 PrintPacket(packBuffer));

    /*
     * For now, the graph is a closure to be sparked as an advisory
     * spark, but in future it may be a complete spark with
     * required/advisory status, priority etc.
     */

    /*
    space_required = packBuffer[0];
    if (SAVE_Hp + space_required >= SAVE_HpLim) {
      ReallyPerformThreadGC(space_required, rtsFalse);
      SAVE_Hp -= space_required;
    }
    */
    // ToDo: check whether GC is necessary !!!!!!!!!!!!!!!!!!!!!
    newGraph = UnpackGraph(packBuffer, &gagamap, &nGAs);
    ASSERT(newGraph != NULL);
    success = add_to_spark_queue(newGraph, &(MainRegTable.rSparks));

    if (RtsFlags.ParFlags.ParStats.Full &&
        RtsFlags.ParFlags.ParStats.Sparks &&
        success)
        DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
                         GR_STOLEN, ((StgTSO *)NULL), newGraph,
                         0, 0 /* spark_queue_len(ADVISORY_POOL) */);

    IF_PAR_DEBUG(schedule,
                 if (success)
                 belch("--^^  added spark to unpacked graph %p (%s); %d sparks available on [%x] (%s)",
                       newGraph, info_type(newGraph), spark_queue_len(&(MainRegTable.rSparks)), mytid);
                 else
                 belch("--^^  received non-sparkable closure %p (%s); nothing added to spark pool; %d sparks available on [%x]",
                       newGraph, info_type(newGraph), spark_queue_len(&(MainRegTable.rSparks)), mytid));
    IF_PAR_DEBUG(packet,
                 belch("*<    Unpacked graph with root at %p (%s):",
                       newGraph, info_type(newGraph));
                 PrintGraph(newGraph, 0));

    IF_PAR_DEBUG(tables,
                 DebugPrintGAGAMap(gagamap, nGAs));

    sendAck(sender, nGAs, gagamap);

    /* a SCHEDULE is the successful return of one of our fishes */
    //fishing = rtsFalse;
    ASSERT(outstandingFishes>0);
    outstandingFishes--;
}
+
+/*
+ * processAck unpacks an ACK, and uses the GAGA map to convert RBH's
+ * (which represent shared thunks that have been shipped) into fetch-mes
+ * to remote GAs.
+ */
//@cindex processAck
static void
processAck(void)
{
  nat nGAs;
  globalAddr *gaga;
  globalAddr gagamap[256]; // ToDo: elim magic constant!! MAX_GAS * 2];??

  /* Unpack the (oldGA, newGA) pairs sent back by the receiver of our
     SCHEDULE packet into gagamap. */
  unpackAck(&nGAs, gagamap);

  IF_PAR_DEBUG(tables,
	       belch(",,,, Rcvd Ack (%d pairs)", nGAs);
	       DebugPrintGAGAMap(gagamap, nGAs));

  IF_DEBUG(sanity,
	   checkGAGAMap(gagamap, nGAs));

  /*
   * For each (oldGA, newGA) pair, set the GA of the corresponding
   * thunk to the newGA, convert the thunk to a FetchMe, and return
   * the weight from the oldGA.
   */
  for (gaga = gagamap; gaga < gagamap + nGAs * 2; gaga += 2) {
    StgClosure *old_closure = GALAlookup(gaga);
    StgClosure *new_closure = GALAlookup(gaga + 1);

    ASSERT(old_closure != NULL);
    if (new_closure == NULL) {
      /* We don't have this closure, so we make a fetchme for it */
      globalAddr *ga = setRemoteGA(old_closure, gaga + 1, rtsTrue);

      /* convertToFetchMe should be done unconditionally here.
	 Currently, we assign GAs to CONSTRs, too, (a bit of a hack),
	 so we have to check whether it is an RBH before converting

	 ASSERT(get_itbl(old_closure)==RBH);
      */
      if (get_itbl(old_closure)->type==RBH)
	convertToFetchMe((StgRBH *)old_closure, ga);
    } else {
      /*
       * Oops...we've got this one already; update the RBH to
       * point to the object we already know about, whatever it
       * happens to be.
       */
      CommonUp(old_closure, new_closure);

      /*
       * Increase the weight of the object by the amount just
       * received in the second part of the ACK pair.
       */
      (void) addWeight(gaga + 1);
    }
    /* either way, return the weight carried by the first GA of the pair */
    (void) addWeight(gaga);
  }

  /* check the sanity of the LAGA and GALA tables after mincing them */
  IF_DEBUG(sanity, checkLAGAtable(rtsFalse));
}
+
+#ifdef DIST
+
void
bounceReval(void) {
  /* Not yet implemented: a REVAL that raced ahead of the new-PE
     notification should be NACKed back to its sender (cf. bounceFish). */
  barf("Task %x: TODO: should send NACK in response to REVAL",mytid);
}
+
static void
processReval(GlobalTaskId sender) //similar to schedule...
{ nat nelem, space_required, nGAs;   // space_required: only referenced by the
                                     // commented-out GC check below
  static rtsPackBuffer *packBuffer;
  StgClosure *newGraph;
  globalAddr *gagamap;
  StgTSO* tso;
  globalAddr *ga;                    // NOTE(review): unused -- candidate for removal

  packBuffer = gumPackBuffer; /* HWL */
  unpackSchedule(&nelem, packBuffer); /* okay, since the structure is the same */

  IF_PAR_DEBUG(packet,
	       belch("@;~) [%x] Rcvd Reval (%d elems)", mytid, nelem);
	       PrintPacket(packBuffer));

  /*
  space_required = packBuffer[0];
  if (SAVE_Hp + space_required >= SAVE_HpLim) {
    ReallyPerformThreadGC(space_required, rtsFalse);
    SAVE_Hp -= space_required;
  }
  */

  // ToDo: check whether GC is necessary !!!!!!!!!!!!!!!!!!!!!
  newGraph = UnpackGraph(packBuffer, &gagamap, &nGAs);
  ASSERT(newGraph != NULL);

  IF_PAR_DEBUG(packet,
	       belch("@;~) Unpacked graph with root at %p (%s):",
		     newGraph, info_type(newGraph));
	       PrintGraph(newGraph, 0));

  IF_PAR_DEBUG(tables,
	       DebugPrintGAGAMap(gagamap, nGAs));

  IF_PAR_DEBUG(tables,
	       printLAGAtable();
	       DebugPrintGAGAMap(gagamap, nGAs));

  /* The first GAGA pair describes the root itself; only the rest is
     acknowledged back to the sender. */
  //We don't send an Ack to the head!!!!
  ASSERT(nGAs>0);
  sendAck(sender, nGAs-1, gagamap+2);

  IF_PAR_DEBUG(verbose,
	       belch("@;~) About to create Reval thread on behalf of %x",
		     sender));

  /* Evaluate the shipped graph in a fresh thread with Reval priority,
     remembering who asked for the revaluation. */
  tso=createGenThread(RtsFlags.GcFlags.initialStkSize,newGraph);
  tso->priority=RevalPriority;
  tso->revalSlot=gagamap->payload.gc.slot;//record who sent the reval
  tso->revalTid =gagamap->payload.gc.gtid;
  scheduleThread(tso);
  context_switch = 1; // switch at the earliest opportunity
}
+#endif
+
+
+//@node GUM Message Processor, Miscellaneous Functions, Message-Processing Functions, High Level Communications Routines
+//@subsection GUM Message Processor
+
+/*
+ * GUM Message Processor
+
+ * processMessages processes any messages that have arrived, calling
+ * appropriate routines depending on the message tag
+ * (opcode). N.B. Unless profiling it assumes that there {\em ARE} messages
+ * present and performs a blocking receive! During profiling it
+ * busy-waits in order to record idle time.
+ */
+
//@cindex processMessages
rtsBool
processMessages(void)
{
  rtsPacket packet;
  OpCode opcode;
  GlobalTaskId task;
  rtsBool receivedFinish = rtsFalse;

  do {
    packet = GetPacket();  /* Get next message; block until one available */
    getOpcodeAndSender(packet, &opcode, &task);

    if (task==SysManTask) {
      /* control messages from the system manager */
      switch (opcode) {
      case PP_PETIDS:
	processPEtids();
	break;

      case PP_FINISH:
	IF_PAR_DEBUG(verbose,
		     belch("==== received FINISH [%p]", mytid));
	/* this boolean value is returned and propagated to the main
	   scheduling loop, thus shutting-down this PE */
	receivedFinish = rtsTrue;
	break;

      default:
	barf("Task %x: received unknown opcode %x from SysMan",mytid, opcode);
      }
    } else if (taskIDtoPE(task)==0) {
      /* When a new PE joins then potentially FISH & REVAL message may
	 reach PES before they are notified of the new PEs existence. The
	 only solution is to bounce/fail these messages back to the sender.
	 But we will worry about it once we start seeing these race
	 conditions! */
      switch (opcode) {
      case PP_FISH:
	bounceFish();
	break;
#ifdef DIST
      case PP_REVAL:
	bounceReval();
	break;
#endif
      case PP_PETIDS:
	belch("Task %x: Ignoring PVM session opened by another SysMan %x",mytid,task);
	break;

      case PP_FINISH:
	break;

      default:
	belch("Task %x: Ignoring opcode %x from unknown PE %x",mytid, opcode, task);
      }
    } else
      /* ordinary PE-to-PE messages: dispatch on opcode */
      switch (opcode) {
      case PP_FETCH:
	processFetch();
	// Global statistics: count no. of fetches
	if (RtsFlags.ParFlags.ParStats.Global &&
	    RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
	  globalParStats.rec_fetch_mess++;
	}
	break;

      case PP_RESUME:
	processResume(task);
	// Global statistics: count no. of fetches
	if (RtsFlags.ParFlags.ParStats.Global &&
	    RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
	  globalParStats.rec_resume_mess++;
	}
	break;

      case PP_ACK:
	processAck();
	break;

      case PP_FISH:
	processFish();
	// Global statistics: count no. of fetches
	if (RtsFlags.ParFlags.ParStats.Global &&
	    RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
	  globalParStats.rec_fish_mess++;
	}
	break;

      case PP_FREE:
	processFree();
	break;

      case PP_SCHEDULE:
	processSchedule(task);
	// Global statistics: count no. of fetches
	if (RtsFlags.ParFlags.ParStats.Global &&
	    RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
	  globalParStats.rec_schedule_mess++;
	}
	break;

#ifdef DIST
      case PP_REVAL:
	processReval(task);
	// Global statistics: count no. of fetches
	if (RtsFlags.ParFlags.ParStats.Global &&
	    RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
	  globalParStats.rec_reval_mess++;
	}
	break;
#endif

      default:
	/* Anything we're not prepared to deal with. */
	barf("Task %x: Unexpected opcode %x from %x",
	     mytid, opcode, task);
      } /* switch */

  } while (PacketsWaiting());	/* While there are messages: process them */
  return receivedFinish;
} /* processMessages */
+
+//@node Miscellaneous Functions, Index, GUM Message Processor, High Level Communications Routines
+//@subsection Miscellaneous Functions
+
+/*
+ * blockFetch blocks a BlockedFetch node on some kind of black hole.
+ */
//@cindex blockFetch
void
blockFetch(StgBlockedFetch *bf, StgClosure *bh) {
  /* Attach BlockedFetch bf to the blocking queue of black-hole-like
     closure bh, upgrading a plain BLACKHOLE to a BLACKHOLE_BQ first. */
  bf->node = bh;
  switch (get_itbl(bh)->type) {
  case BLACKHOLE:
    /* no queue yet: bf becomes the (singleton) queue */
    bf->link = END_BQ_QUEUE;
    //((StgBlockingQueue *)bh)->header.info = &stg_BLACKHOLE_BQ_info;
    SET_INFO(bh, &stg_BLACKHOLE_BQ_info);  // turn closure into a blocking queue
    ((StgBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)bf;

    // put bh on the mutables list
    recordMutable((StgMutClosure *)bh);
    break;

  case BLACKHOLE_BQ:
    /* enqueue bf on blocking queue of closure bh */
    bf->link = ((StgBlockingQueue *)bh)->blocking_queue;
    ((StgBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)bf;

    // put bh on the mutables list; ToDo: check
    recordMutable((StgMutClosure *)bh);
    break;

  case FETCH_ME_BQ:
    /* enqueue bf on blocking queue of closure bh */
    bf->link = ((StgFetchMeBlockingQueue *)bh)->blocking_queue;
    ((StgFetchMeBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)bf;

    // put bh on the mutables list; ToDo: check
    recordMutable((StgMutClosure *)bh);
    break;

  case RBH:
    /* enqueue bf on blocking queue of closure bh */
    bf->link = ((StgRBH *)bh)->blocking_queue;
    ((StgRBH *)bh)->blocking_queue = (StgBlockingQueueElement *)bf;

    // put bh on the mutables list; ToDo: check
    recordMutable((StgMutClosure *)bh);
    break;

  default:
    barf("blockFetch: thought %p was a black hole (IP %#lx, %s)",
	 (StgClosure *)bh, get_itbl((StgClosure *)bh),
	 info_type((StgClosure *)bh));
  }
  IF_PAR_DEBUG(bq,
	       belch("##++ blockFetch: after block the BQ of %p (%s) is:",
		     bh, info_type(bh));
	       print_bq(bh));
}
+
+
+/*
+ @blockThread@ is called from the main scheduler whenever tso returns with
+ a ThreadBlocked return code; tso has already been added to a blocking
+ queue (that's done in the entry code of the closure, because it is a
+ cheap operation we have to do in any case); the main purpose of this
+ routine is to send a Fetch message in case we are blocking on a FETCHME(_BQ)
+ closure, which is indicated by the tso.why_blocked field;
+ we also write an entry into the log file if we are generating one
+
+ Should update exectime etc in the entry code already; but we don't have
+ something like ``system time'' in the log file anyway, so this should
+ even out the inaccuracies.
+*/
+
//@cindex blockThread
void
blockThread(StgTSO *tso)
{
  globalAddr *remote_ga=NULL;
  globalAddr *local_ga;
  globalAddr fmbq_ga;

  // ASSERT(we are on some blocking queue)
  ASSERT(tso->block_info.closure != (StgClosure *)NULL);

  /*
    We have to check why this thread has been blocked.
  */
  switch (tso->why_blocked) {
  case BlockedOnGA:
    /* the closure must be a FETCH_ME_BQ; tso came in here via
       FETCH_ME entry code */
    ASSERT(get_itbl(tso->block_info.closure)->type==FETCH_ME_BQ);

    /* HACK: the link field is used to hold the GA between FETCH_ME_entry
       and this point; if something (eg. GC) happens in between the whole
       thing will blow up
       The problem is that the ga field of the FETCH_ME has been overwritten
       with the head of the blocking queue (which is tso).
    */
    ASSERT(looks_like_ga(&theGlobalFromGA));
    // ASSERT(tso->link!=END_TSO_QUEUE && tso->link!=NULL);
    remote_ga = &theGlobalFromGA; //tso->link;
    tso->link = (StgTSO*)END_BQ_QUEUE;
    /* it was tso which turned node from FETCH_ME into FETCH_ME_BQ =>
       we have to send a Fetch message here! */
    if (RtsFlags.ParFlags.ParStats.Full) {
      /* Note that CURRENT_TIME may perform an unsafe call */
      tso->par.exectime += CURRENT_TIME - tso->par.blockedat;
      tso->par.fetchcount++;
      tso->par.blockedat = CURRENT_TIME;
      /* we are about to send off a FETCH message, so dump a FETCH event */
      DumpRawGranEvent(CURRENT_PROC,
		       taskIDtoPE(remote_ga->payload.gc.gtid),
		       GR_FETCH, tso, tso->block_info.closure, 0, 0);
    }
    /* Phil T. claims that this was a workaround for a hard-to-find
     * bug, hence I'm leaving it out for now --SDM
     */
    /* Assign a brand-new global address to the newly created FMBQ */
    local_ga = makeGlobal(tso->block_info.closure, rtsFalse);
    splitWeight(&fmbq_ga, local_ga);
    ASSERT(fmbq_ga.weight == 1U << (BITS_IN(unsigned) - 1));

    sendFetch(remote_ga, &fmbq_ga, 0/*load*/);

    // Global statistics: count no. of fetches
    if (RtsFlags.ParFlags.ParStats.Global &&
	RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
      globalParStats.tot_fetch_mess++;
    }

    IF_DEBUG(sanity,
	     theGlobalFromGA.payload.gc.gtid = (GlobalTaskId)0);
    break;

  case BlockedOnGA_NoSend:
    /* the closure must be a FETCH_ME_BQ; tso came in here via
       FETCH_ME_BQ entry code */
    ASSERT(get_itbl(tso->block_info.closure)->type==FETCH_ME_BQ);

    /* Fetch message has been sent already */
    if (RtsFlags.ParFlags.ParStats.Full) {
      /* Note that CURRENT_TIME may perform an unsafe call */
      tso->par.exectime += CURRENT_TIME - tso->par.blockedat;
      tso->par.blockcount++;
      tso->par.blockedat = CURRENT_TIME;
      /* dump a block event, because fetch has been sent already */
      DumpRawGranEvent(CURRENT_PROC, thisPE,
		       GR_BLOCK, tso, tso->block_info.closure, 0, 0);
    }
    break;

  case BlockedOnMVar:
  case BlockedOnBlackHole:
    /* the closure must be a BLACKHOLE_BQ or an RBH; tso came in here via
       BLACKHOLE(_BQ) or CAF_BLACKHOLE or RBH entry code */
    ASSERT(get_itbl(tso->block_info.closure)->type==MVAR ||
	   get_itbl(tso->block_info.closure)->type==BLACKHOLE_BQ ||
	   get_itbl(tso->block_info.closure)->type==RBH);

    /* if collecting stats update the execution time etc */
    if (RtsFlags.ParFlags.ParStats.Full) {
      /* Note that CURRENT_TIME may perform an unsafe call */
      tso->par.exectime += CURRENT_TIME - tso->par.blockedat;
      tso->par.blockcount++;
      tso->par.blockedat = CURRENT_TIME;
      DumpRawGranEvent(CURRENT_PROC, thisPE,
		       GR_BLOCK, tso, tso->block_info.closure, 0, 0);
    }
    break;

  case BlockedOnDelay:
    /* What sort of stats shall we collect for an explicit threadDelay? */
    IF_PAR_DEBUG(verbose,
		 belch("##++ blockThread: TSO %d blocked on ThreadDelay",
		       tso->id));
    break;

  /* Check that the following is impossible to happen, indeed
  case BlockedOnException:
  case BlockedOnRead:
  case BlockedOnWrite:
  */
  default:
    barf("blockThread: impossible why_blocked code %d for TSO %d",
	 tso->why_blocked, tso->id);
  }

  IF_PAR_DEBUG(verbose,
	       belch("##++ blockThread: TSO %d blocked on closure %p (%s); %s",
		     tso->id, tso->block_info.closure, info_type(tso->block_info.closure),
		     (tso->why_blocked==BlockedOnGA) ? "Sent FETCH for GA" : ""));

  IF_PAR_DEBUG(bq,
	       print_bq(tso->block_info.closure));
}
+
+/*
+ * ChoosePE selects a GlobalTaskId from the array of PEs 'at random'.
+ * Important properties:
+ * - it varies during execution, even if the PE is idle
+ * - it's different for each PE
+ * - we never send a fish to ourselves
+ */
+extern long lrand48 (void);
+
+//@cindex choosePE
+GlobalTaskId
+choosePE(void)
+{
+ long temp;
+
+ temp = lrand48() % nPEs;
+ if (allPEs[temp] == mytid) { /* Never send a FISH to yourself */
+ temp = (temp + 1) % nPEs;
+ }
+ return allPEs[temp];
+}
+
+/*
+ * allocate a BLOCKED_FETCH closure and fill it with the relevant fields
+ * of the ga argument; called from processFetch when the local closure is
+ * under evaluation
+ */
+//@cindex createBlockedFetch
+StgClosure *
+createBlockedFetch (globalAddr ga, globalAddr rga)
+{
+ StgBlockedFetch *bf;
+ StgClosure *closure;
+
+ closure = GALAlookup(&ga);
+ if ((bf = (StgBlockedFetch *)allocate(_HS + sizeofW(StgBlockedFetch))) == NULL) {
+ barf("createBlockedFetch: out of heap while allocating heap for a BlocekdFetch; ToDo: call GC here");
+ GarbageCollect(GetRoots, rtsFalse);
+ closure = GALAlookup(&ga);
+ bf = (StgBlockedFetch *)allocate(_HS + sizeofW(StgBlockedFetch));
+ // ToDo: check whether really guaranteed to succeed 2nd time around
+ }
+
+ ASSERT(bf != (StgBlockedFetch *)NULL);
+ SET_INFO((StgClosure *)bf, &stg_BLOCKED_FETCH_info);
+ // ToDo: check whether other header info is needed
+ bf->node = closure;
+ bf->ga.payload.gc.gtid = rga.payload.gc.gtid;
+ bf->ga.payload.gc.slot = rga.payload.gc.slot;
+ bf->ga.weight = rga.weight;
+ // bf->link = NULL; debugging
+
+ IF_PAR_DEBUG(schedule,
+ fprintf(stderr, "%%%%// created BF: bf=%p (%s) of closure , GA: ",
+ bf, info_type((StgClosure*)bf));
+ printGA(&(bf->ga));
+ fputc('\n',stderr));
+ return (StgClosure *)bf;
+}
+
+/*
+ * waitForTermination enters a loop ignoring spurious messages while
+ * waiting for the termination sequence to be completed.
+ */
//@cindex waitForTermination
/*
 * Spin forever, swallowing every incoming packet, until the FINISH
 * protocol terminates this task from inside processUnexpectedMessage.
 */
void
waitForTermination(void)
{
  for (;;) {
    processUnexpectedMessage(GetPacket());
  }
}
+
+#ifdef DEBUG
+//@cindex DebugPrintGAGAMap
+void
+DebugPrintGAGAMap(globalAddr *gagamap, int nGAs)
+{
+ nat i;
+
+ for (i = 0; i < nGAs; ++i, gagamap += 2)
+ fprintf(stderr, "__ gagamap[%d] = ((%x, %d, %x)) -> ((%x, %d, %x))\n", i,
+ gagamap[0].payload.gc.gtid, gagamap[0].payload.gc.slot, gagamap[0].weight,
+ gagamap[1].payload.gc.gtid, gagamap[1].payload.gc.slot, gagamap[1].weight);
+}
+
+//@cindex checkGAGAMap
+void
+checkGAGAMap(globalAddr *gagamap, int nGAs)
+{
+ nat i;
+
+ for (i = 0; i < (nat)nGAs; ++i, gagamap += 2) {
+ ASSERT(looks_like_ga(gagamap));
+ ASSERT(looks_like_ga(gagamap+1));
+ }
+}
+#endif
+
//@cindex freeMsgBuffer
/* per-PE buffers batching outgoing FREE messages (weight/slot word pairs) */
static StgWord **freeMsgBuffer = NULL;
//@cindex freeMsgIndex
/* next free word index within each PE's freeMsgBuffer */
static nat *freeMsgIndex = NULL;
+
+//@cindex prepareFreeMsgBuffers
+void
+prepareFreeMsgBuffers(void)
+{
+ nat i;
+
+ /* Allocate the freeMsg buffers just once and then hang onto them. */
+ if (freeMsgIndex == NULL) {
+ freeMsgIndex = (nat *) stgMallocBytes(nPEs * sizeof(nat),
+ "prepareFreeMsgBuffers (Index)");
+ freeMsgBuffer = (StgWord **) stgMallocBytes(nPEs * sizeof(long *),
+ "prepareFreeMsgBuffers (Buffer)");
+
+ for(i = 0; i < nPEs; i++)
+ if (i != (thisPE-1))
+ freeMsgBuffer[i] = (StgPtr) stgMallocWords(RtsFlags.ParFlags.packBufferSize,
+ "prepareFreeMsgBuffers (Buffer #i)");
+ else
+ freeMsgBuffer[i] = 0;
+ }
+
+ /* Initialize the freeMsg buffer pointers to point to the start of their
+ buffers */
+ for (i = 0; i < nPEs; i++)
+ freeMsgIndex[i] = 0;
+}
+
+//@cindex freeRemoteGA
+void
+freeRemoteGA(int pe, globalAddr *ga)
+{
+ nat i;
+
+ ASSERT(GALAlookup(ga) == NULL);
+
+ if ((i = freeMsgIndex[pe]) + 2 >= RtsFlags.ParFlags.packBufferSize) {
+ IF_PAR_DEBUG(free,
+ belch("!! Filled a free message buffer (sending remaining messages indivisually)"));
+
+ sendFree(ga->payload.gc.gtid, i, freeMsgBuffer[pe]);
+ i = 0;
+ }
+ freeMsgBuffer[pe][i++] = (StgWord) ga->weight;
+ freeMsgBuffer[pe][i++] = (StgWord) ga->payload.gc.slot;
+ freeMsgIndex[pe] = i;
+
+ IF_DEBUG(sanity,
+ ga->weight = 0xdead0add;
+ ga->payload.gc.gtid = 0xbbbbbbbb;
+ ga->payload.gc.slot = 0xbbbbbbbb;);
+}
+
+//@cindex sendFreeMessages
+void
+sendFreeMessages(void)
+{
+ nat i;
+
+ for (i = 0; i < nPEs; i++)
+ if (freeMsgIndex[i] > 0)
+ sendFree(allPEs[i], freeMsgIndex[i], freeMsgBuffer[i]);
+}
+
+/* synchronises with the other PEs. Receives and records in a global
+ * variable the task-id of SysMan. If this is the main thread (discovered
+ * in main.lc), identifies itself to SysMan. Finally it receives
+ * from SysMan an array of the Global Task Ids of each PE, which is
+ * returned as the value of the function.
+ */
+
+#if defined(PAR_TICKY)
+/* Has to see freeMsgIndex, so must be defined here not in ParTicky.c */
//@cindex stats_CntFreeGA
void
stats_CntFreeGA (void) {  // stats only

  // Global statistics: residency of thread and spark pool
  if (RtsFlags.ParFlags.ParStats.Global &&
      RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
    nat i, s;

    globalParStats.cnt_free_GA++;
    /* NOTE(review): this folds the *running cumulative* tot_free_GA into s
       on every loop iteration (s += tot += ...), so s is not simply the
       number of currently-buffered GAs -- confirm this is the intended
       residency measure. */
    for (i = 0, s = 0; i < nPEs; i++)
      s += globalParStats.tot_free_GA += freeMsgIndex[i]/2;

    if ( s > globalParStats.res_free_GA )
      globalParStats.res_free_GA = s;
  }
}
+#endif /* PAR_TICKY */
+
+#endif /* PAR -- whole file */
+
+//@node Index, , Miscellaneous Functions, High Level Communications Routines
+//@subsection Index
+
+//@index
+//* ACK:: @cindex\s-+ACK
+//* DebugPrintGAGAMap:: @cindex\s-+DebugPrintGAGAMap
+//* FETCH:: @cindex\s-+FETCH
+//* FISH:: @cindex\s-+FISH
+//* FREE:: @cindex\s-+FREE
+//* RESUME:: @cindex\s-+RESUME
+//* SCHEDULE:: @cindex\s-+SCHEDULE
+//* blockFetch:: @cindex\s-+blockFetch
+//* choosePE:: @cindex\s-+choosePE
+//* freeMsgBuffer:: @cindex\s-+freeMsgBuffer
+//* freeMsgIndex:: @cindex\s-+freeMsgIndex
+//* freeRemoteGA:: @cindex\s-+freeRemoteGA
+//* gumPackBuffer:: @cindex\s-+gumPackBuffer
+//* initMoreBuffers:: @cindex\s-+initMoreBuffers
+//* prepareFreeMsgBuffers:: @cindex\s-+prepareFreeMsgBuffers
+//* processAck:: @cindex\s-+processAck
+//* processFetch:: @cindex\s-+processFetch
+//* processFetches:: @cindex\s-+processFetches
+//* processFish:: @cindex\s-+processFish
+//* processFree:: @cindex\s-+processFree
+//* processMessages:: @cindex\s-+processMessages
+//* processResume:: @cindex\s-+processResume
+//* processSchedule:: @cindex\s-+processSchedule
+//* sendAck:: @cindex\s-+sendAck
+//* sendFetch:: @cindex\s-+sendFetch
+//* sendFish:: @cindex\s-+sendFish
+//* sendFree:: @cindex\s-+sendFree
+//* sendFreeMessages:: @cindex\s-+sendFreeMessages
+//* sendResume:: @cindex\s-+sendResume
+//* sendSchedule:: @cindex\s-+sendSchedule
+//* unpackAck:: @cindex\s-+unpackAck
+//* unpackFetch:: @cindex\s-+unpackFetch
+//* unpackFish:: @cindex\s-+unpackFish
+//* unpackFree:: @cindex\s-+unpackFree
+//* unpackResume:: @cindex\s-+unpackResume
+//* unpackSchedule:: @cindex\s-+unpackSchedule
+//* waitForTermination:: @cindex\s-+waitForTermination
+//@end index
diff --git a/rts/parallel/LLC.h b/rts/parallel/LLC.h
new file mode 100644
index 0000000000..536e431bef
--- /dev/null
+++ b/rts/parallel/LLC.h
@@ -0,0 +1,130 @@
+/* --------------------------------------------------------------------------
+ Time-stamp: <Sun Mar 18 2001 21:23:50 Stardate: [-30]6349.45 hwloidl>
+
+ Low Level Communications Header (LLC.h)
+
+ Contains the definitions used by the Low-level Communications
+ module of the GUM Haskell runtime environment.
+ Based on the Graph for PVM implementation.
+
+ Phil Trinder, Glasgow University, 13th Dec 1994
+ Adapted for the 4.xx RTS
+ H-W. Loidl, Heriot-Watt, November 1999
+ ----------------------------------------------------------------------- */
+
+#ifndef __LLC_H
+#define __LLC_H
+
+#ifdef PAR
+
+//@node Low Level Communications Header, , ,
+//@section Low Level Communications Header
+
+//@menu
+//* Includes::
+//* Macros and Constants::
+//* PVM macros::
+//* Externs::
+//@end menu
+
+//@node Includes, Macros and Constants, Low Level Communications Header, Low Level Communications Header
+//@subsection Includes
+
+#include "Rts.h"
+#include "Parallel.h"
+
+#include "PEOpCodes.h"
+#include "pvm3.h"
+
+//@node Macros and Constants, PVM macros, Includes, Low Level Communications Header
+//@subsection Macros and Constants
+
+#define ANY_TASK (-1) /* receive messages from any task */
+#define ANY_GLOBAL_TASK ANY_TASK
+#define ANY_OPCODE (-1) /* receive any opcode */
+#define ALL_GROUP (-1) /* wait for barrier from every group member */
+
+#define PEGROUP "PE"
+
+#define MGRGROUP "MGR"
+#define SYSGROUP "SYS"
+
+
+#define PETASK "PE"
+
+//@node PVM macros, Externs, Macros and Constants, Low Level Communications Header
+//@subsection PVM macros
+
+#define sync(gp,op) do { \
+ broadcast(gp,op); \
+ pvm_barrier(gp,ALL_GROUP); \
+ } while(0)
+
+#define broadcast(gp,op) do { \
+ pvm_initsend(PvmDataDefault); \
+ pvm_bcast(gp,op); \
+ } while(0)
+
+#define checkComms(c,s) do { \
+ if ((c)<0) { \
+ pvm_perror(s); \
+ stg_exit(EXIT_FAILURE); \
+ }} while(0)
+
+#define _my_gtid pvm_mytid()
+#define GetPacket() pvm_recv(ANY_TASK,ANY_OPCODE)
+#define PacketsWaiting() (pvm_probe(ANY_TASK,ANY_OPCODE) != 0)
+
+#define SPARK_THREAD_DESCRIPTOR 1
+#define GLOBAL_THREAD_DESCRIPTOR 2
+
+#define _extract_jump_field(v) (v)
+
+#define MAX_DATA_WORDS_IN_PACKET 1024
+
+/* basic PVM packing */
+#define PutArg1(a) pvm_pklong((long *)&(a),1,1)
+#define PutArg2(a) pvm_pklong((long *)&(a),1,1)
+#define PutArgN(n,a) pvm_pklong((long *)&(a),1,1)
+#define PutArgs(b,n) pvm_pklong((long *)b,n,1)
+
+#define PutLit(l) { int a = l; PutArgN(?,a); }
+
+/* basic PVM unpacking */
+#define GetArg1(a) pvm_upklong((long *)&(a),1,1)
+#define GetArg2(a) pvm_upklong((long *)&(a),1,1)
+#define GetArgN(n,a) pvm_upklong((long *)&(a),1,1)
+#define GetArgs(b,n) pvm_upklong((long *)b,n,1)
+
+//@node Externs, , PVM macros, Low Level Communications Header
+//@subsection Externs
+
+/* basic message passing routines */
+extern void sendOp (OpCode,GlobalTaskId),
+ sendOp1 (OpCode,GlobalTaskId,StgWord),
+ sendOp2 (OpCode,GlobalTaskId,StgWord,StgWord),
+ sendOpV (OpCode,GlobalTaskId,int,...),
+ sendOpN (OpCode,GlobalTaskId,int,StgPtr),
+ sendOpNV (OpCode,GlobalTaskId,int,StgPtr,int,...);
+
+extern void broadcastOpN(OpCode op, char *group, int n, StgPtr args);
+
+/* extracting data out of a packet */
+OpCode getOpcode (rtsPacket p);
+void getOpcodeAndSender (rtsPacket p, OpCode *popcode,
+ GlobalTaskId *psender_id);
+GlobalTaskId senderTask (rtsPacket p);
+rtsPacket waitForPEOp(OpCode op, GlobalTaskId who, void(*processUnexpected)(rtsPacket) );
+
+/* Init and shutdown routines */
+void startUpPE (void);
+void shutDownPE(void);
+int getExitCode(int nbytes, GlobalTaskId *sender_idp);
+
+/* aux functions */
+char *getOpName (unsigned op); // returns string of opcode
+void processUnexpectedMessage (rtsPacket);
+//void NullException(void);
+
+#endif /*PAR */
+#endif /*defined __LLC_H */
diff --git a/rts/parallel/LLComms.c b/rts/parallel/LLComms.c
new file mode 100644
index 0000000000..baa6dddf0c
--- /dev/null
+++ b/rts/parallel/LLComms.c
@@ -0,0 +1,489 @@
+/* ----------------------------------------------------------------------------
+ * Time-stamp: <Mon Mar 19 2001 22:10:38 Stardate: [-30]6354.62 hwloidl>
+ *
+ * GUM Low-Level Inter-Task Communication
+ *
+ * This module defines PVM Routines for PE-PE communication.
+ *
+ * P. Trinder, December 5th. 1994.
+ * P. Trinder, July 1998
+ * H-W. Loidl, November 1999 -
+ --------------------------------------------------------------------------- */
+
+#ifdef PAR /* whole file */
+
+//@node GUM Low-Level Inter-Task Communication, , ,
+//@section GUM Low-Level Inter-Task Communication
+
+/*
+ *This module defines the routines which communicate between PEs. The
+ *code is based on Kevin Hammond's GRIP RTS. (OpCodes.h defines
+ *PEOp1 etc. in terms of sendOp1 etc.).
+ *
+ *Routine & Arguments
+ * &
+ *sendOp & 0 \\
+ *sendOp1 & 1 \\
+ *sendOp2 & 2 \\
+ *sendOpN & vector \\
+ *sendOpV & variable \\
+ *sendOpNV & variable+ vector \\
+ *
+ *First the standard include files.
+ */
+
+//@menu
+//* Macros etc::
+//* Includes::
+//* Auxiliary functions::
+//* Index::
+//@end menu
+
+//@node Macros etc, Includes, GUM Low-Level Inter-Task Communication, GUM Low-Level Inter-Task Communication
+//@subsection Macros etc
+
+/* Evidently not Posix */
+/* #include "PosixSource.h" */
+
+#define UNUSED /* nothing */
+
+//@node Includes, Auxiliary functions, Macros etc, GUM Low-Level Inter-Task Communication
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Parallel.h"
+#include "ParallelRts.h"
+#if defined(DEBUG)
+# include "ParallelDebug.h"
+#endif
+#include "LLC.h"
+
+#ifdef __STDC__
+#include <stdarg.h>
+#else
+#include <varargs.h>
+#endif
+
+/* Cannot use std macro when compiling for SysMan */
+/* debugging enabled */
+// #define IF_PAR_DEBUG(c,s) { s; }
+/* debugging disabled */
+#define IF_PAR_DEBUG(c,s) /* nothing */
+
+//@node Auxiliary functions, Index, Includes, GUM Low-Level Inter-Task Communication
+//@subsection Auxiliary functions
+
/*
 * heapChkCounter tracks the number of heap checks since the last probe.
 * Not currently used! We check for messages when a thread is rescheduled.
 */
int heapChkCounter = 0;

/*
 * Then some miscellaneous functions.
 * getOpName returns the character-string name of any OpCode.
 */

/* printable opcode names, indexed by (op - MIN_PEOPS) in getOpName */
char *UserPEOpNames[] = { PEOP_NAMES };
+
+//@cindex getOpName
+char *
+getOpName(nat op)
+{
+ if (op >= MIN_PEOPS && op <= MAX_PEOPS)
+ return (UserPEOpNames[op - MIN_PEOPS]);
+ else
+ return ("Unknown PE OpCode");
+}
+
+/*
+ * traceSendOp handles the tracing of messages.
+ */
+
+//@cindex traceSendOp
+static void
+traceSendOp(OpCode op, GlobalTaskId dest UNUSED,
+ unsigned int data1 UNUSED, unsigned int data2 UNUSED)
+{
+ char *OpName;
+
+ OpName = getOpName(op);
+ IF_PAR_DEBUG(trace,
+ fprintf(stderr," %s [%x,%x] sent from %x to %x",
+ OpName, data1, data2, mytid, dest));
+}
+
+/*
+ * sendOp sends a 0-argument message with OpCode {\em op} to
+ * the global task {\em task}.
+ */
+
+//@cindex sendOp
+void
+sendOp(OpCode op, GlobalTaskId task)
+{
+ traceSendOp(op, task,0,0);
+
+ pvm_initsend(PvmDataRaw);
+ pvm_send(task, op);
+}
+
+/*
+ * sendOp1 sends a 1-argument message with OpCode {\em op}
+ * to the global task {\em task}.
+ */
+
+//@cindex sendOp1
+void
+sendOp1(OpCode op, GlobalTaskId task, StgWord arg1)
+{
+ traceSendOp(op, task, arg1,0);
+
+ pvm_initsend(PvmDataRaw);
+ PutArg1(arg1);
+ pvm_send(task, op);
+}
+
+
+/*
+ * sendOp2 is used by the FP code only.
+ */
+
+//@cindex sendOp2
+void
+sendOp2(OpCode op, GlobalTaskId task, StgWord arg1, StgWord arg2)
+{
+ traceSendOp(op, task, arg1, arg2);
+
+ pvm_initsend(PvmDataRaw);
+ PutArg1(arg1);
+ PutArg2(arg2);
+ pvm_send(task, op);
+}
+
+/*
+ *
+ * sendOpV takes a variable number of arguments, as specified by {\em n}.
+ * For example,
+ *
+ * sendOpV( PP_STATS, StatsTask, 3, start_time, stop_time, sparkcount);
+ */
+
+//@cindex sendOpV
+void
+sendOpV(OpCode op, GlobalTaskId task, int n, ...)
+{
+ va_list ap;
+ int i;
+ StgWord arg;
+
+ va_start(ap, n);
+
+ traceSendOp(op, task, 0, 0);
+
+ pvm_initsend(PvmDataRaw);
+
+ for (i = 0; i < n; ++i) {
+ arg = va_arg(ap, StgWord);
+ PutArgN(i, arg);
+ }
+ va_end(ap);
+
+ pvm_send(task, op);
+}
+
+/*
+ *
+ * sendOpNV takes a variable-size datablock, as specified by {\em
+ * nelem} and a variable number of arguments, as specified by {\em
+ * narg}. N.B. The datablock and the additional arguments are contiguous
+ * and are copied over together. For example,
+ *
+ * sendOpNV(PP_RESUME, tsoga.pe, 6, nelem, data,
+ * (W_) ga.weight, (W_) ga.loc.gc.gtid, (W_) ga.loc.gc.slot,
+ * (W_) tsoga.weight, (W_) tsoga.loc.gc.gtid, (W_) tsoga.loc.gc.slot);
+ *
+ * Important: The variable arguments must all be StgWords.
+
+ sendOpNV(_, tid, m, n, data, x1, ..., xm):
+
+ | n elems
+ +------------------------------
+ | x1 | ... | xm | n | data ....
+ +------------------------------
+ */
+
+//@cindex sendOpNV
+void
+sendOpNV(OpCode op, GlobalTaskId task, int nelem,
+ StgWord *datablock, int narg, ...)
+{
+ va_list ap;
+ int i;
+ StgWord arg;
+
+ va_start(ap, narg);
+
+ traceSendOp(op, task, 0, 0);
+ IF_PAR_DEBUG(trace,
+ fprintf(stderr,"~~ sendOpNV: op = %x (%s), task = %x, narg = %d, nelem = %d",
+ op, getOpName(op), task, narg, nelem));
+
+ pvm_initsend(PvmDataRaw);
+
+ for (i = 0; i < narg; ++i) {
+ arg = va_arg(ap, StgWord);
+ IF_PAR_DEBUG(trace,
+ fprintf(stderr,"~~ sendOpNV: arg = %d\n",arg));
+ PutArgN(i, arg);
+ }
+ arg = (StgWord) nelem;
+ PutArgN(narg, arg);
+
+/* for (i=0; i < nelem; ++i) fprintf(stderr, "%d ",datablock[i]); */
+/* fprintf(stderr," in sendOpNV\n");*/
+
+ PutArgs(datablock, nelem);
+ va_end(ap);
+
+ pvm_send(task, op);
+}
+
+/*
+ * sendOpN take a variable size array argument, whose size is given by
+ * {\em n}. For example,
+ *
+ * sendOpN( PP_STATS, StatsTask, 3, stats_array);
+ */
+
+//@cindex sendOpN
+void
+sendOpN(OpCode op, GlobalTaskId task, int n, StgPtr args)
+{
+ long arg;
+
+ traceSendOp(op, task, 0, 0);
+
+ pvm_initsend(PvmDataRaw);
+ arg = (long) n;
+ PutArgN(0, arg);
+ PutArgs(args, n);
+ pvm_send(task, op);
+}
+
+/*
+ * broadcastOpN is as sendOpN but broadcasts to all members of a group.
+ */
+
+void
+broadcastOpN(OpCode op, char *group, int n, StgPtr args)
+{
+ long arg;
+
+ //traceSendOp(op, task, 0, 0);
+
+ pvm_initsend(PvmDataRaw);
+ arg = (long) n;
+ PutArgN(0, arg);
+ PutArgs(args, n);
+ pvm_bcast(group, op);
+}
+
+/*
+ waitForPEOp waits for a packet from global task who with the
+ OpCode op. If ignore is true all other messages are simply ignored;
+ otherwise they are handled by processUnexpected.
+ */
//@cindex waitForPEOp
rtsPacket
waitForPEOp(OpCode op, GlobalTaskId who, void(*processUnexpected)(rtsPacket) )
{
  rtsPacket p;
  int nbytes;
  OpCode opCode;
  GlobalTaskId sender_id;
  rtsBool match;

  IF_PAR_DEBUG(verbose,
	       fprintf(stderr,"~~ waitForPEOp: expecting op = %x (%s), who = [%x]\n",
		       op, getOpName(op), who));

  do {
    /* blocking receive; on PVM error just report and retry */
    while((p = pvm_recv(ANY_TASK,ANY_OPCODE)) < 0)
      pvm_perror("waitForPEOp: Waiting for PEOp");

    pvm_bufinfo( p, &nbytes, &opCode, &sender_id );
    /* ANY_OPCODE / ANY_TASK act as wildcards in the match */
    match = (op == ANY_OPCODE || op == opCode) &&
            (who == ANY_TASK || who == sender_id);

    if (match) {
      IF_PAR_DEBUG(verbose,
		   fprintf(stderr,
			   "~~waitForPEOp: Qapla! received: OpCode = %#x (%s), sender_id = [%x]",
			   opCode, getOpName(opCode), sender_id));

      return(p);
    }

    /* Handle the unexpected OpCodes */
    if (processUnexpected!=NULL) {
      (*processUnexpected)(p);
    } else {
      IF_PAR_DEBUG(verbose,
		   fprintf(stderr,
			   "~~ waitForPEOp: ignoring OpCode = %#x (%s), sender_id = [%x]",
			   opCode, getOpName(opCode), sender_id));
    }

  } while(rtsTrue);
}
+
+/*
+ processUnexpected processes unexpected messages. If the message is a
+  FINISH it exits the program and PVM gracefully
+ */
+//@cindex processUnexpectedMessage
+void
+processUnexpectedMessage(rtsPacket packet) {
+ OpCode opCode = getOpcode(packet);
+
+ IF_PAR_DEBUG(verbose,
+ GlobalTaskId sender = senderTask(packet);
+ fprintf(stderr,"~~ [%x] processUnexpected: Received %x (%s), sender %x\n",
+ mytid, opCode, getOpName(opCode), sender));
+
+ switch (opCode) {
+ case PP_FINISH:
+ stg_exit(EXIT_SUCCESS);
+ break;
+
+ /* Anything we're not prepared to deal with. Note that ALL OpCodes
+ are discarded during termination -- this helps prevent bizarre
+ race conditions. */
+ default:
+ // if (!GlobalStopPending)
+ {
+ GlobalTaskId errorTask;
+ OpCode opCode;
+
+ getOpcodeAndSender(packet, &opCode, &errorTask);
+ fprintf(stderr,"== Task %x: Unexpected OpCode %x from %x in processUnexpected",
+ mytid, opCode, errorTask );
+
+ stg_exit(EXIT_FAILURE);
+ }
+ }
+}
+
+//@cindex getOpcode
+OpCode
+getOpcode(rtsPacket p)
+{
+ int nbytes;
+ OpCode OpCode;
+ GlobalTaskId sender_id;
+ /* read PVM buffer */
+ pvm_bufinfo(p, &nbytes, &OpCode, &sender_id);
+ /* return tag of the buffer as opcode */
+ return(OpCode);
+}
+
+//@cindex getOpcodeAndSender
+void
+getOpcodeAndSender(rtsPacket p, OpCode *opCodep, GlobalTaskId *senderIdp)
+{
+ int nbytes;
+ /* read PVM buffer */
+ pvm_bufinfo(p, &nbytes, opCodep, senderIdp);
+}
+
+//@cindex senderTask
+GlobalTaskId
+senderTask(rtsPacket p)
+{
+ int nbytes;
+ OpCode opCode;
+ GlobalTaskId sender_id;
+ /* read PVM buffer */
+ pvm_bufinfo(p, &nbytes, &opCode, &sender_id);
+ return(sender_id);
+}
+
+/*
+ * startUpPE does the low-level comms specific startup stuff for a
+ * PE. It initialises the comms system, joins the appropriate groups
+ * allocates the PE buffer
+ */
+
+//@cindex startUpPE
+void
+startUpPE(void)
+{
+ mytid = _my_gtid; /* Initialise PVM and get task id into global var.*/
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"== [%x] PEStartup: Task id = [%x], No. PEs = %d \n",
+ mytid, mytid, nPEs));
+ checkComms(pvm_joingroup(PEGROUP), "PEStartup");
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"== [%x] PEStartup: Joined PEGROUP\n", mytid));
+}
+
+/*
+ * PEShutdown does the low-level comms-specific shutdown stuff for a
+ * single PE. It leaves the groups and then exits from pvm.
+ */
+//@cindex shutDownPE
+void
+shutDownPE(void)
+{
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "== [%x] PEshutdown\n", mytid));
+
+ checkComms(pvm_lvgroup(PEGROUP),"PEShutDown");
+ checkComms(pvm_exit(),"PEShutDown");
+}
+
+/*
+ Extract the exit code out of a PP_FINISH packet (used in SysMan)
+*/
+int
+getExitCode(int nbytes, GlobalTaskId *sender_idp) {
+ int exitCode=0;
+
+ if (nbytes==4) { // Notification from a task doing pvm_exit
+ GetArgs(sender_idp,1); // Presumably this must be MainPE Id
+ exitCode = -1;
+ } else if (nbytes==8) { // Doing a controlled shutdown
+ GetArgs(&exitCode,1); // HACK: controlled shutdown == 2 values
+ GetArgs(&exitCode,1);
+ } else {
+ exitCode = -2; // everything else
+ }
+ return exitCode;
+}
+
+#endif /* PAR -- whole file */
+
+//@node Index, , Auxiliary functions, GUM Low-Level Inter-Task Communication
+//@subsection Index
+
+//@index
+//* getOpName:: @cindex\s-+getOpName
+//* traceSendOp:: @cindex\s-+traceSendOp
+//* sendOp:: @cindex\s-+sendOp
+//* sendOp1:: @cindex\s-+sendOp1
+//* sendOp2:: @cindex\s-+sendOp2
+//* sendOpV:: @cindex\s-+sendOpV
+//* sendOpNV:: @cindex\s-+sendOpNV
+//* sendOpN:: @cindex\s-+sendOpN
+//* waitForPEOp:: @cindex\s-+waitForPEOp
+//* processUnexpectedMessage:: @cindex\s-+processUnexpectedMessage
+//* getOpcode:: @cindex\s-+getOpcode
+//* getOpcodeAndSender:: @cindex\s-+getOpcodeAndSender
+//* senderTask:: @cindex\s-+senderTask
+//* startUpPE:: @cindex\s-+startUpPE
+//* shutDownPE:: @cindex\s-+shutDownPE
+//@end index
diff --git a/rts/parallel/PEOpCodes.h b/rts/parallel/PEOpCodes.h
new file mode 100644
index 0000000000..2d18b439f2
--- /dev/null
+++ b/rts/parallel/PEOpCodes.h
@@ -0,0 +1,58 @@
+#ifndef PEOPCODES_H
+#define PEOPCODES_H
+
+/************************************************************************
+* PEOpCodes.h *
+* *
+* This file contains definitions for all the GUM PE Opcodes *
+* It's based on the GRAPH for PVM version *
+* Phil Trinder, Glasgow University 8th December 1994 *
+* *
+ RFPointon, December 1999
+ - removed PP_SYSMAN_TID, introduced PP_READY
+ - removed PP_MAIN_TASK, introduced PP_NEWPE
+ - added PP_REVAL
+************************************************************************/
+
+#define REPLY_OK 0x00
+
+/*Startup + Shutdown*/
+#define PP_READY 0x50 /* sent PEs -> SysMan */
+#define PP_NEWPE 0x51 /* sent via newHost notify -> SysMan */
+#define PP_FINISH 0x52 /* sent PEs & via taskExit notify -> SysMan */
+#define PP_PETIDS 0x53 /* sent sysman -> PEs */
+
+/* Stats stuff */
+#define PP_STATS 0x54
+#define PP_STATS_ON 0x55
+#define PP_STATS_OFF 0x56
+
+//#define PP_FAIL 0x57
+
+/*Garbage Collection*/
+#define PP_GC_INIT 0x58
+#define PP_FULL_SYSTEM 0x59
+#define PP_GC_POLL 0x5a
+
+/*GUM Messages*/
+#define PP_FETCH 0x5b
+#define PP_RESUME 0x5c
+#define PP_ACK 0x5d
+#define PP_FISH 0x5e
+#define PP_SCHEDULE 0x5f
+#define PP_FREE 0x60
+#define PP_REVAL 0x61
+
+
+#define MIN_PEOPS 0x50
+#define MAX_PEOPS 0x61
+
+#define PEOP_NAMES "Ready", "NewPE", \
+ "Finish", "PETIDS", \
+ "Stats", "Stats_On", "Stats_Off", \
+ "Fail", \
+ "GCInit", "FullSystem", "GCPoll", \
+ "Fetch","Resume","ACK","Fish","Schedule", \
+ "Free","REval"
+
+#endif /* PEOPCODES_H */
diff --git a/rts/parallel/Pack.c b/rts/parallel/Pack.c
new file mode 100644
index 0000000000..e8653f6303
--- /dev/null
+++ b/rts/parallel/Pack.c
@@ -0,0 +1,4293 @@
+/*
+ Time-stamp: <Wed Mar 21 2001 16:32:47 Stardate: [-30]6363.44 hwloidl>
+
+ Graph packing and unpacking code for sending it to another processor
+ and retrieving the original graph structure from the packet.
+ In the old RTS the code was split into Pack.c and Unpack.c (now deceased)
+ Used in GUM and GrAnSim.
+
+ The GrAnSim version of the code defines routines for *simulating* the
+ packing of closures in the same way it is done in the parallel runtime
+ system. Basically GrAnSim only puts the addresses of the closures to be
+ transferred into a buffer. This buffer will then be associated with the
+ event of transferring the graph. When this event is scheduled, the
+ @UnpackGraph@ routine is called and the buffer can be discarded
+ afterwards.
+
+ Note that in GranSim we need many buffers, not just one per PE.
+*/
+
+//@node Graph packing, , ,
+//@section Graph packing
+
+#if defined(PAR) || defined(GRAN) /* whole file */
+
+//@menu
+//* Includes::
+//* Prototypes::
+//* Global variables::
+//* ADT of Closure Queues::
+//* Initialisation for packing::
+//* Packing Functions::
+//* Low level packing routines::
+//* Unpacking routines::
+//* Aux fcts for packing::
+//* Printing Packet Contents::
+//* End of file::
+//@end menu
+//*/
+
+//@node Includes, Prototypes, Graph packing, Graph packing
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "ClosureTypes.h"
+#include "Storage.h"
+#include "Hash.h"
+#include "Parallel.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+# if defined(DEBUG)
+# include "Sanity.h"
+# include "Printer.h"
+# include "ParallelDebug.h"
+# endif
+#include "FetchMe.h"
+
+/* Which RTS flag should be used to get the size of the pack buffer ? */
+# if defined(PAR)
+# define RTS_PACK_BUFFER_SIZE RtsFlags.ParFlags.packBufferSize
+# else /* GRAN */
+# define RTS_PACK_BUFFER_SIZE RtsFlags.GranFlags.packBufferSize
+# endif
+
+//@node Prototypes, Global variables, Includes, Graph packing
+//@subsection Prototypes
+/*
+ Code declarations.
+*/
+
+//@node ADT of closure queues, Init for packing, Prototypes, Prototypes
+//@subsubsection ADT of closure queues
+
+static inline void InitClosureQueue(void);
+static inline rtsBool QueueEmpty(void);
+static inline void QueueClosure(StgClosure *closure);
+static inline StgClosure *DeQueueClosure(void);
+
+//@node Init for packing, Packing routines, ADT of closure queues, Prototypes
+//@subsubsection Init for packing
+
+static void InitPacking(rtsBool unpack);
+# if defined(PAR)
+rtsBool InitPackBuffer(void);
+# elif defined(GRAN)
+rtsPackBuffer *InstantiatePackBuffer (void);
+static void reallocPackBuffer (void);
+# endif
+
+//@node Packing routines, Low level packing fcts, Init for packing, Prototypes
+//@subsubsection Packing routines
+
+static void PackClosure (StgClosure *closure);
+
+//@node Low level packing fcts, Unpacking routines, Packing routines, Prototypes
+//@subsubsection Low level packing fcts
+
+# if defined(GRAN)
+static void Pack (StgClosure *data);
+# else
+static void Pack (StgWord data);
+
+static void PackGeneric(StgClosure *closure);
+static void PackArray(StgClosure *closure);
+static void PackPLC (StgPtr addr);
+static void PackOffset (int offset);
+static void PackPAP(StgPAP *pap);
+static rtsPackBuffer *PackTSO(StgTSO *tso, nat *packBufferSize);
+static rtsPackBuffer *PackStkO(StgPtr stko, nat *packBufferSize);
+static void PackFetchMe(StgClosure *closure);
+
+static void GlobaliseAndPackGA (StgClosure *closure);
+# endif
+
+//@node Unpacking routines, Aux fcts for packing, Low level packing fcts, Prototypes
+//@subsubsection Unpacking routines
+
+# if defined(PAR)
+void InitPendingGABuffer(nat size);
+void CommonUp(StgClosure *src, StgClosure *dst);
+static StgClosure *SetGAandCommonUp(globalAddr *gaP, StgClosure *closure,
+ rtsBool hasGA);
+static nat FillInClosure(StgWord ***bufptrP, StgClosure *graph);
+static void LocateNextParent(StgClosure **parentP,
+ nat *pptrP, nat *pptrsP, nat *sizeP);
+StgClosure *UnpackGraph(rtsPackBuffer *packBuffer,
+ globalAddr **gamap,
+ nat *nGAs);
+static StgClosure *UnpackClosure (StgWord ***bufptrP, StgClosure **graphP,
+ globalAddr *ga);
+static StgWord **UnpackGA(StgWord **bufptr, globalAddr *ga);
+static StgClosure *UnpackOffset(globalAddr *ga);
+static StgClosure *UnpackPLC(globalAddr *ga);
+static void UnpackArray(StgWord ***bufptrP, StgClosure *graph);
+static nat UnpackPAP(StgWord ***bufptrP, StgClosure *graph);
+
+# elif defined(GRAN)
+void CommonUp(StgClosure *src, StgClosure *dst);
+StgClosure *UnpackGraph(rtsPackBuffer* buffer);
+#endif
+
+//@node Aux fcts for packing, , Unpacking routines, Prototypes
+//@subsubsection Aux fcts for packing
+
+# if defined(PAR)
+static void DonePacking(void);
+static void AmPacking(StgClosure *closure);
+static int OffsetFor(StgClosure *closure);
+static rtsBool NotYetPacking(int offset);
+static inline rtsBool RoomToPack (nat size, nat ptrs);
+static inline rtsBool isOffset(globalAddr *ga);
+static inline rtsBool isFixed(globalAddr *ga);
+static inline rtsBool isConstr(globalAddr *ga);
+static inline rtsBool isUnglobalised(globalAddr *ga);
+# elif defined(GRAN)
+static void DonePacking(void);
+static rtsBool NotYetPacking(StgClosure *closure);
+# endif
+
+//@node Global variables, ADT of Closure Queues, Prototypes, Graph packing
+//@subsection Global variables
+/*
+ Static data declarations
+*/
+
+static nat pack_locn, /* ptr to first free loc in pack buffer */
+ clq_size, clq_pos,
+ buf_id = 1; /* identifier for buffer */
+static nat unpacked_size;
+static rtsBool roomInBuffer;
+#if defined(PAR)
+static GlobalTaskId dest_gtid=0; /* destination for message to send */
+#endif
+
+/*
+ The pack buffer
+ To be pedantic: in GrAnSim we're packing *addresses* of closures,
+ not the closures themselves.
+*/
+static rtsPackBuffer *globalPackBuffer = NULL, /* for packing a graph */
+ *globalUnpackBuffer = NULL; /* for unpacking a graph */
+
+
+/*
+ Bit of a hack for testing if a closure is the root of the graph. This is
+ set in @PackNearbyGraph@ and tested in @PackClosure@.
+*/
+
+static nat packed_thunks = 0;
+static StgClosure *graph_root;
+
+# if defined(PAR)
+/*
+ The offset hash table is used during packing to record the location in
+ the pack buffer of each closure which is packed.
+*/
+//@cindex offsetTable
+static HashTable *offsetTable;
+
+//@cindex PendingGABuffer
+static globalAddr *PendingGABuffer, *gaga;
+
+# endif /* PAR */
+
+
+//@node ADT of Closure Queues, Initialisation for packing, Global variables, Graph packing
+//@subsection ADT of Closure Queues
+
+//@menu
+//* Closure Queues::
+//* Init routines::
+//* Basic routines::
+//@end menu
+
+//@node Closure Queues, Init routines, ADT of Closure Queues, ADT of Closure Queues
+//@subsubsection Closure Queues
+/*
+ Closure Queues
+
+ These routines manage the closure queue.
+*/
+
+static nat clq_pos, clq_size;
+
+static StgClosure **ClosureQueue = NULL; /* HWL: init in main */
+
+#if defined(DEBUG)
+static char graphFingerPrint[MAX_FINGER_PRINT_LEN];
+#endif
+
+//@node Init routines, Basic routines, Closure Queues, ADT of Closure Queues
+//@subsubsection Init routines
+
+/* @InitClosureQueue@ allocates and initialises the closure queue. */
+
+//@cindex InitClosureQueue
+static inline void
+InitClosureQueue(void)
+{
+ clq_pos = clq_size = 0;
+
+ if (ClosureQueue==NULL)
+ ClosureQueue = (StgClosure**) stgMallocWords(RTS_PACK_BUFFER_SIZE,
+ "InitClosureQueue");
+}
+
+//@node Basic routines, Types of Global Addresses, Init routines, ADT of Closure Queues
+//@subsubsection Basic routines
+
+/*
+ QueueEmpty returns rtsTrue if the closure queue is empty; rtsFalse otherwise.
+*/
+
+//@cindex QueueEmpty
+static inline rtsBool
+QueueEmpty(void)
+{
+ return(clq_pos >= clq_size);
+}
+
+/* QueueClosure adds its argument to the closure queue. */
+
+//@cindex QueueClosure
+static inline void
+QueueClosure(closure)
+StgClosure *closure;
+{
+ if(clq_size < RTS_PACK_BUFFER_SIZE ) {
+ IF_PAR_DEBUG(paranoia,
+ belch(">__> <<%d>> Q: %p (%s); %d elems in q",
+ globalPackBuffer->id, closure, info_type(closure), clq_size-clq_pos));
+ ClosureQueue[clq_size++] = closure;
+ } else {
+ barf("Closure Queue Overflow (EnQueueing %p (%s))",
+ closure, info_type(closure));
+ }
+}
+
+/* DeQueueClosure returns the head of the closure queue. */
+
+//@cindex DeQueueClosure
+static inline StgClosure *
+DeQueueClosure(void)
+{
+ if(!QueueEmpty()) {
+ IF_PAR_DEBUG(paranoia,
+ belch(">__> <<%d>> DeQ: %p (%s); %d elems in q",
+ globalPackBuffer->id, ClosureQueue[clq_pos], info_type(ClosureQueue[clq_pos]),
+ clq_size-clq_pos));
+ return(ClosureQueue[clq_pos++]);
+ } else {
+ return((StgClosure*)NULL);
+ }
+}
+
+/* DeQueueClosure returns the head of the closure queue. */
+
+#if defined(DEBUG)
+//@cindex PrintQueueClosure
+static void
+PrintQueueClosure(void)
+{
+ nat i;
+
+ fputs("Closure queue:", stderr);
+ for (i=clq_pos; i < clq_size; i++)
+ fprintf(stderr, "%p (%s), ",
+ (StgClosure *)ClosureQueue[clq_pos++],
+ info_type(ClosureQueue[clq_pos++]));
+ fputc('\n', stderr);
+}
+#endif
+
+//@node Types of Global Addresses, , Basic routines, ADT of Closure Queues
+//@subsubsection Types of Global Addresses
+
+/*
+ Types of Global Addresses
+
+ These routines determine whether a GA is one of a number of special types
+ of GA.
+*/
+
+# if defined(PAR)
+//@cindex isOffset
+static inline rtsBool
+isOffset(globalAddr *ga)
+{
+ return (ga->weight == 1U && ga->payload.gc.gtid == (GlobalTaskId)0);
+}
+
+//@cindex isFixed
+static inline rtsBool
+isFixed(globalAddr *ga)
+{
+ return (ga->weight == 0U);
+}
+
+//@cindex isConstr
+static inline rtsBool
+isConstr(globalAddr *ga)
+{
+ return (ga->weight == 2U);
+}
+
+//@cindex isUnglobalised
+static inline rtsBool
+isUnglobalised(globalAddr *ga)
+{
+ return (ga->weight == 2U);
+}
+# endif
+
+//@node Initialisation for packing, Packing Functions, ADT of Closure Queues, Graph packing
+//@subsection Initialisation for packing
+/*
+ Simple Packing Routines
+
+ About packet sizes in GrAnSim: In GrAnSim we use a malloced block of
+ gransim_pack_buffer_size words to simulate a packet of pack_buffer_size
+ words. In the simulated PackBuffer we only keep the addresses of the
+ closures that would be packed in the parallel system (see Pack). To
+ decide if a packet overflow occurs pack_buffer_size must be compared
+ versus unpacked_size (see RoomToPack). Currently, there is no multi
+ packet strategy implemented, so in the case of an overflow we just stop
+ adding closures to the closure queue. If an overflow of the simulated
+ packet occurs, we just realloc some more space for it and carry on as
+ usual. -- HWL
+*/
+
+# if defined(GRAN)
+rtsPackBuffer *
+InstantiatePackBuffer (void) {
+ extern rtsPackBuffer *globalPackBuffer;
+
+ globalPackBuffer = (rtsPackBuffer *) stgMallocWords(sizeofW(rtsPackBuffer),
+ "InstantiatePackBuffer: failed to alloc packBuffer");
+ globalPackBuffer->size = RtsFlags.GranFlags.packBufferSize_internal;
+ globalPackBuffer->buffer = (StgWord **) stgMallocWords(RtsFlags.GranFlags.packBufferSize_internal,
+ "InstantiatePackBuffer: failed to alloc GranSim internal packBuffer");
+ /* NB: gransim_pack_buffer_size instead of pack_buffer_size -- HWL */
+ /* stgMallocWords is now simple allocate in Storage.c */
+
+ return (globalPackBuffer);
+}
+
+/*
+ Reallocate the GranSim internal pack buffer to make room for more closure
+ pointers. This is independent of the check for packet overflow as in GUM
+*/
+static void
+reallocPackBuffer (void) {
+
+ ASSERT(pack_locn >= (int)globalPackBuffer->size+sizeofW(rtsPackBuffer));
+
+ IF_GRAN_DEBUG(packBuffer,
+ belch("** Increasing size of PackBuffer %p to %d words (PE %u @ %d)\n",
+ globalPackBuffer, globalPackBuffer->size+REALLOC_SZ,
+ CurrentProc, CurrentTime[CurrentProc]));
+
+ globalPackBuffer = (rtsPackBuffer*)realloc(globalPackBuffer,
+ sizeof(StgClosure*)*(REALLOC_SZ +
+ (int)globalPackBuffer->size +
+ sizeofW(rtsPackBuffer))) ;
+ if (globalPackBuffer==(rtsPackBuffer*)NULL)
+ barf("Failing to realloc %d more words for PackBuffer %p (PE %u @ %d)\n",
+ REALLOC_SZ, globalPackBuffer, CurrentProc, CurrentTime[CurrentProc]);
+
+ globalPackBuffer->size += REALLOC_SZ;
+
+ ASSERT(pack_locn < globalPackBuffer->size+sizeofW(rtsPackBuffer));
+}
+# endif
+
+# if defined(PAR)
+/* @initPacking@ initialises the packing buffer etc. */
+//@cindex InitPackBuffer
+rtsBool
+InitPackBuffer(void)
+{
+ if (globalPackBuffer==(rtsPackBuffer*)NULL) {
+ if ((globalPackBuffer = (rtsPackBuffer *)
+ stgMallocWords(sizeofW(rtsPackBuffer)+RtsFlags.ParFlags.packBufferSize+DEBUG_HEADROOM,
+ "InitPackBuffer")) == NULL)
+ return rtsFalse;
+ }
+ return rtsTrue;
+}
+
+# endif
+//@cindex InitPacking
+static void
+InitPacking(rtsBool unpack)
+{
+# if defined(GRAN)
+ globalPackBuffer = InstantiatePackBuffer(); /* for GrAnSim only -- HWL */
+ /* NB: free in UnpackGraph */
+# elif defined(PAR)
+ if (unpack) {
+ /* allocate a GA-to-GA map (needed for ACK message) */
+ InitPendingGABuffer(RtsFlags.ParFlags.packBufferSize);
+ } else {
+ /* allocate memory to pack the graph into */
+ InitPackBuffer();
+ }
+# endif
+ /* init queue of closures seen during packing */
+ InitClosureQueue();
+
+ if (unpack)
+ return;
+
+ globalPackBuffer->id = buf_id++; /* buffer id are only used for debugging! */
+ pack_locn = 0; /* the index into the actual pack buffer */
+ unpacked_size = 0; /* the size of the whole graph when unpacked */
+ roomInBuffer = rtsTrue;
+ packed_thunks = 0; /* total number of thunks packed so far */
+# if defined(PAR)
+ offsetTable = allocHashTable();
+# endif
+}
+
+//@node Packing Functions, Low level packing routines, Initialisation for packing, Graph packing
+//@subsection Packing Functions
+
+//@menu
+//* Packing Sections of Nearby Graph::
+//* Packing Closures::
+//@end menu
+
+//@node Packing Sections of Nearby Graph, Packing Closures, Packing Functions, Packing Functions
+//@subsubsection Packing Sections of Nearby Graph
+/*
+ Packing Sections of Nearby Graph
+
+ @PackNearbyGraph@ packs a closure and associated graph into a static
+ buffer (@PackBuffer@). It returns the address of this buffer and the
+ size of the data packed into the buffer (in its second parameter,
+ @packBufferSize@). The associated graph is packed in a depth first
+ manner, hence it uses an explicit queue of closures to be packed rather
+ than simply using a recursive algorithm. Once the packet is full,
+ closures (other than primitive arrays) are packed as FetchMes, and their
+ children are not queued for packing. */
+
+//@cindex PackNearbyGraph
+
+/* NB: this code is shared between GranSim and GUM;
+ tso only used in GranSim */
+rtsPackBuffer *
+PackNearbyGraph(closure, tso, packBufferSize, dest)
+StgClosure* closure;
+StgTSO* tso;
+nat *packBufferSize;
+GlobalTaskId dest;
+{
+ IF_PAR_DEBUG(resume,
+ graphFingerPrint[0] = '\0');
+
+ ASSERT(RTS_PACK_BUFFER_SIZE > 0);
+ ASSERT(_HS==1); // HWL HACK; compile time constant
+
+#if defined(PAR_TICKY) // HWL HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACK
+ PAR_TICKY_PACK_NEARBY_GRAPH_START();
+#endif
+
+ /* ToDo: check that we have enough heap for the packet
+ ngoq ngo'
+ if (Hp + PACK_HEAP_REQUIRED > HpLim)
+ return NULL;
+ */
+ InitPacking(rtsFalse);
+# if defined(PAR)
+ dest_gtid=dest; //-1 to disable
+# elif defined(GRAN)
+ graph_root = closure;
+# endif
+
+ IF_GRAN_DEBUG(pack,
+ belch(">>> Packing <<%d>> (buffer @ %p); graph root @ %p [PE %d]\n demanded by TSO %d (%p) [PE %u]",
+ globalPackBuffer->id, globalPackBuffer, closure, where_is(closure),
+ tso->id, tso, where_is((StgClosure*)tso)));
+
+ IF_GRAN_DEBUG(pack,
+ belch("** PrintGraph of %p is:", closure);
+ PrintGraph(closure,0));
+
+ IF_PAR_DEBUG(resume,
+ GraphFingerPrint(closure, graphFingerPrint);
+ ASSERT(strlen(graphFingerPrint)<=MAX_FINGER_PRINT_LEN);
+ belch(">>> Packing <<%d>> (buffer @ %p); graph root @ %p [%x]\n demanded by TSO %d (%p); Finger-print is\n {%s}",
+ globalPackBuffer->id, globalPackBuffer, closure, mytid,
+ tso->id, tso, graphFingerPrint));
+
+ IF_PAR_DEBUG(packet,
+ belch("** PrintGraph of %p is:", closure);
+ belch("** pack_locn=%d", pack_locn);
+ PrintGraph(closure,0));
+
+ QueueClosure(closure);
+ do {
+ PackClosure(DeQueueClosure());
+ } while (!QueueEmpty());
+
+# if defined(PAR)
+
+ /* Record how much space the graph needs in packet and in heap */
+ globalPackBuffer->tso = tso; // currently unused, I think (debugging?)
+ globalPackBuffer->unpacked_size = unpacked_size;
+ globalPackBuffer->size = pack_locn;
+
+ /* Check for buffer overflow (again) */
+ ASSERT(pack_locn <= RtsFlags.ParFlags.packBufferSize+DEBUG_HEADROOM);
+ IF_DEBUG(sanity, // write magic end-of-buffer word
+ globalPackBuffer->buffer[pack_locn] = END_OF_BUFFER_MARKER);
+ *packBufferSize = pack_locn;
+
+# else /* GRAN */
+
+ /* Record how much space is needed to unpack the graph */
+ // PackBuffer[PACK_FLAG_LOCN] = (P_) MAGIC_PACK_FLAG; for testing
+ globalPackBuffer->tso = tso;
+ globalPackBuffer->unpacked_size = unpacked_size;
+
+ // ASSERT(pack_locn <= PackBuffer[PACK_SIZE_LOCN]+PACK_HDR_SIZE);
+ /* ToDo: Print an earlier, more meaningful message */
+ if (pack_locn==0) /* i.e. packet is empty */
+ barf("EMPTY PACKET! Can't transfer closure %p at all!!\n",
+ closure);
+ globalPackBuffer->size = pack_locn;
+ *packBufferSize = pack_locn;
+
+# endif
+
+ DonePacking(); /* {GrAnSim}vaD 'ut'Ha' */
+
+# if defined(GRAN)
+ IF_GRAN_DEBUG(pack ,
+ belch("** Finished <<%d>> packing graph %p; closures packed: %d; thunks packed: %d; size of graph: %d",
+ globalPackBuffer->id, closure, globalPackBuffer->size, packed_thunks, globalPackBuffer->unpacked_size));
+ if (RtsFlags.GranFlags.GranSimStats.Global) {
+ globalGranStats.tot_packets++;
+ globalGranStats.tot_packet_size += pack_locn;
+ }
+
+ IF_GRAN_DEBUG(pack, PrintPacket(globalPackBuffer));
+# elif defined(PAR)
+ IF_PAR_DEBUG(packet,
+ belch("** Finished <<%d>> packing graph %p (%s); closures packed: %d; thunks packed: %d; size of graph: %d",
+ globalPackBuffer->id, closure, info_type(closure),
+ globalPackBuffer->size, packed_thunks,
+ globalPackBuffer->unpacked_size));;
+
+ IF_DEBUG(sanity, // do a sanity check on the packet just constructed
+ checkPacket(globalPackBuffer));
+# endif /* GRAN */
+
+#if defined(PAR_TICKY) // HWL HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACK
+ PAR_TICKY_PACK_NEARBY_GRAPH_END(globalPackBuffer->size, packed_thunks);
+#endif
+
+ return (globalPackBuffer);
+}
+
+//@cindex PackOneNode
+
+# if defined(GRAN)
+/* This version is used when the node is already local */
+
+rtsPackBuffer *
+PackOneNode(closure, tso, packBufferSize)
+StgClosure* closure;
+StgTSO* tso;
+nat *packBufferSize;
+{
+ extern rtsPackBuffer *globalPackBuffer;
+ int i, clpack_locn;
+
+ InitPacking(rtsFalse);
+
+ IF_GRAN_DEBUG(pack,
+ belch("** PackOneNode: %p (%s)[PE %d] requested by TSO %d (%p) [PE %d]",
+ closure, info_type(closure),
+ where_is(closure), tso->id, tso, where_is((StgClosure *)tso)));
+
+ Pack(closure);
+
+ /* Record how much space is needed to unpack the graph */
+ globalPackBuffer->tso = tso;
+ globalPackBuffer->unpacked_size = unpacked_size;
+
+ /* Set the size parameter */
+ ASSERT(pack_locn <= RTS_PACK_BUFFER_SIZE);
+ globalPackBuffer->size = pack_locn;
+ *packBufferSize = pack_locn;
+
+ if (RtsFlags.GranFlags.GranSimStats.Global) {
+ globalGranStats.tot_packets++;
+ globalGranStats.tot_packet_size += pack_locn;
+ }
+ IF_GRAN_DEBUG(pack,
+ PrintPacket(globalPackBuffer));
+
+ return (globalPackBuffer);
+}
+# endif /* GRAN */
+
+#if defined(GRAN)
+
+/*
+ PackTSO and PackStkO are entry points for two special kinds of closure
+ which are used in the parallel RTS. Compared with other closures they
+ are rather awkward to pack because they don't follow the normal closure
+ layout (where all pointers occur before all non-pointers). Luckily,
+ they're only needed when migrating threads between processors. */
+
+//@cindex PackTSO
+rtsPackBuffer*
+PackTSO(tso, packBufferSize)
+StgTSO *tso;
+nat *packBufferSize;
+{
+ extern rtsPackBuffer *globalPackBuffer;
+ IF_GRAN_DEBUG(pack,
+ belch("** Packing TSO %d (%p)", tso->id, tso));
+ *packBufferSize = 0;
+ // PackBuffer[0] = PackBuffer[1] = 0; ???
+ return(globalPackBuffer);
+}
+
+//@cindex PackStkO
+static rtsPackBuffer*
+PackStkO(stko, packBufferSize)
+StgPtr stko;
+nat *packBufferSize;
+{
+ extern rtsPackBuffer *globalPackBuffer;
+ IF_GRAN_DEBUG(pack,
+ belch("** Packing STKO %p", stko));
+ *packBufferSize = 0;
+ // PackBuffer[0] = PackBuffer[1] = 0;
+ return(globalPackBuffer);
+}
+
+static void
+PackFetchMe(StgClosure *closure)
+{
+ barf("{PackFetchMe}Daq Qagh: no FetchMe closures in GRAN!");
+}
+
+#elif defined(PAR)
+
+static rtsPackBuffer*
+PackTSO(tso, packBufferSize)
+StgTSO *tso;
+nat *packBufferSize;
+{
+ barf("{PackTSO}Daq Qagh: trying to pack a TSO %d (%p) of size %d; thread migrations not supported, yet",
+ tso->id, tso, packBufferSize);
+}
+
+rtsPackBuffer*
+PackStkO(stko, packBufferSize)
+StgPtr stko;
+nat *packBufferSize;
+{
+ barf("{PackStkO}Daq Qagh: trying to pack a STKO (%p) of size %d; thread migrations not supported, yet",
+ stko, packBufferSize);
+}
+
+//@cindex PackFetchMe
+static void
+PackFetchMe(StgClosure *closure)
+{
+ StgInfoTable *ip;
+ nat i;
+ int offset;
+#if defined(DEBUG)
+ nat x = pack_locn;
+#endif
+
+#if defined(GRAN)
+ barf("{PackFetchMe}Daq Qagh: no FetchMe closures in GRAN!");
+#else
+ offset = OffsetFor(closure);
+ if (!NotYetPacking(offset)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing FETCH_ME for closure %p (s) as offset to %d",
+ closure, info_type(closure), offset));
+ PackOffset(offset);
+ // unpacked_size += 0; // unpacked_size unchanged (closure is shared!!)
+ return;
+ }
+
+ /* Need a GA even when packing a constructed FETCH_ME (cruel world!) */
+ AmPacking(closure);
+ /* FMs must be always globalised */
+ GlobaliseAndPackGA(closure);
+
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing FETCH_ME for closure %p (%s) with GA: ((%x, %d, %x))",
+ closure, info_type(closure),
+ globalPackBuffer->buffer[pack_locn-2],
+ globalPackBuffer->buffer[pack_locn-1],
+ globalPackBuffer->buffer[pack_locn-3]));
+
+ /* Pack a FetchMe closure instead of closure */
+ ip = &stg_FETCH_ME_info;
+ /* this assumes that the info ptr is always the first word in a closure*/
+ Pack((StgWord)ip);
+ for (i = 1; i < _HS; ++i) // pack rest of fixed header
+ Pack((StgWord)*(((StgPtr)closure)+i));
+
+ unpacked_size += sizeofW(StgFetchMe);
+ /* size of FETCHME in packed is the same as that constant */
+ ASSERT(pack_locn-x==PACK_FETCHME_SIZE);
+ /* In the pack buffer the pointer to a GA (in the FetchMe closure)
+ is expanded to the full GA; this is a compile-time const */
+ //ASSERT(PACK_FETCHME_SIZE == sizeofW(StgFetchMe)-1+PACK_GA_SIZE);
+#endif
+}
+
+#endif
+
+#ifdef DIST
+static void
+PackRemoteRef(StgClosure *closure)
+{
+ StgInfoTable *ip;
+ nat i;
+ int offset;
+
+ offset = OffsetFor(closure);
+ if (!NotYetPacking(offset)) {
+ PackOffset(offset);
+ unpacked_size += 2;
+ return;
+ }
+
+ /* Need a GA even when packing a constructed REMOTE_REF (cruel world!) */
+ AmPacking(closure);
+
+ /* basically we just Globalise, but for sticky things we can't have multiple GAs,
+ so we must prevent the GAs being split.
+
+ In returning things to the true sticky owner, this case is already handled, but for
+ anything else we just give up at the moment... This needs to be fixed!
+ */
+ { globalAddr *ga;
+ ga = LAGAlookup(closure); // surely this ga must exist?
+
+ // ***************************************************************************
+ // ***************************************************************************
+ // REMOTE_REF HACK - dual is in SetGAandCommonUp
+ // - prevents the weight from ever reaching zero
+ if(ga != NULL)
+ ga->weight=0x06660666; //anything apart from 0 really...
+ // ***************************************************************************
+ // ***************************************************************************
+
+ if((ga != NULL)&&(ga->weight / 2 <= 2))
+ barf("Cant split the weight any further when packing REMOTE_REF for closure %p (%s) with GA: ((%x, %d, %x))",
+ closure, info_type(closure),
+ ga->payload.gc.gtid, ga->payload.gc.slot, ga->weight);
+ }
+ GlobaliseAndPackGA(closure);
+
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing REMOTE_REF for closure %p (%s) with GA: ((%x, %d, %x))",
+ closure, info_type(closure),
+ globalPackBuffer->buffer[pack_locn-2],
+ globalPackBuffer->buffer[pack_locn-1],
+ globalPackBuffer->buffer[pack_locn-3]));
+
+ /* Pack a REMOTE_REF closure instead of closure */
+ ip = &stg_REMOTE_REF_info;
+ /* this assumes that the info ptr is always the first word in a closure*/
+ Pack((StgWord)ip);
+ for (i = 1; i < _HS; ++i) // pack rest of fixed header
+ Pack((StgWord)*(((StgPtr)closure)+i));
+
+ unpacked_size += PACK_FETCHME_SIZE;
+}
+#endif /* DIST */
+
+//@node Packing Closures, , Packing Sections of Nearby Graph, Packing Functions
+//@subsubsection Packing Closures
+/*
+ Packing Closures
+
+ @PackClosure@ is the heart of the normal packing code. It packs a single
+ closure into the pack buffer, skipping over any indirections and
+ globalising it as necessary, queues any child pointers for further
+ packing, and turns it into a @FetchMe@ or revertible black hole (@RBH@)
+ locally if it was a thunk. Before the actual closure is packed, a
+ suitable global address (GA) is inserted in the pack buffer. There is
+ always room to pack a fetch-me to the closure (guaranteed by the
+ RoomToPack calculation), and this is packed if there is no room for the
+ entire closure.
+
+ Space is allocated for any primitive array children of a closure, and
+ hence a primitive array can always be packed along with it's parent
+ closure. */
+
+//@cindex PackClosure
+
+# if defined(PAR)
+
+/* PackClosure (GUM/PAR version): pack one closure into the global pack
+   buffer, dispatching on its closure type.  Indirections are shorted out
+   first; a closure that is already in the buffer is packed as an offset
+   so that sharing is preserved when the graph is unpacked.  Thunks and
+   black-hole-like closures are shipped as FETCH_MEs; statics and small
+   int/char-likes become PLCs (pointers to well-known static closures). */
+void
+PackClosure(closure)
+StgClosure *closure;
+{
+ StgInfoTable *info;
+ nat clpack_locn;
+
+ ASSERT(LOOKS_LIKE_GHC_INFO(get_itbl(closure)));
+
+ closure = UNWIND_IND(closure);
+ /* now closure is the thing we want to pack */
+ info = get_itbl(closure);
+
+ clpack_locn = OffsetFor(closure);
+
+ /* If the closure has been packed already, just pack an indirection to it
+ to guarantee that the graph doesn't become a tree when unpacked */
+ if (!NotYetPacking(clpack_locn)) {
+ PackOffset(clpack_locn);
+ return;
+ }
+
+ switch (info->type) {
+
+ case CONSTR_CHARLIKE:
+ IF_PAR_DEBUG(pack,
+ belch("*>^^ Packing a charlike closure %d",
+ ((StgIntCharlikeClosure*)closure)->data));
+
+ PackPLC((StgPtr)CHARLIKE_CLOSURE(((StgIntCharlikeClosure*)closure)->data));
+ // NB: unpacked_size of a PLC is 0
+ return;
+
+ case CONSTR_INTLIKE:
+ {
+ StgInt val = ((StgIntCharlikeClosure*)closure)->data;
+
+ /* only ints inside the static INTLIKE table can be shared as PLCs */
+ if ((val <= MAX_INTLIKE) && (val >= MIN_INTLIKE)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>^^ Packing a small intlike %d as a PLC",
+ val));
+ PackPLC((StgPtr)INTLIKE_CLOSURE(val));
+ // NB: unpacked_size of a PLC is 0
+ return;
+ } else {
+ IF_PAR_DEBUG(pack,
+ belch("*>^^ Packing a big intlike %d as a normal closure",
+ val));
+ PackGeneric(closure);
+ return;
+ }
+ }
+
+ case CONSTR:
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_2_0:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ /* it's a constructor (i.e. plain data) */
+ IF_PAR_DEBUG(pack,
+ belch("*>^^ Packing a CONSTR %p (%s) using generic packing",
+ closure, info_type(closure)));
+ PackGeneric(closure);
+ return;
+
+ case THUNK_STATIC: // ToDo: check whether that's ok
+ case FUN_STATIC: // ToDo: check whether that's ok
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:// For now we ship indirections to CAFs: They are
+ // evaluated on each PE if needed
+ IF_PAR_DEBUG(pack,
+ belch("*>~~ Packing a %p (%s) as a PLC",
+ closure, info_type(closure)));
+
+ PackPLC((StgPtr)closure);
+ // NB: unpacked_size of a PLC is 0
+ return;
+
+ case THUNK_SELECTOR:
+ {
+ StgClosure *selectee = ((StgSelector *)closure)->selectee;
+
+ IF_PAR_DEBUG(pack,
+ belch("*>** Found THUNK_SELECTOR at %p (%s) pointing to %p (%s); using PackGeneric",
+ closure, info_type(closure),
+ selectee, info_type(selectee)));
+ PackGeneric(closure);
+ /* inlined code; probably could use PackGeneric
+ Pack((StgWord)(*(StgPtr)closure));
+ Pack((StgWord)(selectee));
+ QueueClosure(selectee);
+ unpacked_size += 2;
+ */
+ }
+ return;
+
+ case FUN:
+ case FUN_1_0:
+ case FUN_0_1:
+ case FUN_2_0:
+ case FUN_1_1:
+ case FUN_0_2:
+ case THUNK:
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_2_0:
+ case THUNK_1_1:
+ case THUNK_0_2:
+ PackGeneric(closure);
+ return;
+
+ case AP_UPD:
+ case PAP:
+ /*
+ barf("*> Packing of PAP not implemented %p (%s)",
+ closure, info_type(closure));
+
+ Currently we don't pack PAPs; we pack a FETCH_ME to the closure,
+ instead. Note that since PAPs contain a chunk of stack as payload,
+ implementing packing of PAPs is a first step towards thread migration.
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing a PAP closure at %p (%s) as a FETCH_ME",
+ closure, info_type(closure)));
+ PackFetchMe(closure);
+ */
+ PackPAP((StgPAP *)closure);
+ return;
+
+ case CAF_BLACKHOLE:
+ case BLACKHOLE:
+ case BLACKHOLE_BQ:
+ case SE_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case RBH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+
+ /* If it's a (revertible) black-hole, pack a FetchMe closure to it */
+ //ASSERT(pack_locn > PACK_HDR_SIZE);
+
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing a BH-like closure at %p (%s) as a FETCH_ME",
+ closure, info_type(closure)));
+ /* NB: in case of a FETCH_ME this might build up a chain of FETCH_MEs;
+ phps short-cut the GA here */
+ PackFetchMe(closure);
+ return;
+
+#ifdef DIST
+ case REMOTE_REF:
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing %p (%s) as a REMOTE_REF",
+ closure, info_type(closure)));
+ PackRemoteRef(closure);
+ /* we hopefully don't end up with a chain of REMOTE_REFs!!!!!!!!!! */
+
+ return;
+#endif
+
+ case TSO:
+ case MVAR:
+#ifdef DIST
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing %p (%s) as a RemoteRef",
+ closure, info_type(closure)));
+ PackRemoteRef(closure);
+#else
+ barf("{Pack}Daq Qagh: Only GdH can pack %p (%s)",
+ closure, info_type(closure));
+#endif
+ return;
+
+ case ARR_WORDS:
+ PackArray(closure);
+ return;
+
+ case MUT_ARR_PTRS:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_VAR:
+ /*
+ Eventually, this should use the same packing routine as ARR_WORDS
+
+ GlobaliseAndPackGA(closure);
+ PackArray(closure);
+ return;
+ */
+ barf("Qagh{Pack}Doq: packing of mutable closures not yet implemented: %p (%s)",
+ closure, info_type(closure));
+
+ /* NB: without DEBUG the error cases below are not compiled in and are
+ caught by the default barf instead */
+# ifdef DEBUG
+ case BCO:
+ barf("{Pack}Daq Qagh: found BCO closure %p (%s); GUM hates interpreted code",
+ closure, info_type(closure));
+ /* never reached */
+
+ // check error cases only in a debugging setup
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ case RET_DYN:
+ barf("{Pack}Daq Qagh: found return vector %p (%s) when packing (thread migration not implemented)",
+ closure, info_type(closure));
+ /* never reached */
+
+ case UPDATE_FRAME:
+ case STOP_FRAME:
+ case CATCH_FRAME:
+ case SEQ_FRAME:
+ barf("{Pack}Daq Qagh: found stack frame %p (%s) when packing (thread migration not implemented)",
+ closure, info_type(closure));
+ /* never reached */
+
+ case BLOCKED_FETCH:
+ case EVACUATED:
+ /* something's very wrong */
+ barf("{Pack}Daq Qagh: found %s (%p) when packing",
+ info_type(closure), closure);
+ /* never reached */
+
+ case IND:
+ case IND_OLDGEN:
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_STATIC:
+ barf("Pack: found IND_... after shorting out indirections %d (%s)",
+ (nat)(info->type), info_type(closure));
+
+ case WEAK:
+ case FOREIGN:
+ case STABLE_NAME:
+ barf("Pack: found foreign thingy; not yet implemented in %d (%s)",
+ (nat)(info->type), info_type(closure));
+#endif
+
+ default:
+ barf("Pack: strange closure %d", (nat)(info->type));
+ } /* switch */
+}
+
+/*
+ Pack a constructor of unknown size.
+ Similar to PackGeneric but without creating GAs.
+*/
+#if 0
+//@cindex PackConstr
+/* PackConstr: pack a constructor without allocating a global address;
+   a literal marker word 2 is packed in place of a GA (the same marker
+   PackGeneric uses for unglobalised closures).  NOTE: this function is
+   inside `#if 0' and therefore currently dead code; kept for reference.
+   NOTE(review): `info' is assigned but never read here. */
+static void
+PackConstr(StgClosure *closure)
+{
+ StgInfoTable *info;
+ nat size, ptrs, nonptrs, vhs, i;
+ char str[80];
+
+ ASSERT(LOOKS_LIKE_GHC_INFO(closure->header.info));
+
+ /* get info about basic layout of the closure */
+ info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+
+ ASSERT(info->type == CONSTR ||
+ info->type == CONSTR_1_0 ||
+ info->type == CONSTR_0_1 ||
+ info->type == CONSTR_2_0 ||
+ info->type == CONSTR_1_1 ||
+ info->type == CONSTR_0_2);
+
+ IF_PAR_DEBUG(pack,
+ fprintf(stderr, "*>^^ packing a constructor at %p (%s) (size=%d, ptrs=%d, nonptrs=%d)\n",
+ closure, info_type(closure), size, ptrs, nonptrs));
+
+ /* Primitive arrays have gone; now we have (MUT_)ARR_WORDS etc */
+
+ /* if the closure doesn't fit, ship a FETCH_ME instead (always fits) */
+ if (!RoomToPack(PACK_GA_SIZE + _HS + vhs + nonptrs, ptrs)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>&& pack buffer is full; packing FETCH_ME for closure %p (%s)",
+ closure, info_type(closure)));
+ PackFetchMe(closure);
+ return;
+ }
+
+ /* Record the location of the GA */
+ AmPacking(closure);
+
+ /* Pack Constructor marker */
+ Pack((StgWord)2);
+
+ /* pack fixed and variable header */
+ for (i = 0; i < _HS + vhs; ++i)
+ Pack((StgWord)*(((StgPtr)closure)+i));
+
+ /* register all ptrs for further packing */
+ for (i = 0; i < ptrs; ++i)
+ QueueClosure(((StgClosure *) *(((StgPtr)closure)+(_HS+vhs)+i)));
+
+ /* pack non-ptrs */
+ for (i = 0; i < nonptrs; ++i)
+ Pack((StgWord)*(((StgPtr)closure)+(_HS+vhs)+ptrs+i));
+}
+#endif
+
+/*
+ Generic packing code.
+ This code is performed for `ordinary' closures such as CONSTR, THUNK etc.
+*/
+//@cindex PackGeneric
+/* PackGeneric: pack an `ordinary' closure (CONSTR, THUNK, FUN, ...).
+   Layout in the buffer: GA (or marker word 2 for unglobalised closures),
+   fixed + variable header, then the non-pointer fields.  Pointer fields
+   are not packed inline; they are queued for later packing via
+   QueueClosure.  Packed thunks are turned into revertible black holes
+   (RBHs) locally so their value can be fetched back. */
+static void
+PackGeneric(StgClosure *closure)
+{
+ StgInfoTable *info;
+ StgClosure *rbh;
+ nat size, ptrs, nonptrs, vhs, i, m;
+ char str[80];
+
+ ASSERT(LOOKS_LIKE_COOL_CLOSURE(closure));
+
+ /* get info about basic layout of the closure */
+ info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+
+ ASSERT(!IS_BLACK_HOLE(closure));
+
+ IF_PAR_DEBUG(pack,
+ fprintf(stderr, "*>== %p (%s): generic packing (size=%d, ptrs=%d, nonptrs=%d)\n",
+ closure, info_type(closure), size, ptrs, nonptrs));
+
+ /* packing strategies: how many thunks to add to a packet;
+ default is infinity i.e. RtsFlags.ParFlags.thunksToPack==0 */
+ if (RtsFlags.ParFlags.thunksToPack &&
+ packed_thunks >= RtsFlags.ParFlags.thunksToPack &&
+ closure_THUNK(closure)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>&& refusing to pack more than %d thunks per packet; packing FETCH_ME for closure %p (%s)",
+ packed_thunks, closure, info_type(closure)));
+ PackFetchMe(closure);
+ return;
+ }
+
+ /* Primitive arrays have gone; now we have (MUT_)ARR_WORDS etc */
+
+ /* there is always room for a FETCH_ME (guaranteed by RoomToPack) */
+ if (!RoomToPack(PACK_GA_SIZE + _HS + vhs + nonptrs, ptrs)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>&& pack buffer is full; packing FETCH_ME for closure %p (%s)",
+ closure, info_type(closure)));
+ PackFetchMe(closure);
+ return;
+ }
+
+ /* Record the location of the GA */
+ AmPacking(closure);
+ /* Allocate a GA for this closure and put it into the buffer */
+ /* Checks for globalisation scheme; default: globalise every thunk */
+ if ( RtsFlags.ParFlags.globalising == 0 ||
+ (closure_THUNK(closure) && !closure_UNPOINTED(closure)) )
+ GlobaliseAndPackGA(closure);
+ else
+ Pack((StgWord)2); // marker for unglobalised closure
+
+
+ ASSERT(!(info->type == ARR_WORDS || info->type == MUT_ARR_PTRS ||
+ info->type == MUT_ARR_PTRS_FROZEN || info->type == MUT_VAR));
+
+ /* At last! A closure we can actually pack! */
+ /* BUGFIX: was `||' between the two inequality tests, which is a
+ tautology (a type cannot equal both), so *every* mutable closure
+ barfed here -- the intended exemption for FETCH_ME and REMOTE_REF
+ never applied.  Must be `&&'. */
+ if (ip_MUTABLE(info) && ((info->type != FETCH_ME)&&(info->type != REMOTE_REF)))
+ barf("*>// %p (%s) PackClosure: trying to replicate a Mutable closure!",
+ closure, info_type(closure));
+
+ /*
+ Remember, the generic closure layout is as follows:
+ +-------------------------------------------------+
+ | FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS |
+ +-------------------------------------------------+
+ */
+ /* pack fixed and variable header */
+ for (i = 0; i < _HS + vhs; ++i)
+ Pack((StgWord)*(((StgPtr)closure)+i));
+
+ /* register all ptrs for further packing */
+ for (i = 0; i < ptrs; ++i)
+ QueueClosure(((StgClosure *) *(((StgPtr)closure)+(_HS+vhs)+i)));
+
+ /* pack non-ptrs */
+ for (i = 0; i < nonptrs; ++i)
+ Pack((StgWord)*(((StgPtr)closure)+(_HS+vhs)+ptrs+i));
+
+ // ASSERT(_HS+vhs+ptrs+nonptrs==size);
+ if ((m=_HS+vhs+ptrs+nonptrs)<size) {
+ IF_PAR_DEBUG(pack,
+ belch("*>** WARNING: slop in closure %p (%s); filling %d words; SHOULD NEVER HAPPEN",
+ closure, info_type(closure), size-m));
+ for (i=m; i<size; i++)
+ Pack((StgWord)*(((StgPtr)closure)+i));
+ }
+
+ unpacked_size += size;
+ //unpacked_size += (size < MIN_UPD_SIZE) ? MIN_UPD_SIZE : size;
+
+ /*
+ * Record that this is a revertable black hole so that we can fill in
+ * its address from the fetch reply. Problem: unshared thunks may cause
+ * space leaks this way, their GAs should be deallocated following an
+ * ACK.
+ */
+
+ if (closure_THUNK(closure) && !closure_UNPOINTED(closure)) {
+ rbh = convertToRBH(closure);
+ ASSERT(size>=_HS+MIN_UPD_SIZE); // min size for any updatable closure
+ ASSERT(rbh == closure); // rbh at the same position (minced version)
+ packed_thunks++;
+ } else if ( closure==graph_root ) {
+ packed_thunks++; // root of graph is counted as a thunk
+ }
+}
+/*
+ Pack an array of words.
+ ToDo: implement packing of MUT_ARRAYs
+*/
+
+//@cindex PackArray
+/* PackArray: pack an array of words inline (GA/marker, then info ptr and
+   word count, then the raw payload).  NOTE(review): the ASSERT below also
+   admits MUT_ARR_PTRS/MUT_VAR, but PackClosure currently barfs on those
+   before reaching here, so in practice only ARR_WORDS arrives -- confirm
+   before extending to mutable arrays (the payload loop copies words, not
+   pointers). */
+static void
+PackArray(StgClosure *closure)
+{
+ StgInfoTable *info;
+ nat size, ptrs, nonptrs, vhs;
+ nat i, n;
+ char str[80];
+
+ /* get info about basic layout of the closure */
+ info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+
+ ASSERT(info->type == ARR_WORDS || info->type == MUT_ARR_PTRS ||
+ info->type == MUT_ARR_PTRS_FROZEN || info->type == MUT_VAR);
+
+ n = ((StgArrWords *)closure)->words;
+ // this includes the header!: arr_words_sizeW(stgCast(StgArrWords*,q));
+
+ IF_PAR_DEBUG(pack,
+ belch("*>== %p (%s): packing an array of %d words (size=%d)\n",
+ closure, info_type(closure), n,
+ arr_words_sizeW((StgArrWords *)closure)));
+
+ /* check that we have enough room in the pack buffer */
+ if (!RoomToPack(PACK_GA_SIZE + _HS + vhs + nonptrs, ptrs)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>&& pack buffer is full; packing FETCH_ME for closure %p (%s)",
+ closure, info_type(closure)));
+ PackFetchMe(closure);
+ return;
+ }
+
+ /* global stats about arrays sent */
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_arrs++;
+ globalParStats.tot_arr_size += ((StgArrWords *)closure)->words;
+ }
+
+ /* record offset of the closure and allocate a GA */
+ AmPacking(closure);
+ /* Checks for globalisation scheme; default: globalise everything thunks */
+ if ( RtsFlags.ParFlags.globalising == 0 ||
+ (closure_THUNK(closure) && !closure_UNPOINTED(closure)) )
+ GlobaliseAndPackGA(closure);
+ else
+ Pack((StgWord)2); // marker for unglobalised closure
+
+ /* Pack the header (2 words: info ptr and the number of words to follow) */
+ Pack((StgWord)*(StgPtr)closure);
+ Pack(((StgArrWords *)closure)->words);
+
+ /* pack the payload of the closure (all non-ptrs) */
+ for (i=0; i<n; i++)
+ Pack((StgWord)((StgArrWords *)closure)->payload[i]);
+
+ /* unlike a PLC/FETCH_ME, the whole array counts towards unpacked_size */
+ unpacked_size += arr_words_sizeW((StgArrWords *)closure);
+}
+
+/*
+ Pack a PAP closure.
+ Note that the representation of a PAP in the buffer is different from
+ its representation in the heap. In particular, pointers to local
+ closures are packed directly as FETCHME closures, using
+ PACK_FETCHME_SIZE words to represent a 1 word pointer in the original graph
+ structure. To account for the difference in size we store the packed
+ size of the closure as part of the PAP's variable header in the buffer.
+*/
+
+//@cindex PackPAP
+/* PackPAP: pack a PAP closure.  Layout in the buffer: GA (or marker),
+   PAP header (info ptr, n_args, fun), a size word (patched in at the
+   end), then the stack-chunk payload.  Heap pointers in the payload are
+   packed as FETCH_MEs preceded by the tag word ARGTAG_MAX+1; statics
+   become PLCs; activation records are walked via their bitmap layout.
+   BUGFIX: the UPDATE_FRAME, STOP_FRAME and CATCH_FRAME cases used to
+   fall through into each other (no `continue'), repacking headers and
+   mis-advancing p; each now continues the payload loop explicitly. */
+static void
+PackPAP(StgPAP *pap) {
+ nat n, i, j, pack_start;
+ StgPtr p, q;
+ const StgInfoTable* info;
+ StgWord bitmap;
+ /* debugging only */
+ StgPtr end;
+ nat size, ptrs, nonptrs, vhs;
+ char str[80];
+ nat unpacked_size_before_PAP, FMs_in_PAP=0; // debugging only
+
+ /* This is actually a setup invariant; checked here 'cause it affects PAPs*/
+ //ASSERT(PACK_FETCHME_SIZE == sizeofW(StgFetchMe)-1+PACK_GA_SIZE);
+ ASSERT(NotYetPacking(OffsetFor((StgClosure *)pap)));
+ IF_DEBUG(sanity,
+ unpacked_size_before_PAP = unpacked_size);
+
+ n = (nat)(pap->n_args);
+
+ /* get info about basic layout of the closure */
+ info = get_closure_info((StgClosure *)pap, &size, &ptrs, &nonptrs, &vhs, str);
+ ASSERT(ptrs==0 && nonptrs==0 && size==pap_sizeW(pap));
+
+ IF_PAR_DEBUG(pack,
+ belch("*>** %p (%s): PackPAP: packing PAP with %d words (size=%d; ptrs=%d; nonptrs=%d:",
+ (StgClosure *)pap, info_type((StgClosure *)pap),
+ n, size, ptrs, nonptrs);
+ printClosure((StgClosure *)pap));
+
+ /* check that we have enough room in the pack buffer */
+ if (!RoomToPack(PACK_GA_SIZE + _HS + vhs + nonptrs, ptrs)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>&& pack buffer is full; packing FETCH_ME for closure %p (%s)",
+ (StgClosure *)pap, info_type((StgClosure *)pap)));
+ PackFetchMe((StgClosure *)pap);
+ return;
+ }
+
+ /* record offset of the closure and allocate a GA */
+ AmPacking((StgClosure *)pap);
+ /* Checks for globalisation scheme; default: globalise everything thunks */
+ if ( RtsFlags.ParFlags.globalising == 0 ||
+ (closure_THUNK(pap) && !closure_UNPOINTED(pap)) )
+ GlobaliseAndPackGA((StgClosure *)pap);
+ else
+ Pack((StgWord)2); // marker for unglobalised closure
+
+ /* Pack the PAP header */
+ Pack((StgWord)(pap->header.info));
+ Pack((StgWord)(pap->n_args));
+ Pack((StgWord)(pap->fun));
+ pack_start = pack_locn; // to compute size of PAP in buffer
+ Pack((StgWord)0); // this will be filled in later (size of PAP in buffer)
+
+ /* Pack the payload of a PAP i.e. a stack chunk */
+ /* pointers to start of stack chunk */
+ p = (StgPtr)(pap->payload);
+ end = (StgPtr)((nat)pap+pap_sizeW(pap)*sizeof(StgWord)); // (StgPtr)((nat)pap+sizeof(StgPAP)+sizeof(StgPtr)*n);
+ while (p<end) {
+ /* the loop body has been borrowed from scavenge_stack */
+ q = (StgPtr)*p;
+
+ /* If we've got a tag, pack all words in that block */
+ if (IS_ARG_TAG((W_)q)) { // q stands for the no. of non-ptrs to follow
+ nat m = ARG_TAG((W_)q); // first word after this block
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: packing %d words (tagged), starting @ %p",
+ p, m, p));
+ for (i=0; i<m+1; i++)
+ Pack((StgWord)*(p+i));
+ p += m+1; // m words + the tag
+ continue;
+ }
+
+ /* If q is is a pointer to a (heap allocated) closure we pack a FETCH_ME
+ ToDo: provide RTS flag to also pack these closures
+ */
+ if (! LOOKS_LIKE_GHC_INFO(q) ) {
+ /* distinguish static closure (PLC) from other closures (FM) */
+ switch (get_itbl((StgClosure*)q)->type) {
+ case CONSTR_CHARLIKE:
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP: packing a charlike closure %d",
+ ((StgIntCharlikeClosure*)q)->data));
+
+ PackPLC((StgPtr)CHARLIKE_CLOSURE(((StgIntCharlikeClosure*)q)->data));
+ p++;
+ break;
+
+ case CONSTR_INTLIKE:
+ {
+ StgInt val = ((StgIntCharlikeClosure*)q)->data;
+
+ if ((val <= MAX_INTLIKE) && (val >= MIN_INTLIKE)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP: Packing ptr to a small intlike %d as a PLC", val));
+ PackPLC((StgPtr)INTLIKE_CLOSURE(val));
+ p++;
+ break;
+ } else {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP: Packing a ptr to a big intlike %d as a FM",
+ val));
+ Pack((StgWord)(ARGTAG_MAX+1));
+ PackFetchMe((StgClosure *)q);
+ p++;
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ break;
+ }
+ }
+ case THUNK_STATIC: // ToDo: check whether that's ok
+ case FUN_STATIC: // ToDo: check whether that's ok
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP: packing a ptr to a %p (%s) as a PLC",
+ q, info_type((StgClosure *)q)));
+
+ PackPLC((StgPtr)q);
+ p++;
+ break;
+ }
+ default:
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: packing FM to %p (%s)",
+ p, q, info_type((StgClosure*)q)));
+ Pack((StgWord)(ARGTAG_MAX+1));
+ PackFetchMe((StgClosure *)q);
+ p++;
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * Otherwise, q must be the info pointer of an activation
+ * record. All activation records have 'bitmap' style layout
+ * info.
+ */
+ info = get_itbl((StgClosure *)p);
+ switch (info->type) {
+
+ /* Dynamic bitmap: the mask is stored on the stack */
+ case RET_DYN:
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: RET_DYN",
+ p));
+
+ /* Pack the header as is */
+ Pack((StgWord)(((StgRetDyn *)p)->info));
+ Pack((StgWord)(((StgRetDyn *)p)->liveness));
+ Pack((StgWord)(((StgRetDyn *)p)->ret_addr));
+
+ bitmap = ((StgRetDyn *)p)->liveness;
+ p = (P_)&((StgRetDyn *)p)->payload[0];
+ goto small_bitmap;
+
+ /* probably a slow-entry point return address: */
+ case FUN:
+ case FUN_STATIC:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: FUN or FUN_STATIC",
+ p));
+
+ Pack((StgWord)(((StgClosure *)p)->header.info));
+ p++;
+
+ goto follow_srt; //??
+ }
+
+ /* Using generic code here; could inline as in scavenge_stack */
+ case UPDATE_FRAME:
+ {
+ StgUpdateFrame *frame = (StgUpdateFrame *)p;
+ nat type = get_itbl(frame->updatee)->type;
+
+ ASSERT(type==BLACKHOLE || type==CAF_BLACKHOLE || type==BLACKHOLE_BQ);
+
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: UPDATE_FRAME (updatee=%p; link=%p)",
+ p, frame->updatee, frame->link));
+
+ Pack((StgWord)(frame->header.info));
+ Pack((StgWord)(frame->link)); // ToDo: fix intra-stack pointer
+ Pack((StgWord)(frame->updatee)); // ToDo: follow link
+
+ p += 3;
+ continue; // BUGFIX: was falling through into the STOP_FRAME case
+ }
+
+ /* small bitmap (< 32 entries, or 64 on a 64-bit machine) */
+ case STOP_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: STOP_FRAME",
+ p));
+ Pack((StgWord)((StgStopFrame *)p)->header.info);
+ p++;
+ continue; // BUGFIX: was falling through into the CATCH_FRAME case
+ }
+
+ case CATCH_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: CATCH_FRAME (handler=%p)",
+ p, ((StgCatchFrame *)p)->handler));
+
+ Pack((StgWord)((StgCatchFrame *)p)->header.info);
+ Pack((StgWord)((StgCatchFrame *)p)->link); // ToDo: fix intra-stack pointer
+ Pack((StgWord)((StgCatchFrame *)p)->exceptions_blocked);
+ Pack((StgWord)((StgCatchFrame *)p)->handler);
+ p += 4;
+ continue; // BUGFIX: was falling through into the SEQ_FRAME case
+ }
+
+ case SEQ_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: SEQ_FRAME (link=%p)",
+ p, ((StgSeqFrame *)p)->link));
+
+ Pack((StgWord)((StgSeqFrame *)p)->header.info);
+ Pack((StgWord)((StgSeqFrame *)p)->link); // ToDo: fix intra-stack pointer
+
+ // ToDo: handle bitmap
+ bitmap = info->layout.bitmap;
+
+ p = (StgPtr)&(((StgClosure *)p)->payload);
+ goto small_bitmap;
+ }
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: RET_{BCO,SMALL,VEC_SMALL} (bitmap=%o)",
+ p, info->layout.bitmap));
+
+
+ Pack((StgWord)((StgClosure *)p)->header.info);
+ p++;
+ // ToDo: handle bitmap
+ bitmap = info->layout.bitmap;
+ /* this assumes that the payload starts immediately after the info-ptr */
+
+ small_bitmap:
+ /* a clear bit means `pointer': pack a FETCH_ME to that closure */
+ while (bitmap != 0) {
+ if ((bitmap & 1) == 0) {
+ Pack((StgWord)(ARGTAG_MAX+1));
+ PackFetchMe((StgClosure *)*p++); // pack a FetchMe to the closure
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ } else {
+ Pack((StgWord)*p++);
+ }
+ bitmap = bitmap >> 1;
+ }
+
+ follow_srt:
+ IF_PAR_DEBUG(pack,
+ belch("*>-- PackPAP: nothing to do for follow_srt"));
+ continue;
+
+ /* large bitmap (> 32 entries) */
+ case RET_BIG:
+ case RET_VEC_BIG:
+ {
+ StgPtr q; /* NB: shadows the outer q deliberately */
+ StgLargeBitmap *large_bitmap;
+
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: RET_{BIG,VEC_BIG} (large_bitmap=%p)",
+ p, info->layout.large_bitmap));
+
+
+ Pack((StgWord)((StgClosure *)p)->header.info);
+ p++;
+
+ large_bitmap = info->layout.large_bitmap;
+
+ for (j=0; j<large_bitmap->size; j++) {
+ bitmap = large_bitmap->bitmap[j];
+ q = p + BITS_IN(W_);
+ while (bitmap != 0) {
+ if ((bitmap & 1) == 0) {
+ Pack((StgWord)(ARGTAG_MAX+1));
+ PackFetchMe((StgClosure *)*p++); // ToDo: pack pointer(StgClosure *)*p = evacuate((StgClosure *)*p);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ } else {
+ Pack((StgWord)*p++);
+ }
+ bitmap = bitmap >> 1;
+ }
+ if (j+1 < large_bitmap->size) {
+ while (p < q) {
+ Pack((StgWord)(ARGTAG_MAX+1));
+ PackFetchMe((StgClosure *)*p++); // ToDo: pack pointer (StgClosure *)*p = evacuate((StgClosure *)*p);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ }
+ }
+ }
+
+ /* and don't forget to follow the SRT */
+ goto follow_srt;
+ }
+
+ default:
+ barf("PackPAP: weird activation record found on stack (@ %p): %d",
+ p, (int)(info->type));
+ }
+ }
+ // fill in size of the PAP (only the payload!) in buffer
+ globalPackBuffer->buffer[pack_start] = (StgWord)(pack_locn - pack_start - 1*sizeofW(StgWord));
+ /*
+ We can use the generic pap_sizeW macro to compute the size of the
+ unpacked PAP because whenever we pack a new FETCHME as part of the
+ PAP's payload we also adjust unpacked_size accordingly (smart, aren't we?)
+
+ NB: the current PAP (un-)packing code relies on the fact that
+ the size of the unpacked PAP + size of all unpacked FMs is the same as
+ the size of the packed PAP!!
+ */
+ unpacked_size += pap_sizeW(pap); // sizeofW(pap) + (nat)(globalPackBuffer->buffer[pack_start]);
+ IF_DEBUG(sanity,
+ ASSERT(unpacked_size-unpacked_size_before_PAP==pap_sizeW(pap)+FMs_in_PAP*sizeofW(StgFetchMe)));
+}
+# else /* GRAN */
+
+/* Fake the packing of a closure */
+
+/* PackClosure (GranSim version): simulate packing.  Nothing is really
+   copied into a buffer; we only record the closure (via Pack), queue its
+   pointer fields, update unpacked_size and turn thunks into RBHs, so
+   that the simulation costs match GUM.
+   Fixes vs. the previous version: belch() format strings now match their
+   arguments (%p for pointers -- mismatched conversion specifiers are
+   undefined behaviour); the empty `default:' label at the end of the
+   switch gets a `break' (a label must precede a statement in C); unused
+   locals childInfo, info_hdr_ty and indirectee removed. */
+void
+PackClosure(closure)
+StgClosure *closure;
+{
+ StgInfoTable *info;
+ nat size, ptrs, nonptrs, vhs;
+ nat i;
+ StgClosure *rbh;
+ char str[80];
+ rtsBool is_mutable, will_be_rbh, no_more_thunks_please;
+
+ is_mutable = rtsFalse; /* NOTE: write-only at present; closure_MUTABLE is tested below */
+
+ /* In GranSim we don't pack and unpack closures -- we just simulate
+ packing by updating the bitmask. So, the graph structure is unchanged
+ i.e. we don't short out indirections here. -- HWL */
+
+ /* Nothing to do with packing but good place to (sanity) check closure;
+ if the closure is a thunk, it must be unique; otherwise we have copied
+ work at some point before that which violates one of our main global
+ assertions in GranSim/GUM */
+ ASSERT(!closure_THUNK(closure) || is_unique(closure));
+
+ IF_GRAN_DEBUG(pack,
+ belch("** Packing closure %p (%s)",
+ closure, info_type(closure)));
+
+ if (where_is(closure) != where_is(graph_root)) {
+ IF_GRAN_DEBUG(pack,
+ belch("** faking a FETCHME [current PE: %d, closure's PE: %d]",
+ where_is(graph_root), where_is(closure)));
+
+ /* GUM would pack a FETCHME here; simulate that by increasing the */
+ /* unpacked size accordingly but don't pack anything -- HWL */
+ unpacked_size += _HS + 2 ; // sizeofW(StgFetchMe);
+ return;
+ }
+
+ /* If the closure's not already being packed */
+ if (!NotYetPacking(closure))
+ /* Don't have to do anything in GrAnSim if closure is already */
+ /* packed -- HWL */
+ {
+ IF_GRAN_DEBUG(pack,
+ belch("** Closure %p is already packed and omitted now!",
+ closure));
+ return;
+ }
+
+ switch (get_itbl(closure)->type) {
+ /* ToDo: check for sticky bit here? */
+ /* BH-like closures which must not be moved to another PE */
+ case CAF_BLACKHOLE: /* # of ptrs, nptrs: 0,2 */
+ case SE_BLACKHOLE: /* # of ptrs, nptrs: 0,2 */
+ case SE_CAF_BLACKHOLE: /* # of ptrs, nptrs: 0,2 */
+ case BLACKHOLE: /* # of ptrs, nptrs: 0,2 */
+ case BLACKHOLE_BQ: /* # of ptrs, nptrs: 1,1 */
+ case RBH: /* # of ptrs, nptrs: 1,1 */
+ /* same for these parallel specific closures */
+ case BLOCKED_FETCH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ IF_GRAN_DEBUG(pack,
+ belch("** Avoid packing BH-like closures (%p, %s)!",
+ closure, info_type(closure)));
+ /* Just ignore RBHs i.e. they stay where they are */
+ return;
+
+ case THUNK_SELECTOR:
+ {
+ StgClosure *selectee = ((StgSelector *)closure)->selectee;
+
+ IF_GRAN_DEBUG(pack,
+ belch("** Avoid packing THUNK_SELECTOR (%p, %s) but queuing %p (%s)!",
+ closure, info_type(closure), selectee, info_type(selectee)));
+ QueueClosure(selectee);
+ IF_GRAN_DEBUG(pack,
+ belch("** [%p (%s) (Queueing closure) ....]",
+ selectee, info_type(selectee)));
+ }
+ return;
+
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:
+ /* For now we ship indirections to CAFs:
+ * They are evaluated on each PE if needed */
+ IF_GRAN_DEBUG(pack,
+ belch("** Nothing to pack for %p (%s)!",
+ closure, info_type(closure)));
+ // Pack(closure); GUM only
+ return;
+
+ case CONSTR_CHARLIKE:
+ case CONSTR_INTLIKE:
+ IF_GRAN_DEBUG(pack,
+ /* BUGFIX: format was "%s (%p)" with (pointer, string) arguments */
+ belch("** Nothing to pack for %p (%s)!",
+ closure, info_type(closure)));
+ // PackPLC(((StgIntCharlikeClosure *)closure)->data); GUM only
+ return;
+
+ case AP_UPD:
+ case PAP:
+ /* partial applications; special treatment necessary? */
+ break;
+
+ case MVAR:
+ barf("{PackClosure}Daq Qagh: found an MVAR (%p, %s); ToDo: implement proper treatment of MVARs",
+ closure, info_type(closure));
+
+ case ARR_WORDS:
+ case MUT_VAR:
+ case MUT_ARR_PTRS:
+ case MUT_ARR_PTRS_FROZEN:
+ /* Mutable objects; require special treatment to ship all data */
+ is_mutable = rtsTrue;
+ break;
+
+ case WEAK:
+ case FOREIGN:
+ case STABLE_NAME:
+ /* weak pointers and other FFI objects */
+ barf("{PackClosure}Daq Qagh: found an FFI object (%p, %s); FFI not yet supported by GranSim, sorry",
+ closure, info_type(closure));
+
+ case TSO:
+ /* parallel objects */
+ barf("{PackClosure}Daq Qagh: found a TSO when packing (%p, %s); thread migration not yet implemented, sorry",
+ closure, info_type(closure));
+
+ case BCO:
+ /* Hugs objects (i.e. closures used by the interpreter) */
+ barf("{PackClosure}Daq Qagh: found a Hugs closure when packing (%p, %s); GranSim not yet integrated with Hugs, sorry",
+ closure, info_type(closure));
+
+ case IND: /* # of ptrs, nptrs: 1,0 */
+ case IND_STATIC: /* # of ptrs, nptrs: 1,0 */
+ case IND_PERM: /* # of ptrs, nptrs: 1,1 */
+ case IND_OLDGEN: /* # of ptrs, nptrs: 1,1 */
+ case IND_OLDGEN_PERM: /* # of ptrs, nptrs: 1,1 */
+ /* we shouldn't find an indirection here, because we have shorted them
+ out at the beginning of this functions already.
+ */
+ break;
+ /* should be:
+ barf("{PackClosure}Daq Qagh: found indirection when packing (%p, %s)",
+ closure, info_type(closure));
+ */
+
+ case UPDATE_FRAME:
+ case CATCH_FRAME:
+ case SEQ_FRAME:
+ case STOP_FRAME:
+ /* stack frames; should never be found when packing for now;
+ once we support thread migration these have to be covered properly
+ */
+ barf("{PackClosure}Daq Qagh: found stack frame when packing (%p, %s)",
+ closure, info_type(closure));
+
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ case RET_DYN:
+ /* vectored returns; should never be found when packing; */
+ barf("{PackClosure}Daq Qagh: found vectored return (%p, %s)",
+ closure, info_type(closure));
+
+ case INVALID_OBJECT:
+ barf("{PackClosure}Daq Qagh: found Invalid object (%p, %s)",
+ closure, info_type(closure));
+
+ default:
+ /*
+ Here we know that the closure is a CONSTR, FUN or THUNK (maybe
+ a specialised version with wired in #ptr/#nptr info; currently
+ we treat these specialised versions like the generic version)
+ */
+ break; /* BUGFIX: a label may not end a compound statement in C */
+ } /* switch */
+
+ /* Otherwise it's not Fixed */
+
+ info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+ will_be_rbh = closure_THUNK(closure) && !closure_UNPOINTED(closure);
+
+ IF_GRAN_DEBUG(pack,
+ /* BUGFIX: format had 4 specifiers for 5 arguments; added "; %s" */
+ belch("** Info on closure %p (%s): size=%d; ptrs=%d; %s",
+ closure, info_type(closure),
+ size, ptrs,
+ (will_be_rbh) ? "will become RBH" : "will NOT become RBH"));
+
+ // check whether IS_UPDATABLE(closure) == !closure_UNPOINTED(closure) -- HWL
+ no_more_thunks_please =
+ (RtsFlags.GranFlags.ThunksToPack>0) &&
+ (packed_thunks>=RtsFlags.GranFlags.ThunksToPack);
+
+ /*
+ should be covered by get_closure_info
+ if (info->type == FETCH_ME || info->type == FETCH_ME_BQ ||
+ info->type == BLACKHOLE || info->type == RBH )
+ size = ptrs = nonptrs = vhs = 0;
+ */
+ /* Now peek ahead to see whether the closure has any primitive */
+ /* array children */
+ /*
+ ToDo: fix this code
+ for (i = 0; i < ptrs; ++i) {
+ P_ childInfo;
+ W_ childSize, childPtrs, childNonPtrs, childVhs;
+
+ childInfo = get_closure_info(((StgPtrPtr) (closure))[i + _HS + vhs],
+ &childSize, &childPtrs, &childNonPtrs,
+ &childVhs, junk_str);
+ if (IS_BIG_MOTHER(childInfo)) {
+ reservedPAsize += PACK_GA_SIZE + _HS +
+ childVhs + childNonPtrs +
+ childPtrs * PACK_FETCHME_SIZE;
+ PAsize += PACK_GA_SIZE + _HS + childSize;
+ PAptrs += childPtrs;
+ }
+ }
+ */
+ /* Don't pack anything (GrAnSim) if it's a black hole, or the buffer
+ * is full and it isn't a primitive array. N.B. Primitive arrays are
+ * always packed (because their parents index into them directly) */
+
+ if (IS_BLACK_HOLE(closure))
+ /*
+ ToDo: fix this code
+ ||
+ !(RoomToPack(PACK_GA_SIZE + _HS + vhs + nonptrs, ptrs)
+ || IS_BIG_MOTHER(info)))
+ */
+ return;
+
+ /* At last! A closure we can actually pack! */
+
+ if (closure_MUTABLE(closure)) // not nec. && (info->type != FETCHME))
+ belch("ghuH: Replicated a Mutable closure!");
+
+ if (RtsFlags.GranFlags.GranSimStats.Global &&
+ no_more_thunks_please && will_be_rbh) {
+ globalGranStats.tot_cuts++;
+ if ( RtsFlags.GranFlags.Debug.pack )
+ /* BUGFIX: closure is a pointer; %#x was undefined behaviour */
+ belch("** PackClosure (w/ ThunksToPack=%d): Cutting tree with root at %p\n",
+ RtsFlags.GranFlags.ThunksToPack, closure);
+ } else if (will_be_rbh || (closure==graph_root) ) {
+ packed_thunks++;
+ globalGranStats.tot_thunks++;
+ }
+
+ if (no_more_thunks_please && will_be_rbh)
+ return; /* don't pack anything */
+
+ /* actual PACKING done here -- HWL */
+ Pack(closure);
+ for (i = 0; i < ptrs; ++i) {
+ /* extract i-th pointer from closure */
+ QueueClosure((StgClosure *)(closure->payload[i]));
+ IF_GRAN_DEBUG(pack,
+ belch("** [%p (%s) (Queueing closure) ....]",
+ closure->payload[i],
+ info_type(*stgCast(StgPtr*,((closure)->payload+(i))))));
+ //^^^^^^^^^^^ payloadPtr(closure,i))));
+ }
+
+ /*
+ for packing words (GUM only) do something like this:
+
+ for (i = 0; i < ptrs; ++i) {
+ Pack(payloadWord(obj,i+j));
+ }
+ */
+ /* Turn thunk into a revertible black hole. */
+ if (will_be_rbh) {
+ rbh = convertToRBH(closure);
+ ASSERT(rbh != NULL);
+ }
+}
+# endif /* PAR */
+
+//@node Low level packing routines, Unpacking routines, Packing Functions, Graph packing
+//@subsection Low level packing routines
+
+/*
+ @Pack@ is the basic packing routine. It just writes a word of data into
+ the pack buffer and increments the pack location. */
+
+//@cindex Pack
+
+# if defined(PAR)
+/* Basic GUM packing routine: append one word of data to the global
+   pack buffer and advance the pack location.  Asserts that the
+   buffer (whose capacity is RtsFlags.ParFlags.packBufferSize) is not
+   already full; callers are expected to have checked RoomToPack. */
+static void
+Pack(data)
+StgWord data;
+{
+  ASSERT(pack_locn < RtsFlags.ParFlags.packBufferSize);
+  globalPackBuffer->buffer[pack_locn++] = data;
+}
+#endif
+
+#if defined(GRAN)
+/* GranSim variant of Pack: record one closure pointer in the internal
+   pack buffer (growing it if necessary) and account for the heap size
+   the closure will occupy when unpacked.  The *simulated* pack buffer
+   limit is checked elsewhere via RoomToPack, as in GUM. */
+static void
+Pack(closure)
+StgClosure *closure;
+{
+  StgInfoTable *info;
+  nat size, ptrs, nonptrs, vhs;
+  char str[80];
+
+  /* This checks the size of the GrAnSim internal pack buffer. The simulated
+     pack buffer is checked via RoomToPack (as in GUM) */
+  if (pack_locn >= (int)globalPackBuffer->size+sizeofW(rtsPackBuffer))
+    reallocPackBuffer();
+
+  if (closure==(StgClosure*)NULL)
+    belch("Qagh {Pack}Daq: Trying to pack 0");
+  globalPackBuffer->buffer[pack_locn++] = closure;
+  /* ASSERT: Data is a closure in GrAnSim here */
+  info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+  /* ToDo: is check for MIN_UPD_SIZE really needed? */
+  /* account at least MIN_UPD_SIZE payload words so that the closure
+     could later be overwritten with an indirection */
+  unpacked_size += _HS + (size < MIN_UPD_SIZE ?
+			  MIN_UPD_SIZE :
+			  size);
+}
+# endif /* GRAN */
+
+/*
+ If a closure is local, make it global. Then, divide its weight for
+ export. The GA is then packed into the pack buffer. */
+
+# if defined(PAR)
+//@cindex GlobaliseAndPackGA
+/* If @closure@ is local, globalise it (make a GA for it); then emit a
+   GA for export into the pack buffer as three words: weight, gtid,
+   slot.  If the closure is being sent back to the PE that owns it
+   (dest_gtid), the sentinel weight 0xFFFFFFFF is packed instead of
+   splitting the weight (0, 1 and 2 are already taken as markers). */
+static void
+GlobaliseAndPackGA(closure)
+StgClosure *closure;
+{
+  globalAddr *ga;
+  globalAddr packGA;
+
+  if ((ga = LAGAlookup(closure)) == NULL) {
+    ga = makeGlobal(closure, rtsTrue);
+
+    // Global statistics: increase amount of global data by closure-size
+    if (RtsFlags.ParFlags.ParStats.Global &&
+	RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+      StgInfoTable *info;
+      nat size, ptrs, nonptrs, vhs, i, m; // stats only!!  (i, m unused here)
+      char str[80]; // stats only!!
+
+      info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+      globalParStats.tot_global += size;
+    }
+  }
+  /* weight must still be divisible for export */
+  ASSERT(ga->weight==MAX_GA_WEIGHT || ga->weight > 2);
+
+  if(dest_gtid==ga->payload.gc.gtid)
+  {  packGA.payload = ga->payload;
+     packGA.weight = 0xFFFFFFFF; // 0,1,2 are used already
+  }
+  else
+  { splitWeight(&packGA, ga);
+    ASSERT(packGA.weight > 0);
+  }
+
+  IF_PAR_DEBUG(pack,
+	       fprintf(stderr, "*>## %p (%s): Globalising (%s) closure with GA ",
+		       closure, info_type(closure),
+		       ( (ga->payload.gc.gtid==dest_gtid)?"returning":
+		       ( (ga->payload.gc.gtid==mytid)?"creating":"sharing" ) ));
+	       printGA(&packGA);
+	       fputc('\n', stderr));
+
+
+  Pack((StgWord) packGA.weight);
+  Pack((StgWord) packGA.payload.gc.gtid);
+  Pack((StgWord) packGA.payload.gc.slot);
+}
+
+/*
+ @PackPLC@ makes up a bogus GA for a PLC. Weight 0 implies that a PLC
+ address follows instead of PE, slot. */
+
+//@cindex PackPLC
+
+/* Pack a "permanent local closure" (a static closure) as a bogus GA:
+   a weight of 0 signals that a raw address follows in place of the
+   usual (PE, slot) pair. */
+static void
+PackPLC(StgPtr addr)
+{
+  Pack(0L);                     /* weight 0 marks a PLC */
+  Pack((StgWord) addr);         /* the static closure's address */
+}
+
+/*
+ @PackOffset@ packs a special GA value that will be interpreted as an
+ offset to a closure in the pack buffer. This is used to avoid unfolding
+ the graph structure into a tree. */
+
+/* Pack a special GA that is interpreted as an offset to a closure
+   already present in the pack buffer.  This preserves sharing instead
+   of unfolding the graph into a tree.  The marker is weight 1 with
+   PE 0; the slot field carries the offset itself. */
+static void
+PackOffset(int offset)
+{
+  Pack(1L);                     /* weight 1 marks an offset */
+  Pack(0L);                     /* pe */
+  Pack(offset);                 /* slot field reused as buffer offset */
+}
+# endif /* PAR */
+
+//@node Unpacking routines, Aux fcts for packing, Low level packing routines, Graph packing
+//@subsection Unpacking routines
+
+/*
+ This was formerly in the (now deceased) module Unpack.c
+
+ Unpacking closures which have been exported to remote processors
+
+ This module defines routines for unpacking closures in the parallel
+ runtime system (GUM).
+
+ In the case of GrAnSim, this module defines routines for *simulating* the
+ unpacking of closures as it is done in the parallel runtime system.
+*/
+
+//@node GUM code, GranSim Code, Unpacking routines, Unpacking routines
+//@subsubsection GUM code
+
+#if defined(PAR)
+
+//@cindex InitPendingGABuffer
+/* Allocate (once, on first call) the buffer used to collect
+   (oldGA,newGA) pairs during unpacking, and reset the fill pointer
+   @gaga@ to its start.  @size@ is the maximum number of pairs, hence
+   the factor of 2 in the allocation. */
+void
+InitPendingGABuffer(size)
+nat size;
+{
+  if (PendingGABuffer==(globalAddr *)NULL)
+    PendingGABuffer = (globalAddr *)
+      stgMallocBytes(size*2*sizeof(globalAddr),
+		     "InitPendingGABuffer");
+
+  /* current location in the buffer */
+  gaga = PendingGABuffer;
+}
+
+/*
+ @CommonUp@ commons up two closures which we have discovered to be
+ variants of the same object. One is made an indirection to the other. */
+
+//@cindex CommonUp
+/* Common up two closures discovered to be variants of the same
+   object: @src@ is overwritten with an indirection to @dst@ (which
+   also wakes up any blocking queue attached to @src@).  Closures too
+   small to take an indirection, and types we choose not to common up,
+   are left untouched.  NOTE: @size@ (and the ASSERTs using it) exist
+   only in DEBUG builds; @bqe@ is computed but only serves to document
+   which queue UPD_IND will awaken. */
+void
+CommonUp(StgClosure *src, StgClosure *dst)
+{
+  StgBlockingQueueElement *bqe;
+#if defined(DEBUG)
+  StgInfoTable *info;
+  nat size, ptrs, nonptrs, vhs, i;
+  char str[80];
+
+  /* get info about basic layout of the closure */
+  info = get_closure_info(src, &size, &ptrs, &nonptrs, &vhs, str);
+#endif
+
+  ASSERT(src != (StgClosure *)NULL && dst != (StgClosure *)NULL);
+  ASSERT(src != dst);
+
+  IF_PAR_DEBUG(pack,
+	       belch("*___ CommonUp %p (%s) --> %p (%s)",
+		     src, info_type(src), dst, info_type(dst)));
+
+  switch (get_itbl(src)->type) {
+  case BLACKHOLE_BQ:
+    bqe = ((StgBlockingQueue *)src)->blocking_queue;
+    break;
+
+  case FETCH_ME_BQ:
+    bqe = ((StgFetchMeBlockingQueue *)src)->blocking_queue;
+    break;
+
+  case RBH:
+    bqe = ((StgRBH *)src)->blocking_queue;
+    break;
+
+  case BLACKHOLE:
+  case FETCH_ME:
+    bqe = END_BQ_QUEUE;
+    break;
+
+    /* These closures are too small to be updated with an indirection!!! */
+  case CONSTR_1_0:
+  case CONSTR_0_1:
+    ASSERT(size<_HS+MIN_UPD_SIZE); // that's why we have to avoid UPD_IND
+    return;
+
+    /* currently we also common up 2 CONSTRs; this should reduce heap
+     * consumption but also does more work; not sure whether it's worth doing
+     */
+  case CONSTR:
+  case CONSTR_2_0:
+  case CONSTR_1_1:
+  case CONSTR_0_2:
+  case ARR_WORDS:
+  case MUT_ARR_PTRS:
+  case MUT_ARR_PTRS_FROZEN:
+  case MUT_VAR:
+    break;
+
+  default:
+    /* Don't common up anything else */
+    return;
+  }
+
+  /* closure must be big enough to permit update with ind */
+  ASSERT(size>=_HS+MIN_UPD_SIZE);
+  /* NB: this also awakens the blocking queue for src */
+  UPD_IND(src, dst);
+}
+
+/*
+ * Common up the new closure with any existing closure having the same
+ * GA
+ */
+//@cindex SetGAandCommonUp
+/* Bind the freshly unpacked @closure@ to @ga@ in the GA tables, or —
+   if a local copy already exists under that GA — common the two up
+   and return whichever closure survives.  Also records (oldGA,newGA)
+   pairs in the gaga buffer for thunks, so an ACK with the new GAs can
+   be sent back.  If @hasGA@ is false the closure carries no GA and is
+   returned unchanged. */
+static StgClosure *
+SetGAandCommonUp(globalAddr *ga, StgClosure *closure, rtsBool hasGA)
+{
+  StgClosure *existing;
+  StgInfoTable *ip, *oldip;
+  globalAddr *newGA;
+
+  if (!hasGA)
+    return closure;
+
+  /* should we already have a local copy? */
+  if (ga->weight==0xFFFFFFFF) {
+    ASSERT(ga->payload.gc.gtid==mytid); //sanity
+    ga->weight=0;
+    /* probably should also ASSERT that a commonUp takes place...*/
+  }
+
+  ip = get_itbl(closure);
+  if ((existing = GALAlookup(ga)) == NULL) {
+    /* Just keep the new object */
+    IF_PAR_DEBUG(pack,
+		 belch("*<## New local object for GA ((%x, %d, %x)) is %p (%s)",
+		       ga->payload.gc.gtid, ga->payload.gc.slot, ga->weight,
+		       closure, info_type(closure)));
+
+    // make an entry binding closure to ga in the RemoteGA table
+    newGA = setRemoteGA(closure, ga, rtsTrue);
+    // if local closure is a FETCH_ME etc fill in the global indirection
+    if (ip->type == FETCH_ME || ip->type == REMOTE_REF)
+      ((StgFetchMe *)closure)->ga = newGA;
+  } else {
+
+
+#ifdef DIST
+// ***************************************************************************
+// ***************************************************************************
+// REMOTE_REF HACK  - dual is in PackRemoteRef
+// - prevents the weight ever being updated
+  if (ip->type == REMOTE_REF)
+    ga->weight=0;
+// ***************************************************************************
+// ***************************************************************************
+#endif /* DIST */
+
+    /* Two closures, one global name.  Someone loses */
+    oldip = get_itbl(existing);
+    /* keep the existing closure only if it is at least as evaluated as
+       the unpacked one; i.e. never replace real data with a FETCH_ME */
+    if ((oldip->type == FETCH_ME ||
+	 IS_BLACK_HOLE(existing) ||
+	 /* try to share evaluated closures */
+	 oldip->type == CONSTR ||
+	 oldip->type == CONSTR_1_0 ||
+	 oldip->type == CONSTR_0_1 ||
+	 oldip->type == CONSTR_2_0 ||
+	 oldip->type == CONSTR_1_1 ||
+	 oldip->type == CONSTR_0_2
+	 ) &&
+	ip->type != FETCH_ME)
+    {
+      IF_PAR_DEBUG(pack,
+		   belch("*<#- Duplicate local object for GA ((%x, %d, %x)); redirecting %p (%s) -> %p (%s)",
+			 ga->payload.gc.gtid, ga->payload.gc.slot, ga->weight,
+			 existing, info_type(existing), closure, info_type(closure)));
+
+      /*
+       * What we had wasn't worth keeping, so make the old closure an
+       * indirection to the new closure (copying BQs if necessary) and
+       * make sure that the old entry is not the preferred one for this
+       * closure.
+       */
+      CommonUp(existing, closure);
+      //GALAdeprecate(ga);
+#if defined(DEBUG)
+      {
+	StgInfoTable *info;
+	nat size, ptrs, nonptrs, vhs, i;
+	char str[80];
+
+	/* get info about basic layout of the closure */
+	info = get_closure_info(GALAlookup(ga), &size, &ptrs, &nonptrs, &vhs, str);
+
+	/* now ga indirectly refers to the new closure */
+	ASSERT(size<_HS+MIN_UPD_SIZE ||
+	       UNWIND_IND(GALAlookup(ga))==closure);
+      }
+#endif
+    } else {
+      /*
+       * Either we already had something worthwhile by this name or
+       * the new thing is just another FetchMe.  However, the thing we
+       * just unpacked has to be left as-is, or the child unpacking
+       * code will fail.  Remember that the way pointer words are
+       * filled in depends on the info pointers of the parents being
+       * the same as when they were packed.
+       */
+      IF_PAR_DEBUG(pack,
+		   belch("*<#@ Duplicate local object for GA ((%x, %d, %x)); keeping %p (%s) nuking unpacked %p (%s)",
+			 ga->payload.gc.gtid, ga->payload.gc.slot, ga->weight,
+			 existing, info_type(existing), closure, info_type(closure)));
+
+      /* overwrite 2nd word; indicates that the closure is garbage */
+      IF_DEBUG(sanity,
+	       ((StgFetchMe*)closure)->ga = (globalAddr*)GARBAGE_MARKER;
+	       IF_PAR_DEBUG(pack,
+			    belch("++++ unpacked closure %p (%s) is garbage: %p",
+				  closure, info_type(closure), *(closure+1))));
+
+      closure = existing;
+#if 0
+      // HACK
+      ty = get_itbl(closure)->type;
+      if (ty == CONSTR ||
+	  ty == CONSTR_1_0 ||
+	  ty == CONSTR_0_1 ||
+	  ty == CONSTR_2_0 ||
+	  ty == CONSTR_1_1 ||
+	  ty == CONSTR_0_2)
+	CommonUp(closure, graph);
+#endif
+    }
+    /* We don't use this GA after all, so give back the weight */
+    (void) addWeight(ga);
+  }
+
+  /* if we have unpacked a FETCH_ME, we have a GA, too */
+  ASSERT(get_itbl(closure)->type!=FETCH_ME ||
+	 looks_like_ga(((StgFetchMe*)closure)->ga));
+
+  /* Sort out the global address mapping */
+  if (ip_THUNK(ip)){
+    // || // (ip_THUNK(ip) && !ip_UNPOINTED(ip)) ||
+    //(ip_MUTABLE(ip) && ip->type != FETCH_ME)) {
+    /* Make up new GAs for single-copy closures */
+    globalAddr *newGA = makeGlobal(closure, rtsTrue);
+
+    // It's a new GA and therefore has the full weight
+    ASSERT(newGA->weight==0);
+
+    /* Create an old GA to new GA mapping */
+    *gaga++ = *ga;
+    splitWeight(gaga, newGA);
+    /* inlined splitWeight; we know that newGALA has full weight
+    newGA->weight = gaga->weight = 1L << (BITS_IN(unsigned) - 1);
+    gaga->payload = newGA->payload;
+    */
+    ASSERT(gaga->weight == 1U << (BITS_IN(unsigned) - 1));
+    gaga++;
+  }
+  return closure;
+}
+
+/*
+ Copies a segment of the buffer, starting at @bufptr@, representing a closure
+ into the heap at @graph@.
+ */
+//@cindex FillInClosure
+/* Copy one packed closure from the buffer (*bufptrP) into the heap at
+   @graph@: fixed header, variable header and non-pointers are copied;
+   pointer fields are left to be filled in later by the parent-linking
+   loop in UnpackGraph.  ARR_WORDS and PAP/AP_UPD closures are handed
+   off to their specialised unpackers.  On return *bufptrP has been
+   advanced past the closure; the result is the closure's heap size in
+   words. */
+static nat
+FillInClosure(StgWord ***bufptrP, StgClosure *graph)
+{
+  StgInfoTable *ip;
+  StgWord **bufptr = *bufptrP;
+  nat ptrs, nonptrs, vhs, i, size;
+  char str[80];
+
+  ASSERT(LOOKS_LIKE_GHC_INFO(((StgClosure*)bufptr)->header.info));
+
+  /*
+   * Close your eyes.  You don't want to see where we're looking. You
+   * can't get closure info until you've unpacked the variable header,
+   * but you don't know how big it is until you've got closure info.
+   * So...we trust that the closure in the buffer is organized the
+   * same way as they will be in the heap...at least up through the
+   * end of the variable header.
+   */
+  ip = get_closure_info((StgClosure *)bufptr, &size, &ptrs, &nonptrs, &vhs, str);
+
+  /* Make sure that nothing sans the fixed header is filled in
+     The ga field of the FETCH_ME is filled in in SetGAandCommonUp */
+  if (ip->type == FETCH_ME || ip->type == REMOTE_REF) {
+    ASSERT(size>=_HS+MIN_UPD_SIZE);	// size of the FM in the heap
+    ptrs = nonptrs = vhs = 0;		// i.e. only unpack FH from buffer
+  }
+  /* ToDo: check whether this is really needed */
+  if (ip->type == ARR_WORDS) {
+    UnpackArray(bufptrP, graph);
+    return arr_words_sizeW((StgArrWords *)bufptr);
+  }
+
+  if (ip->type == PAP || ip->type == AP_UPD) {
+    return UnpackPAP(bufptrP, graph); // includes size of unpacked FMs
+  }
+
+  /*
+     Remember, the generic closure layout is as follows:
+	+-------------------------------------------------+
+	| FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS |
+	+-------------------------------------------------+
+  */
+  /* Fill in the fixed header */
+  for (i = 0; i < _HS; i++)
+    ((StgPtr)graph)[i] = (StgWord)*bufptr++;
+
+  /* Fill in the packed variable header */
+  for (i = 0; i < vhs; i++)
+    ((StgPtr)graph)[_HS + i] = (StgWord)*bufptr++;
+
+  /* Pointers will be filled in later */
+
+  /* Fill in the packed non-pointers */
+  for (i = 0; i < nonptrs; i++)
+    ((StgPtr)graph)[_HS + i + vhs + ptrs] = (StgWord)*bufptr++;
+
+  /* Indirections are never packed */
+  // ASSERT(INFO_PTR(graph) != (W_) Ind_info_TO_USE);
+  // return bufptr;
+   *bufptrP = bufptr;
+   ASSERT(((ip->type==FETCH_ME || ip->type==REMOTE_REF)&& sizeofW(StgFetchMe)==size) ||
+	  _HS+vhs+ptrs+nonptrs == size);
+  return size;
+}
+
+/*
+ Find the next pointer field in the parent closure.
+ If the current parent has been completely unpacked already, get the
+ next closure from the global closure queue.
+*/
+//@cindex LocateNextParent
+/* Advance to the next pointer field of the current parent closure.
+   If the parent has no more pointer fields (*pptrP + 1 > *pptrsP),
+   sanity-check the now-complete parent and dequeue the next parent
+   from the global closure queue, resetting *pptrP.  On exit *parentP
+   is the (possibly new) parent — NULL when the queue is exhausted —
+   and *pptrP/*pptrsP/*sizeP describe it. */
+static void
+LocateNextParent(parentP, pptrP, pptrsP, sizeP)
+StgClosure **parentP;
+nat *pptrP, *pptrsP, *sizeP;
+{
+  StgInfoTable *ip; // debugging
+  nat nonptrs, pvhs;
+  char str[80];
+
+  /* pptr as an index into the current parent; find the next pointer field
+     in the parent by increasing pptr; if that takes us off the closure
+     (i.e. *pptr + 1 > *pptrs) grab a new parent from the closure queue
+  */
+  (*pptrP)++;
+  while (*pptrP + 1 > *pptrsP) {
+    /* *parentP has been constructed (all pointer set); so check it now */
+    IF_DEBUG(sanity,
+	     if ((*parentP!=(StgClosure*)NULL) &&         // not root
+		 (*((StgPtr)(*parentP)+1)!=GARBAGE_MARKER) && // not commoned up
+		 (get_itbl(*parentP)->type != FETCH_ME))
+	       checkClosure(*parentP));
+
+    *parentP = DeQueueClosure();
+
+    if (*parentP == NULL)
+      break;
+    else {
+      ip = get_closure_info(*parentP, sizeP, pptrsP, &nonptrs,
+			    &pvhs, str);
+      *pptrP = 0;
+    }
+  }
+  /* *parentP points to the new (or old) parent; */
+  /* *pptr, *pptrs and *size have been updated referring to the new parent */
+}
+
+/*
+ UnpackClosure is the heart of the unpacking routine. It is called for
+ every closure found in the packBuffer. Any prefix such as GA, PLC marker
+ etc has been unpacked into the *ga structure.
+ UnpackClosure does the following:
+ - check for the kind of the closure (PLC, Offset, std closure)
+ - copy the contents of the closure from the buffer into the heap
+ - update LAGA tables (in particular if we end up with 2 closures
+ having the same GA, we make one an indirection to the other)
+ - set the GAGA map in order to send back an ACK message
+
+ At the end of this function *graphP has been updated to point to the
+ next free word in the heap for unpacking the rest of the graph and
+ *bufptrP points to the next word in the pack buffer to be unpacked.
+*/
+
+/* Unpack one closure whose GA/PLC/offset prefix has already been
+   decoded into *ga.  PLCs and offsets resolve to existing closures;
+   otherwise the closure body is copied from the buffer into the heap
+   at *graphP, queued for pointer fix-up, and (unless unglobalised)
+   commoned up via SetGAandCommonUp.  *graphP is advanced past the new
+   closure and *bufptrP past its packed representation. */
+static StgClosure*
+UnpackClosure (StgWord ***bufptrP, StgClosure **graphP, globalAddr *ga) {
+  StgClosure *closure;
+  nat size;
+  rtsBool hasGA = rtsFalse, unglobalised = rtsFalse;
+
+  /* Now unpack the closure body, if there is one; three cases:
+     - PLC: closure is just a pointer to a static closure
+     - Offset: closure has been unpacked already
+     - else: copy data from packet into closure
+  */
+  if (isFixed(ga)) {
+    closure = UnpackPLC(ga);
+  } else if (isOffset(ga)) {
+    closure = UnpackOffset(ga);
+  } else {
+    /* if not PLC or Offset it must be a GA and then the closure */
+    ASSERT(RtsFlags.ParFlags.globalising!=0 || LOOKS_LIKE_GA(ga));
+    /* check whether this is an unglobalised closure */
+    unglobalised = isUnglobalised(ga);
+    /* Now we have to build something. */
+    hasGA = !isConstr(ga);
+    /* the new closure will be built here */
+    closure = *graphP;
+
+    /* fill in the closure from the buffer */
+    size = FillInClosure(/*in/out*/bufptrP, /*in*/closure);
+    /* if it is unglobalised, it may not be a thunk!! */
+    ASSERT(!unglobalised || !closure_THUNK(closure));
+
+    /* Add to queue for processing */
+    QueueClosure(closure);
+
+    /* common up with other graph if necessary */
+    if (!unglobalised)
+      closure = SetGAandCommonUp(ga, closure, hasGA);
+
+    /* if we unpacked a THUNK, check that it is large enough to update */
+    ASSERT(!closure_THUNK(closure) || size>=_HS+MIN_UPD_SIZE);
+    /* graph shall point to next free word in the heap */
+    *graphP += size;
+    //*graphP += (size < _HS+MIN_UPD_SIZE) ? _HS+MIN_UPD_SIZE : size; // see ASSERT
+  }
+  return closure;
+}
+
+/*
+ @UnpackGraph@ unpacks the graph contained in a message buffer. It
+ returns a pointer to the new graph. The @gamap@ parameter is set to
+ point to an array of (oldGA,newGA) pairs which were created as a result
+ of unpacking the buffer; @nGAs@ is set to the number of GA pairs which
+ were created.
+
+ The format of graph in the pack buffer is as defined in @Pack.lc@. */
+
+//@cindex UnpackGraph
+/* Unpack the whole graph contained in @packBuffer@ into freshly
+   allocated heap, returning its root.  Sets *gamap to the array of
+   (oldGA,newGA) pairs created while unpacking and *nGAs to the number
+   of pairs, for the subsequent ACK message.  The buffer format is as
+   produced by the packing routines above. */
+StgClosure *
+UnpackGraph(packBuffer, gamap, nGAs)
+rtsPackBuffer *packBuffer;
+globalAddr **gamap;
+nat *nGAs;
+{
+  StgWord **bufptr, **slotptr;
+  globalAddr gaS;
+  StgClosure *closure, *graphroot, *graph, *parent;
+  nat size, heapsize, bufsize,
+      pptr = 0, pptrs = 0, pvhs = 0;
+  nat unpacked_closures = 0, unpacked_thunks = 0; // stats only
+
+  IF_PAR_DEBUG(resume,
+	       graphFingerPrint[0] = '\0');
+
+  ASSERT(_HS==1); // HWL HACK; compile time constant
+
+#if defined(PAR_TICKY) // HWL HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACK
+  PAR_TICKY_UNPACK_GRAPH_START();
+#endif
+
+  /* Initialisation */
+  InitPacking(rtsTrue);      // same as in PackNearbyGraph
+  globalUnpackBuffer = packBuffer;
+
+  IF_DEBUG(sanity, // do a sanity check on the incoming packet
+	   checkPacket(packBuffer));
+
+  ASSERT(gaga==PendingGABuffer);
+  graphroot = (StgClosure *)NULL;
+
+  /* Unpack the header */
+  bufsize = packBuffer->size;
+  heapsize = packBuffer->unpacked_size;
+  bufptr = packBuffer->buffer;
+
+  /* allocate heap */
+  if (heapsize > 0) {
+    graph = (StgClosure *)allocate(heapsize);
+    ASSERT(graph != NULL);
+    // parallel global statistics: increase amount of global data
+    if (RtsFlags.ParFlags.ParStats.Global &&
+	RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+      globalParStats.tot_global += heapsize;
+    }
+  }
+
+  /* iterate over the buffer contents and unpack all closures */
+  parent = (StgClosure *)NULL;
+  do {
+    /* check that we aren't at the end of the buffer, yet */
+    IF_DEBUG(sanity, ASSERT(*bufptr != END_OF_BUFFER_MARKER));
+
+    /* This is where we will ultimately save the closure's address */
+    slotptr = bufptr;
+
+    /* fill in gaS from buffer; gaS may receive GA, PLC- or offset-marker */
+    bufptr = UnpackGA(/*in*/bufptr, /*out*/&gaS);
+
+    /* this allocates heap space, updates LAGA tables etc */
+    closure = UnpackClosure (/*in/out*/&bufptr, /*in/out*/&graph, /*in*/&gaS);
+    unpacked_closures++; // stats only; doesn't count FMs in PAP!!!
+    unpacked_thunks += (closure_THUNK(closure)) ? 1 : 0; // stats only
+
+    /*
+     * Set parent pointer to point to chosen closure.  If we're at the top of
+     * the graph (our parent is NULL), then we want to arrange to return the
+     * chosen closure to our caller (possibly in place of the allocated graph
+     * root.)
+     */
+    if (parent == NULL)
+      graphroot = closure;
+    else
+      ((StgPtr)parent)[_HS + pvhs + pptr] = (StgWord) closure;
+
+    /* Save closure pointer for resolving offsets */
+    *slotptr = (StgWord*) closure;
+
+    /* Locate next parent pointer */
+    LocateNextParent(&parent, &pptr, &pptrs, &size);
+
+    IF_DEBUG(sanity,
+	     gaS.weight   = 0xdeadffff;
+	     gaS.payload.gc.gtid = 0xdead;
+	     gaS.payload.gc.slot = 0xdeadbeef;);
+  } while (parent != NULL);
+
+  IF_PAR_DEBUG(resume,
+	       GraphFingerPrint(graphroot, graphFingerPrint);
+	       ASSERT(strlen(graphFingerPrint)<=MAX_FINGER_PRINT_LEN);
+	       belch(">>> Fingerprint of graph rooted at %p (after unpacking <<%d>>:\n    {%s}",
+		     graphroot, packBuffer->id, graphFingerPrint));
+
+  /* we unpacked exactly as many words as there are in the buffer */
+  ASSERT(bufsize == (nat) (bufptr-(packBuffer->buffer)));
+  /* we filled no more heap closure than we allocated at the beginning;
+     ideally this should be a ==;
+     NB: test is only valid if we unpacked anything at all (graphroot might
+         end up to be a PLC!), therefore the strange test for HEAP_ALLOCED
+  */
+
+  /*
+  {
+   StgInfoTable *info = get_itbl(graphroot);
+   ASSERT(!HEAP_ALLOCED(graphroot) || heapsize >= (nat) (graph-graphroot) ||
+	  // ToDo: check whether CAFs are really a special case here!!
+	  info->type==CAF_BLACKHOLE || info->type==FETCH_ME || info->type==FETCH_ME_BQ);
+  }
+  */
+
+  /* check for magic end-of-buffer word */
+  IF_DEBUG(sanity, ASSERT(*bufptr == END_OF_BUFFER_MARKER));
+
+  *gamap = PendingGABuffer;
+  *nGAs = (gaga - PendingGABuffer) / 2;
+
+  IF_PAR_DEBUG(tables,
+	       belch("** LAGA table after unpacking closure %p:",
+		     graphroot);
+	       printLAGAtable());
+
+  /* ToDo: are we *certain* graphroot has been set??? WDP 95/07 */
+  ASSERT(graphroot!=NULL);
+
+  IF_DEBUG(sanity,
+           {
+	     StgPtr p;
+
+	     /* check the unpacked graph */
+	     //checkHeapChunk(graphroot,graph-sizeof(StgWord));
+
+	     // if we do sanity checks, then wipe the pack buffer after unpacking
+	     for (p=(StgPtr)packBuffer->buffer; p<(StgPtr)(packBuffer->buffer)+(packBuffer->size); )
+	       *p++ = 0xdeadbeef;
+            });
+
+  /* reset the global variable */
+  globalUnpackBuffer = (rtsPackBuffer*)NULL;
+
+#if defined(PAR_TICKY) // HWL HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACK
+  PAR_TICKY_UNPACK_GRAPH_END(unpacked_closures, unpacked_thunks);
+#endif
+
+  return (graphroot);
+}
+
+//@cindex UnpackGA
+/* Decode the GA / PLC / offset prefix at @bufptr@ into *ga and return
+   the advanced buffer pointer.  Encoding: weight 2 = unglobalised
+   closure follows (one word consumed); weight > 0 = real GA, gtid and
+   slot follow (three words); weight 0 = PLC, a raw address follows
+   (two words). */
+static StgWord **
+UnpackGA(StgWord **bufptr, globalAddr *ga)
+{
+  /* First, unpack the next GA or PLC */
+  ga->weight = (rtsWeight) *bufptr++;
+
+  if (ga->weight == 2) {  // unglobalised closure to follow
+    // nothing to do; closure starts at *bufptr
+  } else if (ga->weight > 0) { // fill in GA
+    ga->payload.gc.gtid = (GlobalTaskId) *bufptr++;
+    ga->payload.gc.slot = (int) *bufptr++;
+  } else {
+    ga->payload.plc = (StgPtr) *bufptr++;
+  }
+  return bufptr;
+}
+
+//@cindex UnpackPLC
+/* Resolve a PLC entry: the GA's payload already holds the local
+   address of the static closure, so just return it (nothing more to
+   read from the buffer). */
+static StgClosure *
+UnpackPLC(globalAddr *ga)
+{
+  /* No more to unpack; just set closure to local address */
+  IF_PAR_DEBUG(pack,
+	       /* use %p for the pointer (was %x, which truncates on
+		  64-bit platforms and mismatches the argument type) */
+	       belch("*<^^ Unpacked PLC at %p", (void *)ga->payload.plc));
+  return (StgClosure*)ga->payload.plc;
+}
+
+//@cindex UnpackOffset
+/* Resolve an offset entry: the slot field indexes the pack buffer,
+   whose word at that position was overwritten (by UnpackGraph) with
+   the heap address of the already-unpacked closure.  Relies on the
+   global @globalUnpackBuffer@ set up in UnpackGraph. */
+static StgClosure *
+UnpackOffset(globalAddr *ga)
+{
+  /* globalUnpackBuffer is a global var init in UnpackGraph */
+  ASSERT(globalUnpackBuffer!=(rtsPackBuffer*)NULL);
+  /* No more to unpack; just set closure to cached address */
+  IF_PAR_DEBUG(pack,
+	       belch("*<__ Unpacked indirection to %p (was OFFSET %d)",
+		     (StgClosure *)((globalUnpackBuffer->buffer)[ga->payload.gc.slot]),
+		     ga->payload.gc.slot));
+  return (StgClosure *)(globalUnpackBuffer->buffer)[ga->payload.gc.slot];
+}
+
+/*
+ Input: *bufptrP, *graphP ... ptrs to the pack buffer and into the heap.
+
+ *bufptrP points to something that should be unpacked as a FETCH_ME:
+ |
+ v
+ +-------------------------------
+ | GA | FH of FM
+ +-------------------------------
+
+ The first 3 words starting at *bufptrP are the GA address; the next
+ word is the generic FM info ptr followed by the remaining FH (if any)
+ The result after unpacking will be a FETCH_ME closure, pointed to by
+ *graphP at the start of the fct;
+ |
+ v
+ +------------------------+
+ | FH of FM | ptr to a GA |
+ +------------------------+
+
+ The ptr field points into the RemoteGA table, which holds the actual GA.
+ *bufptrP has been updated to point to the next word in the buffer.
+ *graphP has been updated to point to the first free word at the end.
+*/
+
+/* Unpack a FETCH_ME from the buffer into the heap at *graphP.  The
+   packed form is a GA (3 words) followed by the FM's fixed header; if
+   the GA turns out to be an offset, the previously unpacked closure
+   is returned and *graphP is left unchanged.  Otherwise the FM is
+   built, commoned up via SetGAandCommonUp (which also fills in its ga
+   field), and *graphP is advanced by sizeofW(StgFetchMe). */
+static StgClosure*
+UnpackFetchMe (StgWord ***bufptrP, StgClosure **graphP) {
+  StgClosure *closure, *foo;
+  globalAddr gaS;
+
+  /* This fct relies on size of FM < size of FM in pack buffer */
+  ASSERT(sizeofW(StgFetchMe)<=PACK_FETCHME_SIZE);
+
+  /* fill in gaS from buffer */
+  *bufptrP = UnpackGA(*bufptrP, &gaS);
+  /* might be an offset to a closure in the pack buffer */
+  if (isOffset(&gaS)) {
+    belch("*<   UnpackFetchMe: found OFFSET to %d when unpacking FM at buffer loc %p",
+	  gaS.payload.gc.slot, *bufptrP);
+
+    closure = UnpackOffset(&gaS);
+    /* return address of previously unpacked closure; leaves *graphP unchanged */
+    return closure;
+  }
+
+  /* we have a proper GA at hand */
+  ASSERT(LOOKS_LIKE_GA(&gaS));
+
+  IF_DEBUG(sanity,
+	   if (isFixed(&gaS))
+	     barf("*<   UnpackFetchMe: found PLC where FM was expected %p (%s)",
+		  *bufptrP, info_type((StgClosure*)*bufptrP)));
+
+  IF_PAR_DEBUG(pack,
+	       belch("*<_- Unpacked @ %p a FETCH_ME to GA ",
+		     *graphP);
+	       printGA(&gaS);
+	       fputc('\n', stderr));
+
+  /* the next thing must be the IP to a FETCH_ME closure */
+  ASSERT(get_itbl((StgClosure *)*bufptrP)->type == FETCH_ME);
+
+  closure = *graphP;
+  /* fill in the closure from the buffer */
+  FillInClosure(bufptrP, closure);
+
+  /* the newly built closure is a FETCH_ME */
+  ASSERT(get_itbl(closure)->type == FETCH_ME);
+
+  /* common up with other graph if necessary
+     this also assigns the contents of gaS to the ga field of the FM closure */
+  foo = SetGAandCommonUp(&gaS, closure, rtsTrue);
+
+  ASSERT(foo!=closure || LOOKS_LIKE_GA(((StgFetchMe*)closure)->ga));
+
+  IF_PAR_DEBUG(pack,
+	       if (foo==closure) {  // only if not commoned up
+		 belch("*<_- current FM @ %p next FM @ %p; unpacked FM @ %p is ",
+		       *graphP, *graphP+sizeofW(StgFetchMe), closure);
+		 printClosure(closure);
+	       });
+  *graphP += sizeofW(StgFetchMe);
+  return foo;
+}
+
+/*
+ Unpack an array of words.
+ Could use generic unpack most of the time, but cleaner to separate it.
+ ToDo: implement packing of MUT_ARRAYs
+*/
+
+//@cindex UnpackArray
+/* Unpack an ARR_WORDS closure from the buffer into the heap at
+   @graph@: header (info ptr + word count — NB: assumes _HS==1),
+   followed by all payload words (non-pointers only).  *bufptrP is
+   advanced past the packed array.  ToDo in original: packing of
+   MUT_ARRAYs is not implemented. */
+static void
+UnpackArray(StgWord ***bufptrP, StgClosure *graph)
+{
+  StgInfoTable *info;
+  StgWord **bufptr=*bufptrP;
+  nat size, ptrs, nonptrs, vhs, i, n;
+  char str[80];
+
+  /* yes, I know I am paranoid; but who's asking !? */
+  IF_DEBUG(sanity,
+	   info = get_closure_info((StgClosure*)bufptr,
+				   &size, &ptrs, &nonptrs, &vhs, str);
+	   ASSERT(info->type == ARR_WORDS || info->type == MUT_ARR_PTRS ||
+		  info->type == MUT_ARR_PTRS_FROZEN || info->type == MUT_VAR));
+
+  n = ((StgArrWords *)bufptr)->words;
+  // this includes the header!: arr_words_sizeW(stgCast(StgArrWords*,q));
+
+  IF_PAR_DEBUG(pack,
+	       if (n<100)
+		 belch("*<== unpacking an array of %d words %p (%s) (size=%d) |%s|\n",
+		       n, (StgClosure*)bufptr, info_type((StgClosure*)bufptr),
+		       arr_words_sizeW((StgArrWords *)bufptr),
+		       /* print array (string?) */
+		       ((StgArrWords *)graph)->payload);
+	       else
+		 belch("*<== unpacking an array of %d words %p (%s) (size=%d)\n",
+		       n, (StgClosure*)bufptr, info_type((StgClosure*)bufptr),
+		       arr_words_sizeW((StgArrWords *)bufptr)));
+
+  /* Unpack the header (2 words: info ptr and the number of words to follow) */
+  ((StgArrWords *)graph)->header.info = (StgInfoTable*)*bufptr++;  // assumes _HS==1; yuck!
+  ((StgArrWords *)graph)->words = (StgWord)*bufptr++;
+
+  /* unpack the payload of the closure (all non-ptrs) */
+  for (i=0; i<n; i++)
+    ((StgArrWords *)graph)->payload[i] = (StgWord)*bufptr++;
+
+  ASSERT(bufptr==*bufptrP+arr_words_sizeW((StgArrWords *)*bufptrP));
+  *bufptrP = bufptr;
+}
+
+/*
+ Unpack a PAP in the buffer into a heap closure.
+ For each FETCHME we find in the packed PAP we have to unpack a separate
+ FETCHME closure and insert a pointer to this closure into the PAP.
+ We unpack all FETCHMEs into an area after the PAP proper (the `FM area').
+ Note that the size of a FETCHME in the buffer is exactly the same as
+ the size of an unpacked FETCHME plus 1 word for the pointer to it.
+ Therefore, we just allocate packed_size words in the heap for the unpacking.
+ After this routine the heap starting from *graph looks like this:
+
+ graph
+ |
+ v PAP closure | FM area |
+ +------------------------------------------------------------+
+ | PAP header | n_args | fun | payload ... | FM_1 | FM_2 .... |
+ +------------------------------------------------------------+
+
+ where payload contains pointers to each of the unpacked FM_1, FM_2 ...
+ The size of the PAP closure plus all FMs is _HS+2+packed_size.
+*/
+
+//@cindex UnpackPAP
+static nat
+UnpackPAP(StgWord ***bufptrP, StgClosure *graph)
+{
+ nat n, i, j, packed_size = 0;
+ StgPtr p, q, end, payload_start, p_FMs;
+ const StgInfoTable* info;
+ StgWord bitmap;
+ StgWord **bufptr = *bufptrP;
+#if defined(DEBUG)
+ nat FMs_in_PAP=0;
+ void checkPAPSanity(StgPAP *graph, StgPtr p_FM_begin, StgPtr p_FM_end);
+#endif
+
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP: unpacking PAP @ %p with %d words to closure %p",
+ *bufptr, *(bufptr+1), graph));
+
+ /* Unpack the PAP header (both fixed and variable) */
+ ((StgPAP *)graph)->header.info = (StgInfoTable*)*bufptr++;
+ n = ((StgPAP *)graph)->n_args = (StgWord)*bufptr++;
+ ((StgPAP *)graph)->fun = (StgClosure*)*bufptr++;
+ packed_size = (nat)*bufptr++;
+
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP: PAP header is [%p, %d, %p] %d",
+ ((StgPAP *)graph)->header.info,
+ ((StgPAP *)graph)->n_args,
+ ((StgPAP *)graph)->fun,
+ packed_size));
+
+ payload_start = (StgPtr)bufptr;
+ /* p points to the current word in the heap */
+ p = (StgPtr)((StgPAP *)graph)->payload; // payload of PAP will be unpacked here
+ p_FMs = (StgPtr)graph+pap_sizeW((StgPAP*)graph); // FMs will be unpacked here
+ end = (StgPtr) payload_start+packed_size;
+ /*
+ The main loop unpacks the PAP in *bufptr into *p, with *p_FMS as the
+ FM area for unpacking all FETCHMEs encountered during unpacking.
+ */
+ while ((StgPtr)bufptr<end) {
+ /* be sure that we don't write more than we allocated for this closure */
+ ASSERT(p_FMs <= (StgPtr)(graph+_HS+2+packed_size));
+ /* be sure that the unpacked PAP doesn't run into the FM area */
+ ASSERT(p < (StgPtr)(graph+pap_sizeW((StgPAP*)graph)));
+ /* the loop body has been borrowed from scavenge_stack */
+ q = *bufptr; // let q be the contents of the current pointer into the buffer
+
+ /* Test whether the next thing is a FETCH_ME.
+ In PAPs FETCH_ME are encoded via a starting marker of ARGTAG_MAX+1
+ */
+ if (q==(StgPtr)(ARGTAG_MAX+1)) {
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: unpacking FM; filling in ptr to FM area: %p",
+ p, p_FMs));
+ bufptr++; // skip ARGTAG_MAX+1 marker
+ // Unpack a FM into the FM area after the PAP proper and insert pointer
+ *p++ = (StgWord)UnpackFetchMe(&bufptr, (StgClosure**)&p_FMs);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ continue;
+ }
+
+ /* Test whether it is a PLC */
+ if (q==(StgPtr)0) { // same as isFixed(q)
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: unpacking PLC to %p",
+ p, *(bufptr+1)));
+ bufptr++; // skip 0 marker
+ *p++ = (StgWord)*bufptr++;
+ continue;
+ }
+
+ /* If we've got a tag, pack all words in that block */
+ if (IS_ARG_TAG((W_)q)) { // q stands for the no. of non-ptrs to follow
+ nat m = ARG_SIZE(q); // first word after this block
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: unpacking %d words (tagged), starting @ %p",
+ p, m, p));
+ for (i=0; i<m+1; i++)
+ *p++ = (StgWord)*bufptr++;
+ continue;
+ }
+
+ /*
+ * Otherwise, q must be the info pointer of an activation
+ * record. All activation records have 'bitmap' style layout
+ * info.
+ */
+ info = get_itbl((StgClosure *)q);
+ switch (info->type) {
+
+ /* Dynamic bitmap: the mask is stored on the stack */
+ case RET_DYN:
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: RET_DYN",
+ p));
+
+ /* Pack the header as is */
+ ((StgRetDyn *)p)->info = (StgWord)*bufptr++;
+ ((StgRetDyn *)p)->liveness = (StgWord)*bufptr++;
+ ((StgRetDyn *)p)->ret_addr = (StgWord)*bufptr++;
+ p += 3;
+
+ //bitmap = ((StgRetDyn *)p)->liveness;
+ //p = (P_)&((StgRetDyn *)p)->payload[0];
+ goto small_bitmap;
+
+ /* probably a slow-entry point return address: */
+ case FUN:
+ case FUN_STATIC:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: FUN or FUN_STATIC",
+ p));
+
+ ((StgClosure *)p)->header.info = (StgInfoTable*)*bufptr;
+ p++;
+
+ goto follow_srt; //??
+ }
+
+ /* Using generic code here; could inline as in scavenge_stack */
+ case UPDATE_FRAME:
+ {
+ StgUpdateFrame *frame = (StgUpdateFrame *)p;
+ //nat type = get_itbl(frame->updatee)->type;
+
+ //ASSERT(type==BLACKHOLE || type==CAF_BLACKHOLE || type==BLACKHOLE_BQ);
+
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnackPAP @ %p: UPDATE_FRAME",
+ p));
+
+ ((StgUpdateFrame *)p)->header.info = (StgInfoTable*)*bufptr++;
+ ((StgUpdateFrame *)p)->link = (StgUpdateFrame*)*bufptr++; // ToDo: fix intra-stack pointer
+ ((StgUpdateFrame *)p)->updatee = (StgClosure*)*bufptr++; // ToDo: follow link
+
+ p += 3;
+ }
+
+ /* small bitmap (< 32 entries, or 64 on a 64-bit machine) */
+ case STOP_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: STOP_FRAME",
+ p));
+ ((StgStopFrame *)p)->header.info = (StgInfoTable*)*bufptr;
+ p++;
+ }
+
+ case CATCH_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: CATCH_FRAME",
+ p));
+
+ ((StgCatchFrame *)p)->header.info = (StgInfoTable*)*bufptr++;
+ ((StgCatchFrame *)p)->link = (StgUpdateFrame*)*bufptr++;
+ ((StgCatchFrame *)p)->exceptions_blocked = (StgInt)*bufptr++;
+ ((StgCatchFrame *)p)->handler = (StgClosure*)*bufptr++;
+ p += 4;
+ }
+
+ case SEQ_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: UPDATE_FRAME",
+ p));
+
+ ((StgSeqFrame *)p)->header.info = (StgInfoTable*)*bufptr++;
+ ((StgSeqFrame *)p)->link = (StgUpdateFrame*)*bufptr++;
+
+ // ToDo: handle bitmap
+ bitmap = info->layout.bitmap;
+
+ p = (StgPtr)&(((StgClosure *)p)->payload);
+ goto small_bitmap;
+ }
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: RET_{BCO,SMALL,VEC_SMALL}",
+ p));
+
+
+ ((StgClosure *)p)->header.info = (StgInfoTable*)*bufptr++;
+ p++;
+ // ToDo: handle bitmap
+ bitmap = info->layout.bitmap;
+ /* this assumes that the payload starts immediately after the info-ptr */
+
+ small_bitmap:
+ while (bitmap != 0) {
+ if ((bitmap & 1) == 0) {
+ *p++ = (StgWord)UnpackFetchMe(&bufptr, (StgClosure**)&p_FMs);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ } else {
+ *p++ = (StgWord)*bufptr++;
+ }
+ bitmap = bitmap >> 1;
+ }
+
+ follow_srt:
+ belch("*<-- UnpackPAP: nothing to do for follow_srt");
+ continue;
+
+ /* large bitmap (> 32 entries) */
+ case RET_BIG:
+ case RET_VEC_BIG:
+ {
+ StgPtr q;
+ StgLargeBitmap *large_bitmap;
+
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: RET_{BIG,VEC_BIG} (large_bitmap=%p)",
+ p, info->layout.large_bitmap));
+
+
+ ((StgClosure *)p)->header.info = (StgInfoTable*)*bufptr++;
+ p++;
+
+ large_bitmap = info->layout.large_bitmap;
+
+ for (j=0; j<large_bitmap->size; j++) {
+ bitmap = large_bitmap->bitmap[j];
+ q = p + BITS_IN(W_);
+ while (bitmap != 0) {
+ if ((bitmap & 1) == 0) {
+ *p++ = (StgWord)UnpackFetchMe(&bufptr, (StgClosure**)&p_FMs);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ } else {
+ *p++ = (StgWord)*bufptr;
+ }
+ bitmap = bitmap >> 1;
+ }
+ if (j+1 < large_bitmap->size) {
+ while (p < q) {
+ *p++ = (StgWord)UnpackFetchMe(&bufptr, (StgClosure**)&p_FMs);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ }
+ }
+ }
+
+ /* and don't forget to follow the SRT */
+ goto follow_srt;
+ }
+
+ default:
+ barf("UnpackPAP: weird activation record found on stack: %d",
+ (int)(info->type));
+ }
+ }
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP finished; unpacked closure @ %p is:",
+ (StgClosure *)graph);
+ printClosure((StgClosure *)graph));
+
+ IF_DEBUG(sanity, /* check sanity of unpacked PAP */
+ checkClosure(graph));
+
+ *bufptrP = bufptr;
+ /*
+ Now p points to the first word after the PAP proper and p_FMs points
+ to the next free word in the heap; everything between p and p_FMs are
+ FETCHMEs
+ */
+ IF_DEBUG(sanity,
+ checkPAPSanity(graph, p, p_FMs));
+
+ /* we have to return the size of PAP + FMs as size of the unpacked thing */
+ ASSERT(graph+pap_sizeW((StgPAP*)graph)==p);
+ return (nat)((StgClosure*)p_FMs-graph);
+}
+
+#if defined(DEBUG)
+/*
+ Check sanity of a PAP after unpacking the PAP.
+ This means that there is slice of heap after the PAP containing FETCHMEs
+*/
+void
+checkPAPSanity(StgPAP *graph, StgPtr p_FM_begin, StgPtr p_FM_end)
+{
+ StgPtr xx;
+
+ /* check that the main unpacked closure is a PAP */
+ ASSERT(graph->header.info = &stg_PAP_info);
+ checkClosure(graph);
+ /* check that all of the closures in the FM-area are FETCHMEs */
+ for (xx=p_FM_begin; xx<p_FM_end; xx += sizeofW(StgFetchMe)) {
+ /* must be a FETCHME closure */
+ ASSERT(((StgClosure*)xx)->header.info == &stg_FETCH_ME_info);
+ /* it might have been commoned up (=> marked as garbage);
+ otherwise it points to a GA */
+ ASSERT((((StgFetchMe*)xx)->ga)==GARBAGE_MARKER ||
+ LOOKS_LIKE_GA(((StgFetchMe*)xx)->ga));
+ }
+ /* traverse the payload of the PAP */
+ for (xx=graph->payload; xx-(StgPtr)(graph->payload)<graph->n_args; xx++) {
+ /* if the current elem is a pointer into the FM area, check that
+ the GA field is ok */
+ ASSERT(!(p_FM_begin<(StgPtr)*xx && (StgPtr)*xx<p_FM_end) ||
+ LOOKS_LIKE_GA(((StgFetchMe*)*xx)->ga));
+ }
+}
+#endif /* DEBUG */
+#endif /* PAR */
+
+//@node GranSim Code, , GUM code, Unpacking routines
+//@subsubsection GranSim Code
+
+/*
+ For GrAnSim: No actual unpacking should be necessary. We just
+ have to walk over the graph and set the bitmasks appropriately.
+ Since we use RBHs similarly to GUM but without an ACK message/event
+ we have to revert the RBH from within the UnpackGraph routine (good luck!)
+ -- HWL
+*/
+
+#if defined(GRAN)
+void
+CommonUp(StgClosure *src, StgClosure *dst)
+{
+ barf("CommonUp: should never be entered in a GranSim setup");
+}
+
+StgClosure*
+UnpackGraph(buffer)
+rtsPackBuffer* buffer;
+{
+ nat size, ptrs, nonptrs, vhs,
+ bufptr = 0;
+ StgClosure *closure, *graphroot, *graph;
+ StgInfoTable *ip;
+ StgWord bufsize, unpackedsize,
+ pptr = 0, pptrs = 0, pvhs;
+ StgTSO* tso;
+ char str[240], str1[80];
+ int i;
+
+ bufptr = 0;
+ graphroot = buffer->buffer[0];
+
+ tso = buffer->tso;
+
+ /* Unpack the header */
+ unpackedsize = buffer->unpacked_size;
+ bufsize = buffer->size;
+
+ IF_GRAN_DEBUG(pack,
+ belch("<<< Unpacking <<%d>> (buffer @ %p):\n (root @ %p, PE %d,size=%d), demanded by TSO %d (%p)[PE %d]",
+ buffer->id, buffer, graphroot, where_is(graphroot),
+ bufsize, tso->id, tso,
+ where_is((StgClosure *)tso)));
+
+ do {
+ closure = buffer->buffer[bufptr++]; /* that's all we need for GrAnSim -- HWL */
+
+ /* Actually only ip is needed; rest is useful for TESTING -- HWL */
+ ip = get_closure_info(closure,
+ &size, &ptrs, &nonptrs, &vhs, str);
+
+ IF_GRAN_DEBUG(pack,
+ sprintf(str, "** (%p): Changing bitmask[%s]: 0x%x ",
+ closure, (closure_HNF(closure) ? "NF" : "__"),
+ PROCS(closure)));
+
+ if (get_itbl(closure)->type == RBH) {
+ /* if it's an RBH, we have to revert it into a normal closure, thereby
+ awakening the blocking queue; not that this is code currently not
+ needed in GUM, but it should be added with the new features in
+ GdH (and the implementation of an NACK message)
+ */
+ // closure->header.gran.procs = PE_NUMBER(CurrentProc);
+ SET_GRAN_HDR(closure, PE_NUMBER(CurrentProc)); /* Move node */
+
+ IF_GRAN_DEBUG(pack,
+ strcat(str, " (converting RBH) "));
+
+ convertFromRBH(closure); /* In GUM that's done by convertToFetchMe */
+
+ IF_GRAN_DEBUG(pack,
+ belch(":: closure %p (%s) is a RBH; after reverting: IP=%p",
+ closure, info_type(closure), get_itbl(closure)));
+ } else if (IS_BLACK_HOLE(closure)) {
+ IF_GRAN_DEBUG(pack,
+ belch(":: closure %p (%s) is a BH; copying node to %d",
+ closure, info_type(closure), CurrentProc));
+ closure->header.gran.procs |= PE_NUMBER(CurrentProc); /* Copy node */
+ } else if ( (closure->header.gran.procs & PE_NUMBER(CurrentProc)) == 0 ) {
+ if (closure_HNF(closure)) {
+ IF_GRAN_DEBUG(pack,
+ belch(":: closure %p (%s) is a HNF; copying node to %d",
+ closure, info_type(closure), CurrentProc));
+ closure->header.gran.procs |= PE_NUMBER(CurrentProc); /* Copy node */
+ } else {
+ IF_GRAN_DEBUG(pack,
+ belch(":: closure %p (%s) is no (R)BH or HNF; moving node to %d",
+ closure, info_type(closure), CurrentProc));
+ closure->header.gran.procs = PE_NUMBER(CurrentProc); /* Move node */
+ }
+ }
+
+ IF_GRAN_DEBUG(pack,
+ sprintf(str1, "0x%x", PROCS(closure)); strcat(str, str1));
+ IF_GRAN_DEBUG(pack, belch(str));
+
+ } while (bufptr<buffer->size) ; /* (parent != NULL); */
+
+ /* In GrAnSim we allocate pack buffers dynamically! -- HWL */
+ free(buffer->buffer);
+ free(buffer);
+
+ IF_GRAN_DEBUG(pack,
+ belch("PrintGraph of %p is:", graphroot); PrintGraph(graphroot,0));
+
+ return (graphroot);
+}
+#endif /* GRAN */
+
+//@node Aux fcts for packing, Printing Packet Contents, Unpacking routines, Graph packing
+//@subsection Aux fcts for packing
+
+//@menu
+//* Offset table::
+//* Packet size::
+//* Types of Global Addresses::
+//* Closure Info::
+//@end menu
+
+//@node Offset table, Packet size, Aux fcts for packing, Aux fcts for packing
+//@subsubsection Offset table
+
+/*
+ DonePacking is called when we've finished packing. It releases memory
+ etc. */
+
+//@cindex DonePacking
+
+# if defined(PAR)
+
+static void
+DonePacking(void)
+{
+ freeHashTable(offsetTable, NULL);
+ offsetTable = NULL;
+}
+
+/*
+ AmPacking records that the closure is being packed. Note the abuse of
+ the data field in the hash table -- this saves calling @malloc@! */
+
+//@cindex AmPacking
+
+static void
+AmPacking(closure)
+StgClosure *closure;
+{
+ insertHashTable(offsetTable, (StgWord) closure, (void *) (StgWord) pack_locn);
+}
+
+/*
+ OffsetFor returns an offset for a closure which is already being packed. */
+
+//@cindex OffsetFor
+
+static int
+OffsetFor(closure)
+StgClosure *closure;
+{
+ return (int) (StgWord) lookupHashTable(offsetTable, (StgWord) closure);
+}
+
+/*
+ NotYetPacking determines whether the closure's already being packed.
+ Offsets $<$ @PACK_HDR_SIZE@ (e.g. 0) mean no. */
+
+//@cindex NotYetPacking
+
+static rtsBool
+NotYetPacking(offset)
+int offset;
+{
+ return(offset == 0); // ToDo: what if root is found again?? FIX
+}
+
+# else /* GRAN */
+
+static void
+DonePacking(void)
+{
+ /* nothing */
+}
+
+/*
+ NotYetPacking searches through the whole pack buffer for closure. */
+
+static rtsBool
+NotYetPacking(closure)
+StgClosure *closure;
+{ nat i;
+ rtsBool found = rtsFalse;
+
+ for (i=0; (i<pack_locn) && !found; i++)
+ found = globalPackBuffer->buffer[i]==closure;
+
+ return (!found);
+}
+# endif
+
+//@node Packet size, Closure Info, Offset table, Aux fcts for packing
+//@subsubsection Packet size
+
+/*
+ The size needed if all currently queued closures are packed as FETCH_ME
+ closures. This represents the headroom we must have when packing the
+ buffer in order to maintain all links in the graphs.
+*/
+// ToDo: check and merge cases
+#if defined(PAR)
+static nat
+QueuedClosuresMinSize (nat ptrs) {
+ return ((clq_size - clq_pos) + ptrs) * PACK_FETCHME_SIZE;
+}
+#else /* GRAN */
+static nat
+QueuedClosuresMinSize (nat ptrs) {
+ return ((clq_size - clq_pos) + ptrs) * PACK_FETCHME_SIZE;
+}
+#endif
+
+/*
+ RoomToPack determines whether there's room to pack the closure into
+ the pack buffer based on
+
+ o how full the buffer is already,
+ o the closures' size and number of pointers (which must be packed as GAs),
+ o the size and number of pointers held by any primitive arrays that it
+ points to
+
+ It has a *side-effect* (naughty, naughty) in assigning roomInBuffer
+ to rtsFalse.
+*/
+
+//@cindex RoomToPack
+static rtsBool
+RoomToPack(size, ptrs)
+nat size, ptrs;
+{
+# if defined(PAR)
+ if (roomInBuffer &&
+ (pack_locn + // where we are in the buffer right now
+ size + // space needed for the current closure
+ QueuedClosuresMinSize(ptrs) // space for queued closures as FETCH_MEs
+ + 1 // headroom (DEBUGGING only)
+ >=
+ RTS_PACK_BUFFER_SIZE))
+ {
+ roomInBuffer = rtsFalse;
+ }
+# else /* GRAN */
+ if (roomInBuffer &&
+ (unpacked_size +
+ size +
+ QueuedClosuresMinSize(ptrs)
+ >=
+ RTS_PACK_BUFFER_SIZE))
+ {
+ roomInBuffer = rtsFalse;
+ }
+# endif
+ return (roomInBuffer);
+}
+
+//@node Closure Info, , Packet size, Aux fcts for packing
+//@subsubsection Closure Info
+
+/*
+ Closure Info
+
+ @get_closure_info@ determines the size, number of pointers etc. for this
+ type of closure -- see @SMInfoTables.lh@ for the legal info. types etc.
+
+[Can someone please keep this function up to date. I keep needing it
+ (or something similar) for interpretive code, and it keeps
+ bit-rotting. {\em It really belongs somewhere else too}. KH @@ 17/2/95] */
+
+#if 0
+
+// {Parallel.h}Daq ngoqvam vIroQpu'
+
+# if defined(GRAN) || defined(PAR)
+/* extracting specific info out of closure; currently only used in GRAN -- HWL */
+//@cindex get_closure_info
+StgInfoTable*
+get_closure_info(node, size, ptrs, nonptrs, vhs, info_hdr_ty)
+StgClosure* node;
+nat *size, *ptrs, *nonptrs, *vhs;
+char *info_hdr_ty;
+{
+ StgInfoTable *info;
+
+ info = get_itbl(node);
+ /* the switch shouldn't be necessary, really; just use default case */
+ switch (info->type) {
+#if 0
+ case CONSTR_1_0:
+ case THUNK_1_0:
+ case FUN_1_0:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) 1; // (info->layout.payload.ptrs);
+ *nonptrs = (nat) 0; // (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+
+ case CONSTR_0_1:
+ case THUNK_0_1:
+ case FUN_0_1:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) 0; // (info->layout.payload.ptrs);
+ *nonptrs = (nat) 1; // (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+
+ case CONSTR_2_0:
+ case THUNK_2_0:
+ case FUN_2_0:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) 2; // (info->layout.payload.ptrs);
+ *nonptrs = (nat) 0; // (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+
+ case CONSTR_1_1:
+ case THUNK_1_1:
+ case FUN_1_1:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) 1; // (info->layout.payload.ptrs);
+ *nonptrs = (nat) 1; // (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+
+ case CONSTR_0_2:
+ case THUNK_0_2:
+ case FUN_0_2:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) 0; // (info->layout.payload.ptrs);
+ *nonptrs = (nat) 2; // (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+#endif
+ case RBH:
+ {
+ StgInfoTable *rip = REVERT_INFOPTR(info); // closure to revert to
+ *size = sizeW_fromITBL(rip);
+ *ptrs = (nat) (rip->layout.payload.ptrs);
+ *nonptrs = (nat) (rip->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return rip; // NB: we return the reverted info ptr for a RBH!!!!!!
+ }
+
+ default:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) (info->layout.payload.ptrs);
+ *nonptrs = (nat) (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+ }
+}
+
+//@cindex IS_BLACK_HOLE
+rtsBool
+IS_BLACK_HOLE(StgClosure* node)
+{
+ StgInfoTable *info;
+ info = get_itbl(node);
+ return ((info->type == BLACKHOLE || info->type == RBH) ? rtsTrue : rtsFalse);
+}
+
+//@cindex IS_INDIRECTION
+StgClosure *
+IS_INDIRECTION(StgClosure* node)
+{
+ StgInfoTable *info;
+ info = get_itbl(node);
+ switch (info->type) {
+ case IND:
+ case IND_OLDGEN:
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_STATIC:
+ /* relies on indirectee being at same place for all these closure types */
+ return (((StgInd*)node) -> indirectee);
+ default:
+ return NULL;
+ }
+}
+
+/*
+rtsBool
+IS_THUNK(StgClosure* node)
+{
+ StgInfoTable *info;
+ info = get_itbl(node);
+ return ((info->type == THUNK ||
+ info->type == THUNK_STATIC ||
+ info->type == THUNK_SELECTOR) ? rtsTrue : rtsFalse);
+}
+*/
+
+# endif /* GRAN */
+#endif /* 0 */
+
+# if 0
+/* ngoq ngo' */
+
+P_
+get_closure_info(closure, size, ptrs, nonptrs, vhs, type)
+P_ closure;
+W_ *size, *ptrs, *nonptrs, *vhs;
+char *type;
+{
+ P_ ip = (P_) INFO_PTR(closure);
+
+ if (closure==NULL) {
+ fprintf(stderr, "Qagh {get_closure_info}Daq: NULL closure\n");
+ *size = *ptrs = *nonptrs = *vhs = 0;
+ strcpy(type,"ERROR in get_closure_info");
+ return;
+ } else if (closure==PrelBase_Z91Z93_closure) {
+ /* fprintf(stderr, "Qagh {get_closure_info}Daq: PrelBase_Z91Z93_closure closure\n"); */
+ *size = *ptrs = *nonptrs = *vhs = 0;
+ strcpy(type,"PrelBase_Z91Z93_closure");
+ return;
+ };
+
+ ip = (P_) INFO_PTR(closure);
+
+ switch (INFO_TYPE(ip)) {
+ case INFO_SPEC_U_TYPE:
+ case INFO_SPEC_S_TYPE:
+ case INFO_SPEC_N_TYPE:
+ *size = SPEC_CLOSURE_SIZE(closure);
+ *ptrs = SPEC_CLOSURE_NoPTRS(closure);
+ *nonptrs = SPEC_CLOSURE_NoNONPTRS(closure);
+ *vhs = 0 /*SPEC_VHS*/;
+ strcpy(type,"SPEC");
+ break;
+
+ case INFO_GEN_U_TYPE:
+ case INFO_GEN_S_TYPE:
+ case INFO_GEN_N_TYPE:
+ *size = GEN_CLOSURE_SIZE(closure);
+ *ptrs = GEN_CLOSURE_NoPTRS(closure);
+ *nonptrs = GEN_CLOSURE_NoNONPTRS(closure);
+ *vhs = GEN_VHS;
+ strcpy(type,"GEN");
+ break;
+
+ case INFO_DYN_TYPE:
+ *size = DYN_CLOSURE_SIZE(closure);
+ *ptrs = DYN_CLOSURE_NoPTRS(closure);
+ *nonptrs = DYN_CLOSURE_NoNONPTRS(closure);
+ *vhs = DYN_VHS;
+ strcpy(type,"DYN");
+ break;
+
+ case INFO_TUPLE_TYPE:
+ *size = TUPLE_CLOSURE_SIZE(closure);
+ *ptrs = TUPLE_CLOSURE_NoPTRS(closure);
+ *nonptrs = TUPLE_CLOSURE_NoNONPTRS(closure);
+ *vhs = TUPLE_VHS;
+ strcpy(type,"TUPLE");
+ break;
+
+ case INFO_DATA_TYPE:
+ *size = DATA_CLOSURE_SIZE(closure);
+ *ptrs = DATA_CLOSURE_NoPTRS(closure);
+ *nonptrs = DATA_CLOSURE_NoNONPTRS(closure);
+ *vhs = DATA_VHS;
+ strcpy(type,"DATA");
+ break;
+
+ case INFO_IMMUTUPLE_TYPE:
+ case INFO_MUTUPLE_TYPE:
+ *size = MUTUPLE_CLOSURE_SIZE(closure);
+ *ptrs = MUTUPLE_CLOSURE_NoPTRS(closure);
+ *nonptrs = MUTUPLE_CLOSURE_NoNONPTRS(closure);
+ *vhs = MUTUPLE_VHS;
+ strcpy(type,"(IM)MUTUPLE");
+ break;
+
+ case INFO_STATIC_TYPE:
+ *size = STATIC_CLOSURE_SIZE(closure);
+ *ptrs = STATIC_CLOSURE_NoPTRS(closure);
+ *nonptrs = STATIC_CLOSURE_NoNONPTRS(closure);
+ *vhs = STATIC_VHS;
+ strcpy(type,"STATIC");
+ break;
+
+ case INFO_CAF_TYPE:
+ case INFO_IND_TYPE:
+ *size = IND_CLOSURE_SIZE(closure);
+ *ptrs = IND_CLOSURE_NoPTRS(closure);
+ *nonptrs = IND_CLOSURE_NoNONPTRS(closure);
+ *vhs = IND_VHS;
+ strcpy(type,"CAF|IND");
+ break;
+
+ case INFO_CONST_TYPE:
+ *size = CONST_CLOSURE_SIZE(closure);
+ *ptrs = CONST_CLOSURE_NoPTRS(closure);
+ *nonptrs = CONST_CLOSURE_NoNONPTRS(closure);
+ *vhs = CONST_VHS;
+ strcpy(type,"CONST");
+ break;
+
+ case INFO_SPEC_RBH_TYPE:
+ *size = SPEC_RBH_CLOSURE_SIZE(closure);
+ *ptrs = SPEC_RBH_CLOSURE_NoPTRS(closure);
+ *nonptrs = SPEC_RBH_CLOSURE_NoNONPTRS(closure);
+ if (*ptrs <= 2) {
+ *nonptrs -= (2 - *ptrs);
+ *ptrs = 1;
+ } else
+ *ptrs -= 1;
+ *vhs = SPEC_RBH_VHS;
+ strcpy(type,"SPEC_RBH");
+ break;
+
+ case INFO_GEN_RBH_TYPE:
+ *size = GEN_RBH_CLOSURE_SIZE(closure);
+ *ptrs = GEN_RBH_CLOSURE_NoPTRS(closure);
+ *nonptrs = GEN_RBH_CLOSURE_NoNONPTRS(closure);
+ if (*ptrs <= 2) {
+ *nonptrs -= (2 - *ptrs);
+ *ptrs = 1;
+ } else
+ *ptrs -= 1;
+ *vhs = GEN_RBH_VHS;
+ strcpy(type,"GEN_RBH");
+ break;
+
+ case INFO_CHARLIKE_TYPE:
+ *size = CHARLIKE_CLOSURE_SIZE(closure);
+ *ptrs = CHARLIKE_CLOSURE_NoPTRS(closure);
+ *nonptrs = CHARLIKE_CLOSURE_NoNONPTRS(closure);
+ *vhs = CHARLIKE_VHS;
+ strcpy(type,"CHARLIKE");
+ break;
+
+ case INFO_INTLIKE_TYPE:
+ *size = INTLIKE_CLOSURE_SIZE(closure);
+ *ptrs = INTLIKE_CLOSURE_NoPTRS(closure);
+ *nonptrs = INTLIKE_CLOSURE_NoNONPTRS(closure);
+ *vhs = INTLIKE_VHS;
+ strcpy(type,"INTLIKE");
+ break;
+
+# if !defined(GRAN)
+ case INFO_FETCHME_TYPE:
+ *size = FETCHME_CLOSURE_SIZE(closure);
+ *ptrs = FETCHME_CLOSURE_NoPTRS(closure);
+ *nonptrs = FETCHME_CLOSURE_NoNONPTRS(closure);
+ *vhs = FETCHME_VHS;
+ strcpy(type,"FETCHME");
+ break;
+
+ case INFO_FMBQ_TYPE:
+ *size = FMBQ_CLOSURE_SIZE(closure);
+ *ptrs = FMBQ_CLOSURE_NoPTRS(closure);
+ *nonptrs = FMBQ_CLOSURE_NoNONPTRS(closure);
+ *vhs = FMBQ_VHS;
+ strcpy(type,"FMBQ");
+ break;
+# endif
+
+ case INFO_BQ_TYPE:
+ *size = BQ_CLOSURE_SIZE(closure);
+ *ptrs = BQ_CLOSURE_NoPTRS(closure);
+ *nonptrs = BQ_CLOSURE_NoNONPTRS(closure);
+ *vhs = BQ_VHS;
+ strcpy(type,"BQ");
+ break;
+
+ case INFO_BH_TYPE:
+ *size = BH_CLOSURE_SIZE(closure);
+ *ptrs = BH_CLOSURE_NoPTRS(closure);
+ *nonptrs = BH_CLOSURE_NoNONPTRS(closure);
+ *vhs = BH_VHS;
+ strcpy(type,"BH");
+ break;
+
+ case INFO_TSO_TYPE:
+ *size = 0; /* TSO_CLOSURE_SIZE(closure); */
+ *ptrs = 0; /* TSO_CLOSURE_NoPTRS(closure); */
+ *nonptrs = 0; /* TSO_CLOSURE_NoNONPTRS(closure); */
+ *vhs = TSO_VHS;
+ strcpy(type,"TSO");
+ break;
+
+ case INFO_STKO_TYPE:
+ *size = 0;
+ *ptrs = 0;
+ *nonptrs = 0;
+ *vhs = STKO_VHS;
+ strcpy(type,"STKO");
+ break;
+
+ default:
+ fprintf(stderr, "get_closure_info: Unexpected closure type (%lu), closure %lx\n",
+ INFO_TYPE(ip), (StgWord) closure);
+ EXIT(EXIT_FAILURE);
+ }
+
+ return ip;
+}
+# endif
+
+# if 0
+// Use allocate in Storage.c instead
+/*
+ @AllocateHeap@ will bump the heap pointer by @size@ words if the space
+ is available, but it will not perform garbage collection.
+ ToDo: check whether we can use an existing STG allocation routine -- HWL
+*/
+
+
+//@cindex AllocateHeap
+StgPtr
+AllocateHeap(size)
+nat size;
+{
+ StgPtr newClosure;
+
+ /* Allocate a new closure */
+ if (Hp + size > HpLim)
+ return NULL;
+
+ newClosure = Hp + 1;
+ Hp += size;
+
+ return newClosure;
+}
+# endif
+
+# if defined(PAR)
+
+//@cindex doGlobalGC
+void
+doGlobalGC(void)
+{
+ fprintf(stderr,"Splat -- we just hit global GC!\n");
+ stg_exit(EXIT_FAILURE);
+ //fishing = rtsFalse;
+ outstandingFishes--;
+}
+
+# endif /* PAR */
+
+//@node Printing Packet Contents, End of file, Aux fcts for packing, Graph packing
+//@subsection Printing Packet Contents
+/*
+ Printing Packet Contents
+ */
+
+#if defined(DEBUG) || defined(GRAN_CHECK)
+
+//@cindex PrintPacket
+
+#if defined(PAR)
+void
+PrintPacket(packBuffer)
+rtsPackBuffer *packBuffer;
+{
+ StgClosure *parent, *graphroot, *closure_start;
+ const StgInfoTable *ip;
+ globalAddr ga;
+ StgWord **bufptr, **slotptr;
+
+ nat bufsize;
+ nat pptr = 0, pptrs = 0, pvhs;
+ nat locn = 0;
+ nat i;
+ nat size, ptrs, nonptrs, vhs;
+ char str[80];
+
+ /* disable printing if a non-std globalisation scheme is used; ToDo: FIX */
+ if (RtsFlags.ParFlags.globalising != 0)
+ return;
+
+ /* NB: this whole routine is more or less a copy of UnpackGraph with all
+ unpacking components replaced by printing fcts
+ Long live higher-order fcts!
+ */
+ /* Initialisation */
+ //InitPackBuffer(); /* in case it isn't already init'd */
+ InitClosureQueue();
+ // ASSERT(gaga==PendingGABuffer);
+ graphroot = (StgClosure *)NULL;
+
+ /* Unpack the header */
+ bufsize = packBuffer->size;
+ bufptr = packBuffer->buffer;
+
+ fprintf(stderr, "*. Printing <<%d>> (buffer @ %p):\n",
+ packBuffer->id, packBuffer);
+ fprintf(stderr, "*. size: %d; unpacked_size: %d; tso: %p; buffer: %p\n",
+ packBuffer->size, packBuffer->unpacked_size,
+ packBuffer->tso, packBuffer->buffer);
+
+ parent = (StgClosure *)NULL;
+
+ do {
+ /* This is where we will ultimately save the closure's address */
+ slotptr = bufptr;
+ locn = slotptr-(packBuffer->buffer); // index of closure in buffer
+
+ /* First, unpack the next GA or PLC */
+ ga.weight = (rtsWeight) *bufptr++;
+
+ if (ga.weight == 2) { // unglobalised closure to follow
+ // nothing to do; closure starts at *bufptr
+ } else if (ga.weight > 0) { // fill in GA
+ ga.payload.gc.gtid = (GlobalTaskId) *bufptr++;
+ ga.payload.gc.slot = (int) *bufptr++;
+ } else
+ ga.payload.plc = (StgPtr) *bufptr++;
+
+ /* Now unpack the closure body, if there is one */
+ if (isFixed(&ga)) {
+ fprintf(stderr, "*. %u: PLC @ %p\n", locn, ga.payload.plc);
+ // closure = ga.payload.plc;
+ } else if (isOffset(&ga)) {
+ fprintf(stderr, "*. %u: OFFSET TO %d\n", locn, ga.payload.gc.slot);
+ // closure = (StgClosure *) buffer[ga.payload.gc.slot];
+ } else {
+ /* Print normal closures */
+
+ ASSERT(bufsize > 0);
+
+ fprintf(stderr, "*. %u: ((%x, %d, %x)) ", locn,
+ ga.payload.gc.gtid, ga.payload.gc.slot, ga.weight);
+
+ closure_start = (StgClosure*)bufptr;
+ ip = get_closure_info((StgClosure *)bufptr,
+ &size, &ptrs, &nonptrs, &vhs, str);
+
+ /* ToDo: check whether this is really needed */
+ if (ip->type == FETCH_ME || ip->type == REMOTE_REF) {
+ size = _HS;
+ ptrs = nonptrs = vhs = 0;
+ }
+ /* ToDo: check whether this is really needed */
+ if (ip->type == ARR_WORDS) {
+ ptrs = vhs = 0;
+ nonptrs = ((StgArrWords *)bufptr)->words;
+ size = arr_words_sizeW((StgArrWords *)bufptr);
+ }
+
+ /* special code for printing a PAP in a buffer */
+ if (ip->type == PAP || ip->type == AP_UPD) {
+ vhs = 3;
+ ptrs = 0;
+ nonptrs = (nat)((StgPAP *)bufptr)->payload[0];
+ size = _HS+vhs+ptrs+nonptrs;
+ }
+
+ /*
+ Remember, the generic closure layout is as follows:
+ +-------------------------------------------------+
+ | FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS |
+ +-------------------------------------------------+
+ */
+ /* Print fixed header */
+ fprintf(stderr, "FH [");
+ for (i = 0; i < _HS; i++)
+ fprintf(stderr, " %p", *bufptr++);
+
+ if (ip->type == FETCH_ME || ip->type == REMOTE_REF)
+ size = ptrs = nonptrs = vhs = 0;
+
+ // VH is always empty in the new RTS
+ ASSERT(vhs==0 ||
+ ip->type == PAP || ip->type == AP_UPD);
+ /* Print variable header */
+ fprintf(stderr, "] VH [");
+ for (i = 0; i < vhs; i++)
+ fprintf(stderr, " %p", *bufptr++);
+
+ //fprintf(stderr, "] %d PTRS [", ptrs);
+ /* Pointers will be filled in later */
+
+ fprintf(stderr, " ] (%d, %d) [", ptrs, nonptrs);
+ /* Print non-pointers */
+ for (i = 0; i < nonptrs; i++)
+ fprintf(stderr, " %p", *bufptr++);
+
+ fprintf(stderr, "] (%s)\n", str);
+
+ /* Indirections are never packed */
+ // ASSERT(INFO_PTR(graph) != (W_) Ind_info_TO_USE);
+
+ /* Add to queue for processing
+ When just printing the packet we do not have an unpacked closure
+ in hand, so we feed it the packet entry;
+ again, this assumes that at least the fixed header of the closure
+ has the same layout in the packet; also we may not overwrite entries
+ in the packet (done in Unpack), but for printing that's a bad idea
+ anyway */
+ QueueClosure((StgClosure *)closure_start);
+
+ /* No Common up needed for printing */
+
+ /* No Sort out the global address mapping for printing */
+
+ } /* normal closure case */
+
+ /* Locate next parent pointer */
+ pptr++;
+ while (pptr + 1 > pptrs) {
+ parent = DeQueueClosure();
+
+ if (parent == NULL)
+ break;
+ else {
+ (void) get_closure_info(parent, &size, &pptrs, &nonptrs,
+ &pvhs, str);
+ pptr = 0;
+ }
+ }
+ } while (parent != NULL);
+ fprintf(stderr, "*. --- End packet <<%d>> (claimed size=%d; real size=%d)---\n",
+ packBuffer->id, packBuffer->size, size);
+
+}
+
+/*
+ Doing a sanity check on a packet.
+ This does a full iteration over the packet, as in PrintPacket.
+*/
+//@cindex checkPacket
+void
+checkPacket(packBuffer)
+rtsPackBuffer *packBuffer;
+{
+ StgClosure *parent, *graphroot, *closure_start;
+ const StgInfoTable *ip;
+ globalAddr ga;
+ StgWord **bufptr, **slotptr;
+
+ nat bufsize;
+ nat pptr = 0, pptrs = 0, pvhs;
+ nat locn = 0;
+ nat size, ptrs, nonptrs, vhs;
+ char str[80];
+
+ /* NB: this whole routine is more or less a copy of UnpackGraph with all
+ unpacking components replaced by printing fcts
+ Long live higher-order fcts!
+ */
+ /* Initialisation */
+ //InitPackBuffer(); /* in case it isn't already init'd */
+ InitClosureQueue();
+ // ASSERT(gaga==PendingGABuffer);
+ graphroot = (StgClosure *)NULL;
+
+ /* Unpack the header */
+ bufsize = packBuffer->size;
+ bufptr = packBuffer->buffer;
+ parent = (StgClosure *)NULL;
+ ASSERT(bufsize > 0);
+ do {
+ /* check that we are not at the end of the buffer, yet */
+ IF_DEBUG(sanity, ASSERT(*bufptr != END_OF_BUFFER_MARKER));
+
+ /* This is where we will ultimately save the closure's address */
+ slotptr = bufptr;
+ locn = slotptr-(packBuffer->buffer); // index of closure in buffer
+ ASSERT(locn<=bufsize);
+
+ /* First, check whether we have a GA, a PLC, or an OFFSET at hand */
+ ga.weight = (rtsWeight) *bufptr++;
+
+ if (ga.weight == 2) { // unglobalised closure to follow
+ // nothing to do; closure starts at *bufptr
+ } else if (ga.weight > 0) { // fill in GA
+ ga.payload.gc.gtid = (GlobalTaskId) *bufptr++;
+ ga.payload.gc.slot = (int) *bufptr++;
+ } else
+ ga.payload.plc = (StgPtr) *bufptr++;
+
+ /* Now unpack the closure body, if there is one */
+ if (isFixed(&ga)) {
+ /* It's a PLC */
+ ASSERT(LOOKS_LIKE_STATIC(ga.payload.plc));
+ } else if (isOffset(&ga)) {
+ ASSERT(ga.payload.gc.slot<=(int)bufsize);
+ } else {
+ /* normal closure */
+ ASSERT(!RtsFlags.ParFlags.globalising==0 || LOOKS_LIKE_GA(&ga));
+
+ closure_start = (StgClosure*)bufptr;
+ ASSERT(LOOKS_LIKE_GHC_INFO((StgPtr)*bufptr));
+ ip = get_closure_info((StgClosure *)bufptr,
+ &size, &ptrs, &nonptrs, &vhs, str);
+
+ /* ToDo: check whether this is really needed */
+ if (ip->type == FETCH_ME || ip->type == REMOTE_REF) {
+ size = _HS;
+ ptrs = nonptrs = vhs = 0;
+ }
+ /* ToDo: check whether this is really needed */
+ if (ip->type == ARR_WORDS) {
+ ptrs = vhs = 0;
+ nonptrs = ((StgArrWords *)bufptr)->words+1; // payload+words
+ size = arr_words_sizeW((StgArrWords *)bufptr);
+ ASSERT(size==_HS+vhs+nonptrs);
+ }
+ /* special code for printing a PAP in a buffer */
+ if (ip->type == PAP || ip->type == AP_UPD) {
+ vhs = 3;
+ ptrs = 0;
+ nonptrs = (nat)((StgPAP *)bufptr)->payload[0];
+ size = _HS+vhs+ptrs+nonptrs;
+ }
+
+ /* no checks on contents of closure (pointers aren't packed anyway) */
+ ASSERT(_HS+vhs+nonptrs>=MIN_NONUPD_SIZE);
+ bufptr += _HS+vhs+nonptrs;
+
+ /* Add to queue for processing */
+ QueueClosure((StgClosure *)closure_start);
+
+ /* No Common up needed for checking */
+
+ /* No Sort out the global address mapping for checking */
+
+ } /* normal closure case */
+
+ /* Locate next parent pointer */
+ pptr++;
+ while (pptr + 1 > pptrs) {
+ parent = DeQueueClosure();
+
+ if (parent == NULL)
+ break;
+ else {
+ //ASSERT(LOOKS_LIKE_GHC_INFO((StgPtr)*parent));
+ (void) get_closure_info(parent, &size, &pptrs, &nonptrs,
+ &pvhs, str);
+ pptr = 0;
+ }
+ }
+ } while (parent != NULL);
+ /* we unpacked exactly as many words as there are in the buffer */
+ ASSERT(packBuffer->size == bufptr-(packBuffer->buffer));
+ /* check for magic end-of-buffer word */
+ IF_DEBUG(sanity, ASSERT(*bufptr == END_OF_BUFFER_MARKER));
+}
+#else /* GRAN */
+void
+PrintPacket(buffer)
+rtsPackBuffer *buffer;
+{
+ // extern char *info_hdr_type(P_ infoptr); /* defined in Threads.lc */
+ // extern char *display_info_type(P_ infoptr); /* defined in Threads.lc */
+
+ StgInfoTable *info;
+ nat size, ptrs, nonptrs, vhs;
+ char info_hdr_ty[80];
+ char str1[80], str2[80], junk_str[80];
+
+ /* globalAddr ga; */
+
+ nat bufsize, unpacked_size ;
+ StgClosure *parent;
+ nat pptr = 0, pptrs = 0, pvhs;
+
+ nat unpack_locn = 0;
+ nat gastart = unpack_locn;
+ nat closurestart = unpack_locn;
+
+ StgTSO *tso;
+ StgClosure *closure, *p;
+
+ nat i;
+
+ fprintf(stderr, "*** Printing <<%d>> (buffer @ %p):\n", buffer->id, buffer);
+ fprintf(stderr, " size: %d; unpacked_size: %d; tso: %d (%p); buffer: %p\n",
+ buffer->size, buffer->unpacked_size, buffer->tso, buffer->buffer);
+ fputs(" contents: ", stderr);
+ for (unpack_locn=0; unpack_locn<buffer->size; unpack_locn++) {
+ closure = buffer->buffer[unpack_locn];
+ fprintf(stderr, ", %p (%s)",
+ closure, info_type(closure));
+ }
+ fputc('\n', stderr);
+
+#if 0
+ /* traverse all elements of the graph; omitted for now, but might be usefule */
+ InitClosureQueue();
+
+ tso = buffer->tso;
+
+ /* Unpack the header */
+ unpacked_size = buffer->unpacked_size;
+ bufsize = buffer->size;
+
+ fprintf(stderr, "Packet %p, size %u (unpacked size is %u); demanded by TSO %d (%p)[PE %d]\n--- Begin ---\n",
+ buffer, bufsize, unpacked_size,
+ tso->id, tso, where_is((StgClosure*)tso));
+
+ do {
+ closurestart = unpack_locn;
+ closure = buffer->buffer[unpack_locn++];
+
+ fprintf(stderr, "[%u]: (%p) ", closurestart, closure);
+
+ info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str1);
+ strcpy(str2, str1);
+ fprintf(stderr, "(%s|%s) ", str1, str2);
+
+ if (info->type == FETCH_ME || info->type == FETCH_ME_BQ ||
+ IS_BLACK_HOLE(closure))
+ size = ptrs = nonptrs = vhs = 0;
+
+ if (closure_THUNK(closure)) {
+ if (closure_UNPOINTED(closure))
+ fputs("UNPOINTED ", stderr);
+ else
+ fputs("POINTED ", stderr);
+ }
+ if (IS_BLACK_HOLE(closure)) {
+ fputs("BLACK HOLE\n", stderr);
+ } else {
+ /* Fixed header */
+ fprintf(stderr, "FH [");
+ for (i = 0, p = (StgClosure*)&(closure->header); i < _HS; i++, p++)
+ fprintf(stderr, " %p", *p);
+
+ /* Variable header
+ if (vhs > 0) {
+ fprintf(stderr, "] VH [%p", closure->payload[_HS]);
+
+ for (i = 1; i < vhs; i++)
+ fprintf(stderr, " %p", closure->payload[_HS+i]);
+ }
+ */
+ fprintf(stderr, "] PTRS %u", ptrs);
+
+ /* Non-pointers */
+ if (nonptrs > 0) {
+ fprintf(stderr, " NPTRS [%p", closure->payload[_HS+vhs]);
+
+ for (i = 1; i < nonptrs; i++)
+ fprintf(stderr, " %p", closure->payload[_HS+vhs+i]);
+
+ putc(']', stderr);
+ }
+ putc('\n', stderr);
+ }
+ } while (unpack_locn<bufsize) ; /* (parent != NULL); */
+
+ fprintf(stderr, "--- End ---\n\n");
+#endif /* 0 */
+}
+#endif /* PAR */
+#endif /* DEBUG || GRAN_CHECK */
+
+#endif /* PAR || GRAN -- whole file */
+
+//@node End of file, , Printing Packet Contents, Graph packing
+//@subsection End of file
+
+//@index
+//* AllocateHeap:: @cindex\s-+AllocateHeap
+//* AmPacking:: @cindex\s-+AmPacking
+//* CommonUp:: @cindex\s-+CommonUp
+//* DeQueueClosure:: @cindex\s-+DeQueueClosure
+//* DeQueueClosure:: @cindex\s-+DeQueueClosure
+//* DonePacking:: @cindex\s-+DonePacking
+//* FillInClosure:: @cindex\s-+FillInClosure
+//* IS_BLACK_HOLE:: @cindex\s-+IS_BLACK_HOLE
+//* IS_INDIRECTION:: @cindex\s-+IS_INDIRECTION
+//* InitClosureQueue:: @cindex\s-+InitClosureQueue
+//* InitPendingGABuffer:: @cindex\s-+InitPendingGABuffer
+//* LocateNextParent:: @cindex\s-+LocateNextParent
+//* NotYetPacking:: @cindex\s-+NotYetPacking
+//* OffsetFor:: @cindex\s-+OffsetFor
+//* Pack:: @cindex\s-+Pack
+//* PackArray:: @cindex\s-+PackArray
+//* PackClosure:: @cindex\s-+PackClosure
+//* PackFetchMe:: @cindex\s-+PackFetchMe
+//* PackGeneric:: @cindex\s-+PackGeneric
+//* PackNearbyGraph:: @cindex\s-+PackNearbyGraph
+//* PackOneNode:: @cindex\s-+PackOneNode
+//* PackPAP:: @cindex\s-+PackPAP
+//* PackPLC:: @cindex\s-+PackPLC
+//* PackStkO:: @cindex\s-+PackStkO
+//* PackTSO:: @cindex\s-+PackTSO
+//* PendingGABuffer:: @cindex\s-+PendingGABuffer
+//* PrintPacket:: @cindex\s-+PrintPacket
+//* QueueClosure:: @cindex\s-+QueueClosure
+//* QueueEmpty:: @cindex\s-+QueueEmpty
+//* RoomToPack:: @cindex\s-+RoomToPack
+//* SetGAandCommonUp:: @cindex\s-+SetGAandCommonUp
+//* UnpackGA:: @cindex\s-+UnpackGA
+//* UnpackGraph:: @cindex\s-+UnpackGraph
+//* UnpackOffset:: @cindex\s-+UnpackOffset
+//* UnpackPLC:: @cindex\s-+UnpackPLC
+//* doGlobalGC:: @cindex\s-+doGlobalGC
+//* get_closure_info:: @cindex\s-+get_closure_info
+//* InitPackBuffer:: @cindex\s-+initPackBuffer
+//* isFixed:: @cindex\s-+isFixed
+//* isOffset:: @cindex\s-+isOffset
+//* offsetTable:: @cindex\s-+offsetTable
+//@end index
+
diff --git a/rts/parallel/ParInit.c b/rts/parallel/ParInit.c
new file mode 100644
index 0000000000..22c9119c89
--- /dev/null
+++ b/rts/parallel/ParInit.c
@@ -0,0 +1,322 @@
+/* --------------------------------------------------------------------------
+ Time-stamp: <Wed Mar 21 2001 16:37:16 Stardate: [-30]6363.46 hwloidl>
+
+ Initialising the parallel RTS
+
+ An extension based on Kevin Hammond's GRAPH for PVM version
+ P. Trinder, January 17th 1995.
+ Adapted for the new RTS
+ P. Trinder, July 1997.
+ H-W. Loidl, November 1999.
+
+ ------------------------------------------------------------------------ */
+
+#ifdef PAR /* whole file */
+
+//@menu
+//* Includes::
+//* Global variables::
+//* Initialisation Routines::
+//@end menu
+
+//@node Includes, Global variables
+//@subsection Includes
+
+/* Evidently not Posix */
+/* #include "PosixSource.h" */
+
+#include <setjmp.h>
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "ParallelRts.h"
+#include "Sparks.h"
+#include "LLC.h"
+#include "HLC.h"
+
+//@node Global variables, Initialisation Routines, Includes
+//@subsection Global variables
+
+/* Global conditions defined here. */
+
+rtsBool IAmMainThread = rtsFalse; /* Set for the main thread */
+
+/* Task identifiers for various interesting global tasks. */
+
+GlobalTaskId IOTask = 0, /* The IO Task Id */
+ SysManTask = 0, /* The System Manager Task Id */
+ mytid = 0; /* This PE's Task Id */
+
+rtsTime main_start_time; /* When the program started */
+rtsTime main_stop_time; /* When the program finished */
+jmp_buf exit_parallel_system; /* How to abort from the RTS */
+
+
+//rtsBool fishing = rtsFalse; /* We have no fish out in the stream */
+rtsTime last_fish_arrived_at = 0; /* Time of arrival of most recent fish*/
+nat outstandingFishes = 0; /* Number of active fishes */
+
+//@cindex spark queue
+/* GranSim: a globally visible array of spark queues */
+rtsSpark *pending_sparks_hd[SPARK_POOLS], /* ptr to start of a spark pool */
+ *pending_sparks_tl[SPARK_POOLS], /* ptr to end of a spark pool */
+ *pending_sparks_lim[SPARK_POOLS],
+ *pending_sparks_base[SPARK_POOLS];
+
+//@cindex spark_limit
+/* max number of sparks permitted on the PE;
+ see RtsFlags.ParFlags.maxLocalSparks */
+nat spark_limit[SPARK_POOLS];
+
+//@cindex PendingFetches
+/* A list of fetch reply messages not yet processed; this list is filled
+ by awaken_blocked_queue and processed by processFetches */
+StgBlockedFetch *PendingFetches = END_BF_QUEUE;
+
+//@cindex allPEs
+GlobalTaskId *allPEs;
+
+//@cindex nPEs
+nat nPEs = 0;
+
+//@cindex sparksIgnored
+nat sparksIgnored = 0, sparksCreated = 0,
+ threadsIgnored = 0, threadsCreated = 0;
+
+//@cindex advisory_thread_count
+nat advisory_thread_count = 0;
+
+globalAddr theGlobalFromGA;
+
+/* For flag handling see RtsFlags.h */
+
+//@node Prototypes
+//@subsection Prototypes
+
+/* Needed for FISH messages (initialisation of random number generator) */
+void srand48 (long);
+time_t time (time_t *);
+
+//@node Initialisation Routines, , Global variables
+//@subsection Initialisation Routines
+
+/*
+ par_exit defines how to terminate the program. If the exit code is
+ non-zero (i.e. an error has occurred), the PE should not halt until
+ outstanding error messages have been processed. Otherwise, messages
+ might be sent to non-existent Task Ids. The infinite loop will actually
+ terminate, since STG_Exception will call myexit\tr{(0)} when
+ it received a PP_FINISH from the system manager task.
+*/
+//@cindex shutdownParallelSystem
+void
+shutdownParallelSystem(StgInt n)
+{
+ /* use the file specified via -S */
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+
+ IF_PAR_DEBUG(verbose,
+ if (n==0)
+ belch("==== entered shutdownParallelSystem ...");
+ else
+ belch("==== entered shutdownParallelSystem (ERROR %d)...", n);
+ );
+
+ stopPEComms(n);
+
+#if 0
+ if (sf!=(FILE*)NULL)
+ fprintf(sf, "PE %x: %u sparks created, %u sparks Ignored, %u threads created, %u threads Ignored",
+ (W_) mytid, sparksCreated, sparksIgnored,
+ threadsCreated, threadsIgnored);
+#endif
+
+ ShutdownEachPEHook();
+}
+
+//@cindex initParallelSystem
+void
+initParallelSystem(void)
+{
+ /* Don't buffer standard channels... */
+ setbuf(stdout,NULL);
+ setbuf(stderr,NULL);
+
+ srand48(time(NULL) * getpid()); /* Initialise Random-number generator seed*/
+ /* used to select target of FISH message*/
+ if (!InitPackBuffer())
+ barf("InitPackBuffer");
+
+ if (!initMoreBuffers())
+ barf("initMoreBuffers");
+
+ if (!initSparkPools())
+ barf("initSparkPools");
+}
+
+/*
+ * SynchroniseSystem synchronises the reduction task with the system
+ * manager, and initialises the Global address tables (LAGA & GALA)
+ */
+
+//@cindex synchroniseSystem
+void
+synchroniseSystem(void)
+{
+ /* Only in debug mode? */
+ fprintf(stderr, "==== Starting parallel execution on %d processors ...\n", nPEs);
+
+ InitEachPEHook(); /* HWL: hook to be execed on each PE */
+
+ /* Initialize global address tables */
+ initGAtables();
+
+ initParallelSystem();
+
+ startPEComms();
+}
+
+/*
+ Do the startup stuff (this is PVM specific!).
+ Determines global vars: mytid, IAmMainThread, SysManTask, nPEs
+ Called at the beginning of RtsStartup.startupHaskell
+*/
+void
+startupParallelSystem(char *argv[]) {
+ mytid = pvm_mytid(); /* Connect to PVM */
+
+ if (*argv[0] == '-') { /* Look to see whether we're the Main Thread */
+ IAmMainThread = rtsTrue;
+ sscanf(argv[0],"-%0X",&SysManTask); /* extract SysMan task ID*/
+ argv++; /* Strip off flag argument */
+ } else {
+ SysManTask = pvm_parent();
+ }
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "==== [%x] %s PE located SysMan at %x\n",
+ mytid, IAmMainThread?"Main":"Remote", SysManTask));
+
+ nPEs = atoi(argv[1]);
+}
+
+/*
+ Exception handler during startup.
+*/
+void *
+processUnexpectedMessageDuringStartup(rtsPacket p) {
+ OpCode opCode;
+ GlobalTaskId sender_id;
+
+ getOpcodeAndSender(p, &opCode, &sender_id);
+
+ switch(opCode) {
+ case PP_FISH:
+ bounceFish();
+ break;
+#if defined(DIST)
+ case PP_REVAL:
+ bounceReval();
+ break;
+#endif
+ case PP_FINISH:
+ stg_exit(EXIT_SUCCESS);
+ break;
+ default:
+ fprintf(stderr,"== Task %x: Unexpected OpCode %x (%s) from %x in startPEComms\n",
+ mytid, opCode, getOpName(opCode), sender_id);
+ }
+}
+
+void
+startPEComms(void){
+
+ startUpPE();
+ allPEs = (GlobalTaskId *) stgMallocBytes(sizeof(GlobalTaskId) * MAX_PES,
+ "(PEs)");
+
+ /* Send our tid and IAmMainThread flag back to SysMan */
+ sendOp1(PP_READY, SysManTask, (StgWord)IAmMainThread);
+ /* Wait until we get the PE-Id table from Sysman */
+ waitForPEOp(PP_PETIDS, SysManTask, processUnexpectedMessageDuringStartup);
+
+ IF_PAR_DEBUG(verbose,
+ belch("==-- startPEComms: methinks we just received a PP_PETIDS message"));
+
+ /* Digest the PE table we received */
+ processPEtids();
+}
+
+void
+processPEtids(void) {
+ long newPE;
+ nat i, sentPEs, currentPEs;
+
+ nPEs=0;
+
+ currentPEs = nPEs;
+
+ IF_PAR_DEBUG(verbose,
+ belch("==-- processPEtids: starting to iterate over a PVM buffer"));
+ /* ToDo: this has to go into LLComms !!! */
+ GetArgs(&sentPEs,1);
+
+ ASSERT(sentPEs > currentPEs);
+ ASSERT(sentPEs < MAX_PES); /* enforced by SysMan too*/
+
+ for (i = 0; i < sentPEs; i++) {
+ GetArgs(&newPE,1);
+ if (i<currentPEs) {
+ ASSERT(newPE == allPEs[i]);
+ } else {
+#if defined(DIST)
+ // breaks with PAR && !DEBUG
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "[%x] registering %d'th %x\n", mytid, i, newPE));
+ if(!looks_like_tid(newPE))
+ barf("unacceptable taskID %x\n",newPE);
+#endif
+ allPEs[i] = newPE;
+ nPEs++;
+ registerTask(newPE);
+ }
+ }
+
+ IF_PAR_DEBUG(verbose,
+ /* debugging */
+ belch("++++ [%x] PE table as I see it:", mytid);
+ for (i = 0; i < sentPEs; i++) {
+ belch("++++ allPEs[%d] = %x", i, allPEs[i]);
+ });
+}
+
+void
+stopPEComms(StgInt n) {
+ if (n != 0) {
+ /* In case sysman doesn't know about us yet...
+ pvm_initsend(PvmDataDefault);
+ PutArgs(&IAmMainThread,1);
+ pvm_send(SysManTask, PP_READY);
+ */
+ sendOp(PP_READY, SysManTask);
+ }
+
+ sendOp2(PP_FINISH, SysManTask, n, n);
+ waitForPEOp(PP_FINISH, SysManTask, NULL);
+ fflush(gr_file);
+ shutDownPE();
+}
+
+#endif /* PAR -- whole file */
+
+//@index
+//* PendingFetches:: @cindex\s-+PendingFetches
+//* SynchroniseSystem:: @cindex\s-+SynchroniseSystem
+//* allPEs:: @cindex\s-+allPEs
+//* initParallelSystem:: @cindex\s-+initParallelSystem
+//* nPEs:: @cindex\s-+nPEs
+//* par_exit:: @cindex\s-+par_exit
+//* spark queue:: @cindex\s-+spark queue
+//* sparksIgnored:: @cindex\s-+sparksIgnored
+//@end index
+
diff --git a/rts/parallel/ParInit.h b/rts/parallel/ParInit.h
new file mode 100644
index 0000000000..a22a50bae6
--- /dev/null
+++ b/rts/parallel/ParInit.h
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ * ParInit.h,1
+ *
+ * Phil Trinder
+ * July 1998
+ *
+ * External Parallel Initialisation Interface
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PARINIT_H
+#define PARINIT_H
+
+extern void RunParallelSystem (P_);
+extern void initParallelSystem(void);
+extern void SynchroniseSystem(void);
+extern void par_exit(I_);
+
+#endif /* PARINIT_H */
diff --git a/rts/parallel/ParTicky.c b/rts/parallel/ParTicky.c
new file mode 100644
index 0000000000..347c2b8bca
--- /dev/null
+++ b/rts/parallel/ParTicky.c
@@ -0,0 +1,450 @@
+/* -------------------------------------------------------------------------
+ *
+ * (c) Hans-Wolfgang Loidl, 2000-
+ *
+ * Parallel ticky profiling, monitoring basic RTS operations in GUM.
+ * Similar in structure to TICKY_TICKY profiling, but doesn't need a
+ * separate way of building GHC.
+ *-------------------------------------------------------------------------- */
+
+#if defined(PAR) && defined(PAR_TICKY)
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+//#include "StoragePriv.h"
+//#include "MBlock.h"
+//#include "Schedule.h"
+#include "GC.h"
+#include "Stats.h"
+#include "ParTicky.h" // ToDo: move into Rts.h
+#include "ParallelRts.h"
+
+#if defined(PAR) && defined(HAVE_GETRUSAGE)
+#include <sys/resource.h>
+#endif
+
+/* external data */
+extern double ElapsedTimeStart;
+
+extern ullong GC_tot_alloc;
+extern ullong GC_tot_copied;
+
+extern lnat MaxResidency; /* in words; for stats only */
+extern lnat ResidencySamples; /* for stats only */
+
+/* ngIplu' {Stats.c}vo' */
+#define BIG_STRING_LEN 512
+
+/* ngIplu' {Ticky.c}vo' */
+#define INTAVG(a,b) ((b == 0) ? 0.0 : ((double) (a) / (double) (b)))
+#define PC(a) (100.0 * a)
+
+#define AVG(thing) \
+ StgDouble avg##thing = INTAVG(tot##thing,ctr##thing)
+
+
+#if 0
+void
+set_foo_time(double *x) {
+ *x = usertime();
+}
+
+double
+get_foo_time(double x) {
+ fprintf(stderr, "get_foo_time: %7.2f (%7.5f,%7.5f) \n",
+ usertime()-x,usertime(),x);
+ return (usertime()-x);
+}
+#endif
+
+static double start_time_GA = 0.0;
+static double start_mark = 0.0;
+static double start_pack = 0.0;
+static double start_unpack = 0.0;
+
+void
+par_ticky_Par_start (void) {
+# if !defined(HAVE_GETRUSAGE) || irix_HOST_OS || defined(_WIN32)
+ fprintf(stderr, "|| sorry don't have RUSAGE\n");
+ return ;
+# else
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+ struct rusage t;
+ double utime, stime;
+
+ if (RtsFlags.GcFlags.giveStats>1 && sf != NULL) {
+ getrusage(RUSAGE_SELF, &t);
+
+ utime = t.ru_utime.tv_sec + 1e-6*t.ru_utime.tv_usec;
+ stime = t.ru_stime.tv_sec + 1e-6*t.ru_stime.tv_usec;
+
+ fprintf(stderr, "|| user time: %5.2f; system time: %5.2f\n",
+ utime, stime);
+ fprintf(stderr, "|| max RSS: %ld; int SM size: %ld; int USM data size: %ld; int USS size: %ld\n",
+ t.ru_maxrss, t.ru_ixrss, t.ru_idrss, t.ru_isrss);
+ }
+#endif
+}
+
+#if 0
+FYI:
+ struct rusage
+ {
+ struct timeval ru_utime; /* user time used */
+ struct timeval ru_stime; /* system time used */
+ long ru_maxrss; /* maximum resident set size */
+ long ru_ixrss; /* integral shared memory size */
+ long ru_idrss; /* integral unshared data size */
+ long ru_isrss; /* integral unshared stack size */
+ long ru_minflt; /* page reclaims */
+ long ru_majflt; /* page faults */
+ long ru_nswap; /* swaps */
+ long ru_inblock; /* block input operations */
+ long ru_oublock; /* block output operations */
+ long ru_msgsnd; /* messages sent */
+ long ru_msgrcv; /* messages received */
+ long ru_nsignals; /* signals received */
+ long ru_nvcsw; /* voluntary context switches */
+ long ru_nivcsw; /* involuntary context switches */
+ };
+#endif
+
+
+void
+par_ticky_rebuildGAtables_start(void) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ //set_foo_time(&start_time_GA);
+ start_time_GA = usertime();
+ }
+}
+
+void
+par_ticky_rebuildGAtables_end(nat n, nat size_GA) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ static double foo = 0.0;
+ foo = usertime() - start_time_GA; // get_foo_time(start_time_GA);
+ globalParStats.cnt_rebuild_GA++;
+ globalParStats.tot_rebuild_GA += n;
+ if ( n > globalParStats.res_rebuild_GA )
+ globalParStats.res_rebuild_GA = n;
+ // fprintf(stderr, "rebuildGAtables: footime=%7.2f (%11.5f, %11.5f)\n",
+ // foo, usertime(), start_time_GA);
+ globalParStats.time_rebuild_GA += foo;
+ globalParStats.tot_size_GA += size_GA;
+ if ( size_GA > globalParStats.res_size_GA )
+ globalParStats.res_size_GA = size_GA;
+ }
+ // fprintf(stderr, ">> n: %d; size: %d;; tot: %d; res: %d\n",
+ // n, size_GA, globalParStats.tot_size_GA, globalParStats.res_size_GA);
+}
+
+void
+par_ticky_markLocalGAs_start(void) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ start_time_GA = usertime();
+ }
+}
+
+void
+par_ticky_markLocalGAs_end(nat n) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.cnt_mark_GA++;
+ globalParStats.tot_mark_GA += n;
+ if ( n > globalParStats.res_mark_GA )
+ globalParStats.res_mark_GA = n;
+ globalParStats.time_mark_GA += usertime() - start_time_GA;
+ }
+}
+
+void
+par_ticky_markSparkQueue_start(void) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ start_mark=usertime();
+ }
+}
+
+void
+par_ticky_markSparkQueue_end(nat n) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.time_sparks += usertime() - start_mark;
+
+ globalParStats.tot_sparks_marked += n;
+ if ( n > globalParStats.res_sparks_marked )
+ globalParStats.res_sparks_marked = n;
+ }
+}
+
+void
+par_ticky_PackNearbyGraph_start (void) {
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ start_pack=usertime();
+ }
+}
+
+void
+par_ticky_PackNearbyGraph_end(nat n, nat thunks) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.time_pack += usertime() - start_pack;
+
+ globalParStats.tot_packets++;
+ globalParStats.tot_packet_size += n;
+ if ( n > globalParStats.res_packet_size )
+ globalParStats.res_packet_size = n;
+ globalParStats.tot_thunks += thunks;
+ if ( thunks > globalParStats.res_thunks )
+ globalParStats.res_thunks = thunks;
+ }
+}
+
+void
+par_ticky_UnpackGraph_start (void) {
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ start_unpack=usertime();
+ }
+}
+
+void
+par_ticky_UnpackGraph_end(nat n, nat thunks) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.time_unpack += usertime() - start_unpack;
+
+ globalParStats.rec_packets++;
+ globalParStats.rec_packet_size += n;
+ /*
+ if ( n > globalParStats.res_packet_size )
+ globalParStats.res_packet_size = n;
+ */
+ globalParStats.rec_thunks += thunks;
+ /*
+ if ( thunks > globalParStats.res_thunks )
+ globalParStats.res_thunks = thunks;
+ */
+ }
+}
+
+void
+par_ticky_TP (void) {
+ StgSparkPool *pool;
+ nat tp_size, sp_size; // stats only
+
+ // Global stats gathering
+ /* the spark pool for the current PE */
+ pool = &(MainRegTable.rSparks); // generalise to cap = &MainRegTable
+
+ // Global statistics: residency of thread and spark pool
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ tp_size = run_queue_len() + 1; // add the TSO just poped
+ // No: there may be many blocked threads being awoken at the same time
+ // ASSERT(tp_size <= RtsFlags.ParFlags.maxThreads);
+ globalParStats.tot_tp += tp_size;
+ globalParStats.emp_tp += (tp_size==0) ? 1 : 0;
+ globalParStats.cnt_tp++;
+ if ( tp_size > globalParStats.res_tp)
+ globalParStats.res_tp = tp_size;
+ // fprintf(stderr, "run_queue_len() = %d (max %d)\n", run_queue_len(), globalParStats.res_tp);
+ sp_size = spark_queue_len(pool);
+ //ASSERT(sp_size <= RtsFlags.ParFlags.maxLocalSparks);
+ globalParStats.tot_sp += sp_size;
+ globalParStats.emp_sp += (sp_size==0) ? 1 : 0;
+ globalParStats.cnt_sp++;
+ if ( sp_size > globalParStats.res_sp)
+ globalParStats.res_sp = sp_size;
+ // fprintf(stderr, "spark_queue_len(pool) = %d (max %d)\n", spark_queue_len(pool), globalParStats.res_sp);
+ }
+}
+
+void
+globalParStat_exit(void)
+{
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+ double time, etime;
+
+ /* print only if GC stats is enabled, too; i.e. -sstderr */
+ if (!(RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS))
+ return;
+
+ time = usertime();
+ etime = elapsedtime() - ElapsedTimeStart;
+ // fprintf(stderr, "foo=%7.2f\n", time);
+
+ if (sf != NULL){
+ char temp[BIG_STRING_LEN];
+
+ // GC_tot_alloc += alloc;
+ fprintf(sf,"\n");
+
+ fprintf(sf, "%11d threads created\n",
+ globalParStats.tot_threads_created);
+ /*
+ Would need to add a ++ to the par macro to use this
+
+ fprintf(sf, "%11d sparks created\n",
+ globalParStats.tot_sparks_created);
+ fprintf(sf, "%11d sparks ignored\n",
+ globalParStats.tot_sparks_ignored);
+ */
+ ullong_format_string(globalParStats.res_tp, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s thread pool residency", temp);
+ fprintf(sf, " (avg: %3.2f; %d times (%2.2f%%) of %d empty)\n",
+ (double)globalParStats.tot_tp/(double)globalParStats.cnt_tp,
+ globalParStats.emp_tp,
+ globalParStats.emp_tp*100.0/(double)globalParStats.cnt_tp,
+ globalParStats.cnt_tp);
+ ullong_format_string(globalParStats.res_sp, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s spark pool residency", temp);
+
+ fprintf(sf, " (avg: %3.2f; %d times (%2.2f%%) of %d empty)\n",
+ (double)globalParStats.tot_sp/(double)globalParStats.cnt_sp,
+ globalParStats.emp_sp,
+ globalParStats.emp_sp*100.0/(double)globalParStats.cnt_sp,
+ globalParStats.cnt_sp);
+ //ullong_format_string(globalParStats.tot_fishes, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11d messages sent (%d fish, %d fetch, %d resume, %d schedule",
+ globalParStats.tot_fish_mess+globalParStats.tot_fetch_mess+
+ globalParStats.tot_resume_mess+globalParStats.tot_schedule_mess,
+ globalParStats.tot_fish_mess, globalParStats.tot_fetch_mess,
+ globalParStats.tot_resume_mess, globalParStats.tot_schedule_mess);
+#if defined(DIST)
+ fprintf(sf, "%d revals", globalParStats.tot_reval_mess);
+#endif
+ fprintf(sf,")\n");
+ fprintf(sf, "%11d messages received (%d fish, %d fetch, %d resume, %d schedule",
+ globalParStats.rec_fish_mess+globalParStats.rec_fetch_mess+
+ globalParStats.rec_resume_mess+globalParStats.rec_schedule_mess,
+ globalParStats.rec_fish_mess, globalParStats.rec_fetch_mess,
+ globalParStats.rec_resume_mess, globalParStats.rec_schedule_mess);
+#if defined(DIST)
+ fprintf(sf, "%d revals", globalParStats.rec_reval_mess);
+#endif
+ fprintf(sf,")\n\n");
+
+ ullong_format_string(globalParStats.tot_size_GA*sizeof(W_), temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s bytes of global heap in total ", temp);
+ fprintf(sf, "(%5.2f%% of total allocated heap)\n",
+ globalParStats.tot_size_GA*sizeof(W_)*100.0/(double)GC_tot_alloc*sizeof(W_));
+ ullong_format_string(globalParStats.res_size_GA*sizeof(W_), temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s bytes global heap residency ", temp);
+ fprintf(sf, "(%5.2f%% of max heap residency)\n",
+ globalParStats.res_size_GA*sizeof(W_)*100.0/(double)MaxResidency*sizeof(W_));
+
+ //ullong_format_string(globalParStats.res_mark_GA, temp, rtsTrue/*commas*/);
+ //fprintf(sf, "%11s GAs residency in GALA table ", temp);
+ // ullong_format_string(globalParStats.tot_mark_GA, temp, rtsTrue/*commas*/);
+ //fprintf(sf, "(avg %5.2f; %d samples)\n",
+ // (double)globalParStats.tot_mark_GA/(double)globalParStats.cnt_mark_GA,
+ // globalParStats.cnt_mark_GA);
+
+ ullong_format_string(globalParStats.local_alloc_GA, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s GAs locally allocated (calls to makeGlobal)\n", temp);
+
+ ullong_format_string(globalParStats.tot_rebuild_GA, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s live GAs in total (after rebuilding tables)\n", temp);
+ ullong_format_string(globalParStats.res_rebuild_GA, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s GAs residency (after rebuilding tables) ", temp);
+ fprintf(sf, "(avg %5.2f; %d samples)\n",
+ (double)globalParStats.tot_rebuild_GA/(double)globalParStats.cnt_rebuild_GA,
+ globalParStats.cnt_rebuild_GA);
+ ullong_format_string(globalParStats.res_free_GA, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s residency of freeing GAs", temp);
+ fprintf(sf, " (avg %5.2f; %d samples)\n",
+ (double)globalParStats.tot_free_GA/(double)globalParStats.cnt_free_GA,
+ globalParStats.cnt_free_GA);
+
+ fprintf(sf, "%11.2fs spent marking GAs (%7.2f%% of %7.2fs)\n",
+ globalParStats.time_mark_GA,
+ globalParStats.time_mark_GA*100./time, time);
+ fprintf(sf, "%11.2fs spent rebuilding GALA tables (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs)\n",
+ globalParStats.time_rebuild_GA,
+ globalParStats.time_rebuild_GA*100./time, time,
+ globalParStats.time_rebuild_GA*100./etime, etime);
+
+ ullong_format_string(globalParStats.tot_sparks_marked, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s sparks marked\t", temp);
+ ullong_format_string(globalParStats.res_sparks_marked, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%6s spark mark residency\n", temp);
+ fprintf(sf, "%11.2fs spent marking sparks (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
+ globalParStats.time_sparks,
+ globalParStats.time_sparks*100./time, time,
+ globalParStats.time_sparks*100./etime, etime);
+
+ fprintf(sf,"\n");
+
+ ullong_format_string(globalParStats.tot_packets, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s packets sent\n", temp);
+ ullong_format_string(globalParStats.tot_packet_size, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s bytes of graph sent in total (max %d; avg %.2f)\n",
+ temp, globalParStats.res_packet_size,
+ (double)globalParStats.tot_packet_size/(double)globalParStats.tot_packets);
+ ullong_format_string(globalParStats.tot_thunks, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s thunks sent in total (max %d; avg %.2f)\n",
+ temp, globalParStats.res_thunks,
+ (double)globalParStats.tot_thunks/(double)globalParStats.tot_packets);
+ fprintf(sf, "%11.2fs spent packing graph structures (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
+ globalParStats.time_pack,
+ globalParStats.time_pack*100./time, time,
+ globalParStats.time_pack*100./etime, etime);
+
+ ullong_format_string(globalParStats.rec_packets, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s packets received\n", temp);
+ ullong_format_string(globalParStats.rec_packet_size, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s bytes of graph received in total (max %d; avg %.2f)\n",
+ temp, globalParStats.rec_res_packet_size,
+ (double)globalParStats.rec_packet_size/(double)globalParStats.rec_packets);
+ ullong_format_string(globalParStats.rec_thunks, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s thunks received in total (max %d; avg %.2f)\n",
+ temp, globalParStats.rec_res_thunks,
+ (double)globalParStats.rec_thunks/(double)globalParStats.rec_packets);
+ fprintf(sf, "%11.2fs spent unpacking graph structures (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
+ globalParStats.time_unpack,
+ globalParStats.time_unpack*100./time, time,
+ globalParStats.time_unpack*100./etime, etime);
+
+ fprintf(sf,"\n");
+
+ ullong_format_string(globalParStats.tot_arrs, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s bytearrays sent; ", temp);
+ ullong_format_string(globalParStats.tot_arr_size, temp, rtsTrue/*commas*/);
+ fprintf(sf, " %s bytes in total (avg %.2f)\n",
+ temp,
+ (double)globalParStats.tot_arr_size/(double)globalParStats.tot_arrs);
+
+ fprintf(sf,"\n");
+
+ fprintf(sf, "%11d yields, %d stack overflows, %d heap overflows\n",
+ globalParStats.tot_yields, globalParStats.tot_stackover,
+ globalParStats.tot_heapover);
+
+ fprintf(sf,"\n");
+
+ //fprintf(stderr, "Printing this pathetic statistics took %7.2fs (start @ %7.2f)\n",
+ // usertime()-time, time);
+
+ fflush(sf);
+ // Open filehandle needed by other stats printing fcts
+ // fclose(sf);
+ }
+}
+
+#endif
+
diff --git a/rts/parallel/ParTicky.h b/rts/parallel/ParTicky.h
new file mode 100644
index 0000000000..1d6e7435c9
--- /dev/null
+++ b/rts/parallel/ParTicky.h
@@ -0,0 +1,60 @@
+/* --------------------------------------------------------------------------
+ *
+ * (c) Hans-Wolfgang Loidl, 2000-
+ *
+ * Header for ParTicky.c
+ *
+ * --------------------------------------------------------------------------*/
+
+#if defined(PAR_TICKY)
+
+/* macros */
+#define PAR_TICKY_PAR_START() par_ticky_Par_start ()
+#define PAR_TICKY_PAR_END() globalParStat_exit ()
+#define PAR_TICKY_REBUILD_GA_TABLES_START() par_ticky_rebuildGAtables_start()
+#define PAR_TICKY_REBUILD_GA_TABLES_END(n, size_GA) par_ticky_rebuildGAtables_end(n, size_GA)
+#define PAR_TICKY_MARK_LOCAL_GAS_START() par_ticky_markLocalGAs_start()
+#define PAR_TICKY_MARK_LOCAL_GAS_END(n) par_ticky_markLocalGAs_end(n)
+#define PAR_TICKY_MARK_SPARK_QUEUE_START() par_ticky_markSparkQueue_start()
+#define PAR_TICKY_MARK_SPARK_QUEUE_END(n) par_ticky_markSparkQueue_end(n)
+#define PAR_TICKY_PACK_NEARBY_GRAPH_START() (par_ticky_PackNearbyGraph_start())
+#define PAR_TICKY_PACK_NEARBY_GRAPH_END(n, thunks) par_ticky_PackNearbyGraph_end(n, thunks)
+#define PAR_TICKY_UNPACK_GRAPH_START() par_ticky_UnpackGraph_start()
+#define PAR_TICKY_UNPACK_GRAPH_END(n,thunks) par_ticky_UnpackGraph_end(n,thunks)
+#define PAR_TICKY_TP() par_ticky_TP()
+#define PAR_TICKY_CNT_FREE_GA() stats_CntFreeGA()
+
+/* prototypes */
+extern void par_ticky_Par_start (void) ;
+extern void par_ticky_rebuildGAtables_start(void) ;
+extern void par_ticky_rebuildGAtables_end(nat n, nat size_GA) ;
+extern void par_ticky_markLocalGAs_start(void) ;
+extern void par_ticky_markLocalGAs_end(nat n) ;
+extern void par_ticky_markSparkQueue_start(void) ;
+extern void par_ticky_markSparkQueue_end(nat n) ;
+extern void par_ticky_PackNearbyGraph_start (void) ;
+extern void par_ticky_PackNearbyGraph_end(nat n, nat thunks) ;
+extern void par_ticky_UnpackGraph_start (void) ;
+extern void par_ticky_UnpackGraph_end(nat n, nat thunks) ;
+extern void par_ticky_TP (void) ;
+extern void globalParStat_exit(void);
+
+#else
+
+#define PAR_TICKY_PAR_START()
+#define PAR_TICKY_PAR_END()
+#define PAR_TICKY_REBUILD_GA_TABLES_START()
+#define PAR_TICKY_REBUILD_GA_TABLES_END(n, size_GA)
+#define PAR_TICKY_MARK_LOCAL_GAS_START()
+#define PAR_TICKY_MARK_LOCAL_GAS_END(n)
+#define PAR_TICKY_MARK_SPARK_QUEUE_START()
+#define PAR_TICKY_MARK_SPARK_QUEUE_END(n)
+#define PAR_TICKY_PACK_NEARBY_GRAPH_START ()
+#define PAR_TICKY_PACK_NEARBY_GRAPH_END(n, thunks)
+#define PAR_TICKY_UNPACK_GRAPH_START ()
+#define PAR_TICKY_UNPACK_GRAPH_END(n, thunks)
+#define PAR_TICKY_TP ()
+#define PAR_TICKY_CNT_FREE_GA()
+
+#endif
+
diff --git a/rts/parallel/ParTypes.h b/rts/parallel/ParTypes.h
new file mode 100644
index 0000000000..910a6f2d99
--- /dev/null
+++ b/rts/parallel/ParTypes.h
@@ -0,0 +1,38 @@
+/* ---------------------------------------------------------------------------
+ * Time-stamp: <Tue Nov 09 1999 16:31:38 Stardate: [-30]3873.44 hwloidl>
+ *
+ * Runtime system types for GUM
+ *
+ * ------------------------------------------------------------------------- */
+
+#ifndef PARTYPES_H
+#define PARTYPES_H
+
+#ifdef PAR /* all of it */
+
+// now in Parallel.h
+//typedef struct hashtable HashTable;
+//typedef struct hashlist HashList;
+
+/* Global addresses now live in Parallel.h (needed in Closures.h) */
+// gaddr
+
+// now in Parallel.h
+/* (GA, LA) pairs
+typedef struct gala {
+ globalAddr ga;
+ StgPtr la;
+ struct gala *next;
+ rtsBool preferred;
+} rtsGaLa;
+*/
+
+#if defined(GRAN)
+typedef unsigned long TIME;
+typedef unsigned char Proc;
+typedef unsigned char EVTTYPE;
+#endif
+
+#endif /* PAR */
+
+#endif /* ! PARTYPES_H */
diff --git a/rts/parallel/Parallel.c b/rts/parallel/Parallel.c
new file mode 100644
index 0000000000..414b7e4406
--- /dev/null
+++ b/rts/parallel/Parallel.c
@@ -0,0 +1,1140 @@
+/*
+ Time-stamp: <Wed Mar 21 2001 16:42:40 Stardate: [-30]6363.48 hwloidl>
+
+ Basic functions for use in either GranSim or GUM.
+*/
+
+#if defined(GRAN) || defined(PAR) /* whole file */
+
+//@menu
+//* Includes::
+//* Variables and constants::
+//* Writing to the log-file::
+//* Global statistics::
+//* Dumping routines::
+//@end menu
+//*/ fool highlight
+
+//@node Includes, Variables and constants
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Storage.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+
+//@node Variables and constants, Writing to the log-file, Includes
+//@subsection Variables and constants
+
+/* Where to write the log file */
+FILE *gr_file = NULL;
+char gr_filename[STATS_FILENAME_MAXLEN];
+
+#if defined(PAR)
+/* Global statistics */
+GlobalParStats globalParStats;
+#endif
+
+#if defined(PAR)
+ullong startTime = 0;
+#endif
+
+#if defined(PAR) && !defined(DEBUG)
+// HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCKKKKKKKKKKKK
+// Definitely the wrong place for info_type in !DEBUG (see Printer.c) -- HWL
+
+static char *closure_type_names[] = {
+ "INVALID_OBJECT", /* 0 */
+ "CONSTR", /* 1 */
+ "CONSTR_1_0", /* 2 */
+ "CONSTR_0_1", /* 3 */
+ "CONSTR_2_0", /* 4 */
+ "CONSTR_1_1", /* 5 */
+ "CONSTR_0_2", /* 6 */
+ "CONSTR_INTLIKE", /* 7 */
+ "CONSTR_CHARLIKE", /* 8 */
+ "CONSTR_STATIC", /* 9 */
+ "CONSTR_NOCAF_STATIC", /* 10 */
+ "FUN", /* 11 */
+ "FUN_1_0", /* 12 */
+ "FUN_0_1", /* 13 */
+ "FUN_2_0", /* 14 */
+ "FUN_1_1", /* 15 */
+ "FUN_0_2", /* 16 */
+ "FUN_STATIC", /* 17 */
+ "THUNK", /* 18 */
+ "THUNK_1_0", /* 19 */
+ "THUNK_0_1", /* 20 */
+ "THUNK_2_0", /* 21 */
+ "THUNK_1_1", /* 22 */
+ "THUNK_0_2", /* 23 */
+ "THUNK_STATIC", /* 24 */
+ "THUNK_SELECTOR", /* 25 */
+ "BCO", /* 26 */
+ "AP_UPD", /* 27 */
+ "PAP", /* 28 */
+ "IND", /* 29 */
+ "IND_OLDGEN", /* 30 */
+ "IND_PERM", /* 31 */
+ "IND_OLDGEN_PERM", /* 32 */
+ "IND_STATIC", /* 33 */
+ "CAF_UNENTERED", /* 34 */
+ "CAF_ENTERED", /* 35 */
+ "CAF_BLACKHOLE", /* 36 */
+ "RET_BCO", /* 37 */
+ "RET_SMALL", /* 38 */
+ "RET_VEC_SMALL", /* 39 */
+ "RET_BIG", /* 40 */
+ "RET_VEC_BIG", /* 41 */
+ "RET_DYN", /* 42 */
+ "UPDATE_FRAME", /* 43 */
+ "CATCH_FRAME", /* 44 */
+ "STOP_FRAME", /* 45 */
+ "SEQ_FRAME", /* 46 */
+ "BLACKHOLE", /* 47 */
+ "BLACKHOLE_BQ", /* 48 */
+ "SE_BLACKHOLE", /* 49 */
+ "SE_CAF_BLACKHOLE", /* 50 */
+ "MVAR", /* 51 */
+ "ARR_WORDS", /* 52 */
+ "MUT_ARR_PTRS", /* 53 */
+ "MUT_ARR_PTRS_FROZEN", /* 54 */
+ "MUT_VAR", /* 55 */
+ "WEAK", /* 56 */
+ "FOREIGN", /* 57 */
+ "STABLE_NAME", /* 58 */
+ "TSO", /* 59 */
+ "BLOCKED_FETCH", /* 60 */
+ "FETCH_ME", /* 61 */
+ "FETCH_ME_BQ", /* 62 */
+ "RBH", /* 63 */
+ "EVACUATED", /* 64 */
+ "REMOTE_REF", /* 65 */
+ "N_CLOSURE_TYPES" /* 66 */
+};
+
+char *
+info_type(StgClosure *closure){
+ return closure_type_names[get_itbl(closure)->type];
+}
+
+char *
+info_type_by_ip(StgInfoTable *ip){
+ return closure_type_names[ip->type];
+}
+
+void
+info_hdr_type(StgClosure *closure, char *res){
+ strcpy(res,closure_type_names[get_itbl(closure)->type]);
+}
+#endif
+
+//@node Writing to the log-file, Global statistics, Variables and constants
+//@subsection Writing to the log-file
+/*
+ Writing to the log-file
+
+ These routines dump event-based info to the main log-file.
+ The code for writing log files is shared between GranSim and GUM.
+*/
+
+/*
+ * If you're not using GNUC and you're on a 32-bit machine, you're
+ * probably out of luck here. However, since CONCURRENT currently
+ * requires GNUC, I'm not too worried about it. --JSM
+ */
+
+//@cindex init_gr_simulation
+#if defined(GRAN)
+void
+init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv)
+char *prog_argv[], *rts_argv[];
+int prog_argc, rts_argc;
+{
+ nat i;
+ char *extension = RtsFlags.GranFlags.GranSimStats.Binary ? "gb" : "gr";
+
+ if (RtsFlags.GranFlags.GranSimStats.Global)
+ init_gr_stats();
+
+ /* init global constants for costs of basic operations */
+ gran_arith_cost = RtsFlags.GranFlags.Costs.arith_cost;
+ gran_branch_cost = RtsFlags.GranFlags.Costs.branch_cost;
+ gran_load_cost = RtsFlags.GranFlags.Costs.load_cost;
+ gran_store_cost = RtsFlags.GranFlags.Costs.store_cost;
+ gran_float_cost = RtsFlags.GranFlags.Costs.float_cost;
+
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+
+ if (!RtsFlags.GranFlags.GranSimStats.Full)
+ return;
+
+ sprintf(gr_filename, GR_FILENAME_FMT, prog_argv[0], extension);
+
+ if ((gr_file = fopen(gr_filename, "w")) == NULL) {
+ barf("Can't open granularity simulation report file %s\n",
+ gr_filename);
+ }
+
+ setbuf(gr_file, NULL); /* turn buffering off */
+
+ /* write header with program name, options and setup to gr_file */
+ fputs("Granularity Simulation for ", gr_file);
+ for (i = 0; i < prog_argc; ++i) {
+ fputs(prog_argv[i], gr_file);
+ fputc(' ', gr_file);
+ }
+
+ if (rts_argc > 0) {
+ fputs("+RTS ", gr_file);
+
+ for (i = 0; i < rts_argc; ++i) {
+ fputs(rts_argv[i], gr_file);
+ fputc(' ', gr_file);
+ }
+ }
+
+ fputs("\nStart time: ", gr_file);
+ fputs(time_str(), gr_file); /* defined in RtsUtils.c */
+ fputc('\n', gr_file);
+
+ fputs("\n\n--------------------\n\n", gr_file);
+
+ fputs("General Parameters:\n\n", gr_file);
+
+ if (RtsFlags.GranFlags.Light)
+ fprintf(gr_file, "GrAnSim-Light\nPEs infinite, %s Scheduler, %sMigrate Threads %s, %s\n",
+ RtsFlags.GranFlags.DoFairSchedule?"Fair":"Unfair",
+ RtsFlags.GranFlags.DoThreadMigration?"":"Don't ",
+ RtsFlags.GranFlags.DoThreadMigration && RtsFlags.GranFlags.DoStealThreadsFirst?" Before Sparks":"",
+ RtsFlags.GranFlags.DoAsyncFetch ? "Asynchronous Fetch" :
+ "Block on Fetch");
+ else
+ fprintf(gr_file, "PEs %u, %s Scheduler, %sMigrate Threads %s, %s\n",
+ RtsFlags.GranFlags.proc,RtsFlags.GranFlags.DoFairSchedule?"Fair":"Unfair",
+ RtsFlags.GranFlags.DoThreadMigration?"":"Don't ",
+ RtsFlags.GranFlags.DoThreadMigration && RtsFlags.GranFlags.DoStealThreadsFirst?" Before Sparks":"",
+ RtsFlags.GranFlags.DoAsyncFetch ? "Asynchronous Fetch" :
+ "Block on Fetch");
+
+ if (RtsFlags.GranFlags.DoBulkFetching)
+ if (RtsFlags.GranFlags.ThunksToPack)
+ fprintf(gr_file, "Bulk Fetching: Fetch %d Thunks in Each Packet (Packet Size = %d closures)\n",
+ RtsFlags.GranFlags.ThunksToPack,
+ RtsFlags.GranFlags.packBufferSize);
+ else
+ fprintf(gr_file, "Bulk Fetching: Fetch as many closures as possible (Packet Size = %d closures)\n",
+ RtsFlags.GranFlags.packBufferSize);
+ else
+ fprintf(gr_file, "Incremental Fetching: Fetch Exactly One Closure in Each Packet\n");
+
+ fprintf(gr_file, "Fetch Strategy(%u):If outstanding fetches %s\n",
+ RtsFlags.GranFlags.FetchStrategy,
+ RtsFlags.GranFlags.FetchStrategy==0 ?
+ " block (block-on-fetch)":
+ RtsFlags.GranFlags.FetchStrategy==1 ?
+ "only run runnable threads":
+ RtsFlags.GranFlags.FetchStrategy==2 ?
+ "create threads only from local sparks":
+ RtsFlags.GranFlags.FetchStrategy==3 ?
+ "create threads from local or global sparks":
+ RtsFlags.GranFlags.FetchStrategy==4 ?
+ "create sparks and steal threads if necessary":
+ "unknown");
+
+ if (RtsFlags.GranFlags.DoPrioritySparking)
+ fprintf(gr_file, "Priority Sparking (i.e. keep sparks ordered by priority)\n");
+
+ if (RtsFlags.GranFlags.DoPriorityScheduling)
+ fprintf(gr_file, "Priority Scheduling (i.e. keep threads ordered by priority)\n");
+
+ fprintf(gr_file, "Thread Creation Time %u, Thread Queue Time %u\n",
+ RtsFlags.GranFlags.Costs.threadcreatetime,
+ RtsFlags.GranFlags.Costs.threadqueuetime);
+ fprintf(gr_file, "Thread DeSchedule Time %u, Thread Schedule Time %u\n",
+ RtsFlags.GranFlags.Costs.threaddescheduletime,
+ RtsFlags.GranFlags.Costs.threadscheduletime);
+ fprintf(gr_file, "Thread Context-Switch Time %u\n",
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime);
+ fputs("\n\n--------------------\n\n", gr_file);
+
+ fputs("Communication Metrics:\n\n", gr_file);
+ fprintf(gr_file,
+ "Latency %u (1st) %u (rest), Fetch %u, Notify %u (Global) %u (Local)\n",
+ RtsFlags.GranFlags.Costs.latency,
+ RtsFlags.GranFlags.Costs.additional_latency,
+ RtsFlags.GranFlags.Costs.fetchtime,
+ RtsFlags.GranFlags.Costs.gunblocktime,
+ RtsFlags.GranFlags.Costs.lunblocktime);
+ fprintf(gr_file,
+ "Message Creation %u (+ %u after send), Message Read %u\n",
+ RtsFlags.GranFlags.Costs.mpacktime,
+ RtsFlags.GranFlags.Costs.mtidytime,
+ RtsFlags.GranFlags.Costs.munpacktime);
+ fputs("\n\n--------------------\n\n", gr_file);
+
+ fputs("Instruction Metrics:\n\n", gr_file);
+ fprintf(gr_file, "Arith %u, Branch %u, Load %u, Store %u, Float %u, Alloc %u\n",
+ RtsFlags.GranFlags.Costs.arith_cost,
+ RtsFlags.GranFlags.Costs.branch_cost,
+ RtsFlags.GranFlags.Costs.load_cost,
+ RtsFlags.GranFlags.Costs.store_cost,
+ RtsFlags.GranFlags.Costs.float_cost,
+ RtsFlags.GranFlags.Costs.heapalloc_cost);
+ fputs("\n\n++++++++++++++++++++\n\n", gr_file);
+
+# if 0
+ /* binary log files are currently not supported */
+ if (RtsFlags.GranFlags.GranSimStats.Binary)
+ grputw(sizeof(rtsTime));
+# endif
+
+ return (0);
+}
+
+#elif defined(PAR)
+
+void init_gr_stats (void);
+
+void
+init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv)
+char *prog_argv[], *rts_argv[];
+int prog_argc, rts_argc;
+{
+ nat i;
+ char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
+ char *extension = RtsFlags.ParFlags.ParStats.Binary ? "gb" : "gr";
+
+ sprintf(gr_filename, GR_FILENAME_FMT_GUM, prog_argv[0], thisPE, extension);
+
+ if (!RtsFlags.ParFlags.ParStats.Full)
+ return;
+
+ if (RtsFlags.ParFlags.ParStats.Global)
+ init_gr_stats();
+
+ if ((gr_file = fopen(gr_filename, "w")) == NULL)
+ barf("Can't open activity report file %s\n", gr_filename);
+
+ setbuf(gr_file, NULL); /* turn buffering off */
+
+ /* write header with program name, options and setup to gr_file */
+ for (i = 0; i < prog_argc; ++i) {
+ fputs(prog_argv[i], gr_file);
+ fputc(' ', gr_file);
+ }
+
+ if (rts_argc > 0) {
+ fputs("+RTS ", gr_file);
+
+ for (i = 0; i < rts_argc; ++i) {
+ fputs(rts_argv[i], gr_file);
+ fputc(' ', gr_file);
+ }
+ }
+ fputc('\n', gr_file);
+
+ /* record the absolute start time to allow synchronisation of log-files */
+ fputs("Start-Time: ", gr_file);
+ fputs(time_str(), gr_file);
+ fputc('\n', gr_file);
+
+ ASSERT(startTime==0);
+ // startTime = msTime();
+ startTime = CURRENT_TIME;
+ ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/);
+ fprintf(gr_file, "PE %2u [%s]: TIME\n", thisPE, time_string);
+
+# if 0
+ ngoq Dogh'q' vImuS
+ IF_PAR_DEBUG(verbose,
+ belch("== Start-time: %ld (%s)",
+ startTime, time_string));
+
+ if (startTime > LL(1000000000)) {
+ fprintf(gr_file, "PE %2u [%lu%lu]: TIME\n", thisPE,
+ (rtsTime) (startTime / LL(1000000000)),
+ (rtsTime) (startTime % LL(1000000000)));
+ } else {
+ fprintf(gr_file, "PE %2u [%lu]: TIME\n", thisPE, (TIME) startTime);
+ }
+ /* binary log files are currently not supported */
+ if (RtsFlags.GranFlags.GranSimStats.Binary)
+ grputw(sizeof(rtsTime));
+# endif
+
+ return;
+}
+
+void
+init_gr_stats (void) {
+ // memset(&globalParStats, '\0', sizeof(GlobalParStats));
+
+ globalParStats.tot_mark_GA = globalParStats.tot_rebuild_GA = globalParStats.tot_free_GA = globalParStats.res_mark_GA = globalParStats.res_rebuild_GA = globalParStats.res_free_GA = globalParStats.tot_size_GA = globalParStats.res_size_GA = globalParStats.tot_global = globalParStats.tot_local = 0;
+ globalParStats.cnt_mark_GA = globalParStats.cnt_rebuild_GA = globalParStats.cnt_free_GA = globalParStats.res_free_GA = globalParStats.local_alloc_GA = 0;
+
+ globalParStats.time_mark_GA = 0.0;
+ globalParStats.time_rebuild_GA = 0.0;
+ globalParStats.time_sparks = 0.0;
+ globalParStats.time_pack = 0.0;
+
+ globalParStats.res_sp = globalParStats.res_tp = globalParStats.tot_sp = globalParStats.tot_tp = globalParStats.cnt_sp = globalParStats.cnt_tp = globalParStats.emp_sp = globalParStats.emp_tp = 0;
+ globalParStats.tot_packets = globalParStats.tot_packet_size = globalParStats.tot_thunks = globalParStats.res_packet_size = globalParStats.res_thunks = globalParStats.rec_res_packet_size = globalParStats.rec_res_thunks = 0;
+
+ globalParStats.tot_fish_mess = globalParStats.tot_fetch_mess = globalParStats.tot_resume_mess = globalParStats.tot_schedule_mess = 0;
+ globalParStats.rec_fish_mess = globalParStats.rec_resume_mess = globalParStats.rec_schedule_mess = 0;
+ globalParStats.rec_fetch_mess = 0;
+#if defined(DIST)
+ globalParStats.tot_reval_mess = 0;
+ globalParStats.rec_reval_mess = 0;
+#endif
+
+ globalParStats.tot_threads_created = globalParStats.tot_sparks_created = globalParStats.tot_sparks_ignored = globalParStats.tot_sparks_marked = globalParStats.res_sparks_created = globalParStats.res_sparks_ignored = globalParStats.res_sparks_marked = 0;
+ globalParStats.tot_yields = globalParStats.tot_stackover = globalParStats.tot_heapover = 0;
+
+ globalParStats.tot_arrs = globalParStats.tot_arr_size = 0;
+}
+
+#endif /* PAR */
+
+//@cindex end_gr_simulation
+#if defined(GRAN)
+void
+end_gr_simulation(void)
+{
+ char time_string[TIME_STR_LEN];
+
+ ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/);
+
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+
+ /* Print event stats */
+ if (RtsFlags.GranFlags.GranSimStats.Global) {
+ nat i;
+
+ fprintf(stderr,"Total yields: %d\n",
+ globalGranStats.tot_yields);
+
+ fprintf(stderr,"Total number of threads created: %d ; per PE:\n",
+ globalGranStats.tot_threads_created);
+ for (i=0; i<RtsFlags.GranFlags.proc; i++) {
+ fprintf(stderr," PE %d: %d\t",
+ i, globalGranStats.threads_created_on_PE[i]);
+ if (i+1 % 4 == 0) fprintf(stderr,"\n");
+ }
+ if (RtsFlags.GranFlags.proc+1 % 4 != 0) fprintf(stderr,"\n");
+ fprintf(stderr,"Total number of threads migrated: %d\n",
+ globalGranStats.tot_TSOs_migrated);
+
+ fprintf(stderr,"Total number of sparks created: %d ; per PE:\n",
+ globalGranStats.tot_sparks_created);
+ for (i=0; i<RtsFlags.GranFlags.proc; i++) {
+ fprintf(stderr," PE %d: %d\t",
+ i, globalGranStats.sparks_created_on_PE[i]);
+ if (i+1 % 4 == 0) fprintf(stderr,"\n");
+ }
+ if (RtsFlags.GranFlags.proc+1 % 4 != 0) fprintf(stderr,"\n");
+
+ fprintf(stderr,"Event statistics (number of events: %d):\n",
+ globalGranStats.noOfEvents);
+ for (i=0; i<=MAX_EVENT; i++) {
+ fprintf(stderr," %s (%d): \t%d \t%f%%\t%f%%\n",
+ event_names[i],i,globalGranStats.event_counts[i],
+ (float)(100*globalGranStats.event_counts[i])/(float)(globalGranStats.noOfEvents),
+ (i==ContinueThread ? 0.0 :
+ (float)(100*(globalGranStats.event_counts[i])/(float)(globalGranStats.noOfEvents-globalGranStats.event_counts[ContinueThread])) ));
+ }
+ fprintf(stderr,"Randomized steals: %ld sparks, %ld threads \n \t(Sparks: #%u (avg ntimes=%f; avg fl=%f)\n\t(Threads: %ld)",
+ globalGranStats.rs_sp_count,
+ globalGranStats.rs_t_count,
+ globalGranStats.no_of_steals,
+ (float)globalGranStats.ntimes_total/(float)stg_max(globalGranStats.no_of_steals,1),
+ (float)globalGranStats.fl_total/(float)stg_max(globalGranStats.no_of_steals,1),
+ globalGranStats.no_of_migrates);
+ fprintf(stderr,"Moved sparks: %d Withered sparks: %d (%.2f %%)\n",
+ globalGranStats.tot_sparks, globalGranStats.withered_sparks,
+ ( globalGranStats.tot_sparks == 0 ? 0 :
+ (float)(100*globalGranStats.withered_sparks)/(float)(globalGranStats.tot_sparks)) );
+ /* Print statistics about priority sparking */
+ if (RtsFlags.GranFlags.DoPrioritySparking) {
+ fprintf(stderr,"About Priority Sparking:\n");
+ fprintf(stderr," Total no. NewThreads: %d Avg. spark queue len: %.2f \n", globalGranStats.tot_sq_probes, (float)globalGranStats.tot_sq_len/(float)globalGranStats.tot_sq_probes);
+ }
+ /* Print statistics about priority sparking */
+ if (RtsFlags.GranFlags.DoPriorityScheduling) {
+ fprintf(stderr,"About Priority Scheduling:\n");
+ fprintf(stderr," Total no. of StartThreads: %d (non-end: %d) Avg. thread queue len: %.2f\n",
+ globalGranStats.tot_add_threads, globalGranStats.non_end_add_threads,
+ (float)globalGranStats.tot_tq_len/(float)globalGranStats.tot_add_threads);
+ }
+ /* Blocking queue statistics */
+ if (1) {
+ fprintf(stderr,"Blocking queue statistcs:\n");
+ fprintf(stderr," Total no. of FMBQs generated: %d\n",
+ globalGranStats.tot_FMBQs);
+ fprintf(stderr," Total no. of bqs awakened: %d\n",
+ globalGranStats.tot_awbq);
+ fprintf(stderr," Total length of all bqs: %d\tAvg length of bqs: %.2f\n",
+ globalGranStats.tot_bq_len, (float)globalGranStats.tot_bq_len/(float)globalGranStats.tot_awbq);
+ fprintf(stderr," Percentage of local TSOs in BQs: %.2f\n",
+ (float)globalGranStats.tot_bq_len*100.0/(float)globalGranStats.tot_bq_len);
+ fprintf(stderr," Total time spent processing BQs: %lx\n",
+ globalGranStats.tot_bq_processing_time);
+ }
+
+ /* Fetch misses and thunk stealing */
+ fprintf(stderr,"Number of fetch misses: %d\n",
+ globalGranStats.fetch_misses);
+
+ /* Print packet statistics if GUMM fetching is turned on */
+ if (RtsFlags.GranFlags.DoBulkFetching) {
+ fprintf(stderr,"Packet statistcs:\n");
+ fprintf(stderr," Total no. of packets: %d Avg. packet size: %.2f \n", globalGranStats.tot_packets, (float)globalGranStats.tot_packet_size/(float)globalGranStats.tot_packets);
+ fprintf(stderr," Total no. of thunks: %d Avg. thunks/packet: %.2f \n", globalGranStats.tot_thunks, (float)globalGranStats.tot_thunks/(float)globalGranStats.tot_packets);
+ fprintf(stderr," Total no. of cuts: %d Avg. cuts/packet: %.2f\n", globalGranStats.tot_cuts, (float)globalGranStats.tot_cuts/(float)globalGranStats.tot_packets);
+ /*
+ if (closure_queue_overflows>0)
+ fprintf(stderr," Number of closure queue overflows: %u\n",
+ closure_queue_overflows);
+ */
+ }
+ } /* RtsFlags.GranFlags.GranSimStats.Global */
+
+# if defined(GRAN_COUNT)
+# error "GRAN_COUNT not supported; should be parallel ticky profiling, really"
+ fprintf(stderr,"Update count statistics:\n");
+ fprintf(stderr," Total number of updates: %u\n",nUPDs);
+ fprintf(stderr," Needed to awaken BQ: %u with avg BQ len of: %f\n",
+ nUPDs_BQ,(float)BQ_lens/(float)nUPDs_BQ);
+ fprintf(stderr," Number of PAPs: %u\n",nPAPs);
+# endif
+
+ fprintf(stderr, "Simulation finished after @ %s @ cycles. %d sparks created, %d sparks ignored. Check %s for details.\n",
+ time_string, sparksCreated, sparksIgnored, gr_filename);
+
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ fclose(gr_file);
+}
+
+#elif defined(PAR)
+
+/*
+ Under GUM we print only one line.
+*/
+void
+end_gr_simulation(void)
+{
+ char time_string[TIME_STR_LEN];
+
+ ullong_format_string(CURRENT_TIME-startTime, time_string, rtsFalse/*no commas!*/);
+
+ fprintf(stderr, "Computation finished after @ %s @ ms. %d sparks created, %d sparks ignored. Check %s for details.\n",
+ time_string, sparksCreated, sparksIgnored, gr_filename);
+
+ if (RtsFlags.ParFlags.ParStats.Full)
+ fclose(gr_file);
+}
+#endif /* PAR */
+
+//@node Global statistics, Dumping routines, Writing to the log-file
+//@subsection Global statistics
+/*
+ Called at the end of execution
+*/
+
+//@node Dumping routines, , Global statistics
+//@subsection Dumping routines
+
+//@cindex DumpGranEvent
+void
+DumpGranEvent(name, tso)
+GranEventType name;
+StgTSO *tso;
+{
+ DumpRawGranEvent(CURRENT_PROC, (PEs)0, name, tso, &stg_END_TSO_QUEUE_closure, (StgInt)0, (StgInt)0);
+}
+
+//@cindex DumpRawGranEvent
+void
+DumpRawGranEvent(proc, p, name, tso, node, sparkname, len)
+PEs proc, p; /* proc ... where it happens; p ... where node lives */
+GranEventType name;
+StgTSO *tso;
+StgClosure *node;
+StgInt sparkname, len;
+{
+# if defined(GRAN)
+ DumpVeryRawGranEvent(TIME_ON_PROC(proc),
+ proc, p, name, tso, node, sparkname, len);
+# elif defined(PAR)
+ DumpVeryRawGranEvent(CURRENT_TIME,
+ proc, p, name, tso, node, sparkname, len);
+# endif
+}
+
+//@cindex DumpVeryRawGranEvent
+void
+DumpVeryRawGranEvent(time, proc, p, name, tso, node, sparkname, len)
+rtsTime time;
+PEs proc, p; /* proc ... where it happens; p ... where node lives */
+GranEventType name;
+StgTSO *tso;
+StgClosure *node;
+StgInt sparkname, len;
+{
+ FILE *output_file; // DEBUGGING ONLY !!!!!!!!!!!!!!!!!!!!!!!!!1
+ StgWord id;
+ char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
+# if defined(GRAN)
+ ullong_format_string(time,
+ time_string, rtsFalse/*no commas!*/);
+# elif defined(PAR)
+ ullong_format_string(time,
+ time_string, rtsFalse/*no commas!*/);
+# endif
+ output_file = gr_file;
+
+# if defined(GRAN)
+
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ ASSERT(output_file!=NULL);
+
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+# elif defined(PAR)
+
+ if (RtsFlags.ParFlags.ParStats.Full)
+ ASSERT(output_file!=NULL);
+
+ if (RtsFlags.ParFlags.ParStats.Suppressed)
+ return;
+
+# endif
+
+ id = tso == NULL ? -1 : tso->id;
+ if (node==stgCast(StgClosure*,&stg_END_TSO_QUEUE_closure))
+ strcpy(node_str,"________"); /* "END_TSO_QUEUE"); */
+ else
+ sprintf(node_str,"0x%-6lx",node);
+
+ if (name > GR_EVENT_MAX)
+ name = GR_EVENT_MAX;
+
+ if (BINARY_STATS)
+ barf("binary log files not yet supported");
+#if 0
+ /* ToDo: fix code for writing binary GrAnSim statistics */
+ switch (name) {
+ case GR_START:
+ case GR_STARTQ:
+ grputw(name);
+ grputw(proc);
+ abort(); /* die please: a single word */
+ /* doesn't represent long long times */
+ grputw(TIME_ON_PROC(proc));
+ grputw((StgWord)node);
+ break;
+ case GR_FETCH:
+ case GR_REPLY:
+ case GR_BLOCK:
+ grputw(name);
+ grputw(proc);
+ abort(); /* die please: a single word */
+ /* doesn't represent long long times */
+ grputw(TIME_ON_PROC(proc)); /* this line is bound to */
+ grputw(id); /* do the wrong thing */
+ break;
+ default:
+ grputw(name);
+ grputw(proc);
+ abort(); /* die please: a single word */
+ /* doesn't represent long long times */
+ grputw(TIME_ON_PROC(proc));
+ grputw((StgWord)node);
+ }
+#endif
+ else /* !BINARY_STATS */
+ switch (name) {
+ case GR_START:
+ case GR_STARTQ:
+ fprintf(output_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[SN %u]\t[sparks %u]\n",
+ proc,time_string,gran_event_names[name],
+ id,node_str,sparkname,len);
+ break;
+ case GR_FETCH:
+ case GR_REPLY:
+ case GR_BLOCK:
+ case GR_STOLEN:
+ case GR_STOLENQ:
+ case GR_STEALING:
+ fprintf(output_file, "PE %2u [%s]: %-9s\t%lx \t%s\t(from %2u)\n",
+ proc, time_string, gran_event_names[name],
+ id,node_str,p);
+ break;
+ case GR_RESUME:
+ case GR_RESUMEQ:
+ case GR_SCHEDULE:
+ case GR_DESCHEDULE:
+ fprintf(output_file,"PE %2u [%s]: %-9s\t%lx \n",
+ proc,time_string,gran_event_names[name],id);
+ break;
+ case GR_ALLOC:
+ fprintf(output_file,"PE %2u [%s]: %-9s\t%lx\t \tallocating %u words\n",
+ proc,time_string,gran_event_names[name],id,len);
+ break;
+ default:
+ fprintf(output_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[sparks %u]\n",
+ proc,time_string,gran_event_names[name],id,node_str,len);
+ }
+}
+
+//@cindex DumpGranInfo
+void
+DumpEndEvent(proc, tso, mandatory_thread)
+PEs proc;
+StgTSO *tso;
+rtsBool mandatory_thread;
+{
+ FILE *output_file; // DEBUGGING ONLY !!!!!!!!!!!!!!!!!!!!!!!!!1
+ char time_string[TIME_STR_LEN];
+# if defined(GRAN)
+ ullong_format_string(TIME_ON_PROC(proc),
+ time_string, rtsFalse/*no commas!*/);
+# elif defined(PAR)
+ ullong_format_string(CURRENT_TIME,
+ time_string, rtsFalse/*no commas!*/);
+# endif
+
+ output_file = gr_file;
+ ASSERT(output_file!=NULL);
+#if defined(GRAN)
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+#endif
+
+ if (BINARY_STATS) {
+ barf("binary log files not yet supported");
+#if 0
+ grputw(GR_END);
+ grputw(proc);
+ abort(); /* die please: a single word doesn't represent long long times */
+ grputw(CURRENT_TIME); /* this line is bound to fail */
+ grputw(tso->id);
+#ifdef PAR
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+#else
+ grputw(tso->gran.sparkname);
+ grputw(tso->gran.startedat);
+ grputw(tso->gran.exported);
+ grputw(tso->gran.basicblocks);
+ grputw(tso->gran.allocs);
+ grputw(tso->gran.exectime);
+ grputw(tso->gran.blocktime);
+ grputw(tso->gran.blockcount);
+ grputw(tso->gran.fetchtime);
+ grputw(tso->gran.fetchcount);
+ grputw(tso->gran.localsparks);
+ grputw(tso->gran.globalsparks);
+#endif
+ grputw(mandatory_thread);
+#endif /* 0 */
+ } else {
+
+ /*
+ * NB: DumpGranEvent cannot be used because PE may be wrong
+ * (as well as the extra info)
+ */
+ fprintf(output_file, "PE %2u [%s]: END %lx, SN %u, ST %lu, EXP %s, BB %u, HA %u, RT %u, BT %u (%u), FT %u (%u), LS %u, GS %u, MY %s\n"
+ ,proc
+ ,time_string
+ ,tso->id
+#if defined(GRAN)
+ ,tso->gran.sparkname
+ ,tso->gran.startedat
+ ,((tso->gran.exported) ? 'T' : 'F')
+ ,tso->gran.basicblocks
+ ,tso->gran.allocs
+ ,tso->gran.exectime
+ ,tso->gran.blocktime
+ ,tso->gran.blockcount
+ ,tso->gran.fetchtime
+ ,tso->gran.fetchcount
+ ,tso->gran.localsparks
+ ,tso->gran.globalsparks
+#elif defined(PAR)
+ ,tso->par.sparkname
+ ,tso->par.startedat
+ ,(tso->par.exported) ? "T" : "F"
+ ,tso->par.basicblocks
+ ,tso->par.allocs
+ ,tso->par.exectime
+ ,tso->par.blocktime
+ ,tso->par.blockcount
+ ,tso->par.fetchtime
+ ,tso->par.fetchcount
+ ,tso->par.localsparks
+ ,tso->par.globalsparks
+#endif
+ ,(mandatory_thread ? "T" : "F")
+ );
+ }
+}
+
+//@cindex DumpTSO
+void
+DumpTSO(tso)
+StgTSO *tso;
+{
+ FILE *output_file; // DEBUGGING ONLY !!!!!!!!!!!!!!!!!!!!!!!!!1
+
+ output_file = gr_file;
+ ASSERT(output_file!=NULL);
+ fprintf(stderr,"TSO 0x%lx, NAME 0x%lx, ID %u, LINK 0x%lx, TYPE %s\n"
+ ,tso
+#if defined(GRAN)
+ ,tso->gran.sparkname
+#elif defined(PAR)
+ ,tso->par.sparkname
+#endif
+ ,tso->id
+ ,tso->link
+ ,/*tso->state==T_MAIN?"MAIN":
+ TSO_TYPE(tso)==T_FAIL?"FAIL":
+ TSO_TYPE(tso)==T_REQUIRED?"REQUIRED":
+ TSO_TYPE(tso)==T_ADVISORY?"ADVISORY":
+ */
+ "???"
+ );
+
+ fprintf(output_file,"TSO %lx: SN %u, ST %u, GBL %c, BB %u, HA %u, RT %u, BT %u (%u), FT %u (%u) LS %u, GS %u\n"
+ ,tso->id
+#if defined(GRAN)
+ ,tso->gran.sparkname
+ ,tso->gran.startedat
+ ,tso->gran.exported?'T':'F'
+ ,tso->gran.basicblocks
+ ,tso->gran.allocs
+ ,tso->gran.exectime
+ ,tso->gran.blocktime
+ ,tso->gran.blockcount
+ ,tso->gran.fetchtime
+ ,tso->gran.fetchcount
+ ,tso->gran.localsparks
+ ,tso->gran.globalsparks
+#elif defined(PAR)
+ ,tso->par.sparkname
+ ,tso->par.startedat
+ ,tso->par.exported?'T':'F'
+ ,tso->par.basicblocks
+ ,tso->par.allocs
+ ,tso->par.exectime
+ ,tso->par.blocktime
+ ,tso->par.blockcount
+ ,tso->par.fetchtime
+ ,tso->par.fetchcount
+ ,tso->par.localsparks
+ ,tso->par.globalsparks
+#endif
+ );
+}
+
+#if 0
+/*
+ ToDo: fix binary output of log files, and support new log file format.
+*/
+/*
+ Output a terminate event and an 8-byte time.
+*/
+
+//@cindex grterminate
+void
+grterminate(v)
+rtsTime v;
+{
+ if (!BINARY_STATS)
+ barf("grterminate: binary statistics not enabled\n");
+
+# if defined(GRAN)
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+# endif
+
+ DumpGranEvent(GR_TERMINATE, stgCast(StgTSO*,&stg_END_TSO_QUEUE_closure));
+
+ if (sizeof(rtsTime) == 4) {
+ putc('\0', gr_file);
+ putc('\0', gr_file);
+ putc('\0', gr_file);
+ putc('\0', gr_file);
+ } else {
+ putc(v >> 56l, gr_file);
+ putc((v >> 48l) & 0xffl, gr_file);
+ putc((v >> 40l) & 0xffl, gr_file);
+ putc((v >> 32l) & 0xffl, gr_file);
+ }
+ putc((v >> 24l) & 0xffl, gr_file);
+ putc((v >> 16l) & 0xffl, gr_file);
+ putc((v >> 8l) & 0xffl, gr_file);
+ putc(v & 0xffl, gr_file);
+}
+
+/*
+ Length-coded output: first 3 bits contain length coding
+
+ 00x 1 byte
+ 01x 2 bytes
+ 10x 4 bytes
+ 110 8 bytes
+ 111 5 or 9 bytes
+*/
+
+//@cindex grputw
+void
+grputw(v)
+rtsTime v;
+{
+ if (!BINARY_STATS)
+ barf("grputw: binary statistics not enabled\n");
+
+# if defined(GRAN)
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+# endif
+
+ if (v <= 0x3fl) { /* length v = 1 byte */
+ fputc(v & 0x3f, gr_file);
+ } else if (v <= 0x3fffl) { /* length v = 2 byte */
+ fputc((v >> 8l) | 0x40l, gr_file);
+ fputc(v & 0xffl, gr_file);
+ } else if (v <= 0x3fffffffl) { /* length v = 4 byte */
+ fputc((v >> 24l) | 0x80l, gr_file);
+ fputc((v >> 16l) & 0xffl, gr_file);
+ fputc((v >> 8l) & 0xffl, gr_file);
+ fputc(v & 0xffl, gr_file);
+ } else if (sizeof(TIME) == 4) {
+ fputc(0x70, gr_file);
+ fputc((v >> 24l) & 0xffl, gr_file);
+ fputc((v >> 16l) & 0xffl, gr_file);
+ fputc((v >> 8l) & 0xffl, gr_file);
+ fputc(v & 0xffl, gr_file);
+ } else {
+ if (v <= 0x3fffffffffffffl)
+ putc((v >> 56l) | 0x60l, gr_file);
+ else {
+ putc(0x70, gr_file);
+ putc((v >> 56l) & 0xffl, gr_file);
+ }
+
+ putc((v >> 48l) & 0xffl, gr_file);
+ putc((v >> 40l) & 0xffl, gr_file);
+ putc((v >> 32l) & 0xffl, gr_file);
+ putc((v >> 24l) & 0xffl, gr_file);
+ putc((v >> 16l) & 0xffl, gr_file);
+ putc((v >> 8l) & 0xffl, gr_file);
+ putc(v & 0xffl, gr_file);
+ }
+}
+#endif /* 0 */
+
+/*
+ extracting specific info out of a closure; used in packing (GranSim, GUM)
+*/
+//@cindex get_closure_info
+StgInfoTable*
+get_closure_info(StgClosure* node, nat *size, nat *ptrs, nat *nonptrs,
+ nat *vhs, char *info_hdr_ty)
+{
+ StgInfoTable *info;
+
+ ASSERT(LOOKS_LIKE_COOL_CLOSURE(node));
+ info = get_itbl(node);
+ /* the switch shouldn't be necessary, really; just use default case */
+ switch (info->type) {
+ case RBH:
+ {
+ StgInfoTable *rip = REVERT_INFOPTR(info); // closure to revert to
+ *size = sizeW_fromITBL(rip);
+ *ptrs = (nat) (rip->layout.payload.ptrs);
+ *nonptrs = (nat) (rip->layout.payload.nptrs);
+ *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+ info_hdr_type(node, info_hdr_ty);
+#else
+ strcpy(info_hdr_ty, "RBH");
+#endif
+ return rip; // NB: we return the reverted info ptr for a RBH!!!!!!
+ }
+
+#if defined(PAR)
+ /* Closures specific to GUM */
+ case FETCH_ME:
+ *size = sizeofW(StgFetchMe);
+ *ptrs = (nat)0;
+ *nonptrs = (nat)0;
+ *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+ info_hdr_type(node, info_hdr_ty);
+#else
+ strcpy(info_hdr_ty, "FETCH_ME");
+#endif
+ return info;
+
+#ifdef DIST
+ case REMOTE_REF: //same as for FETCH_ME...
+ *size = sizeofW(StgFetchMe);
+ *ptrs = (nat)0;
+ *nonptrs = (nat)0;
+ *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+ info_hdr_type(node, info_hdr_ty);
+#else
+ strcpy(info_hdr_ty, "REMOTE_REF");
+#endif
+ return info;
+#endif /* DIST */
+
+ case FETCH_ME_BQ:
+ *size = sizeofW(StgFetchMeBlockingQueue);
+ *ptrs = (nat)0;
+ *nonptrs = (nat)0;
+ *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+ info_hdr_type(node, info_hdr_ty);
+#else
+ strcpy(info_hdr_ty, "FETCH_ME_BQ");
+#endif
+ return info;
+
+ case BLOCKED_FETCH:
+ *size = sizeofW(StgBlockedFetch);
+ *ptrs = (nat)0;
+ *nonptrs = (nat)0;
+ *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+ info_hdr_type(node, info_hdr_ty);
+#else
+ strcpy(info_hdr_ty, "BLOCKED_FETCH");
+#endif
+ return info;
+#endif /* PAR */
+
+ /* these magic constants are outrageous!! why does the ITBL lie about it? */
+ case THUNK_SELECTOR:
+ *size = THUNK_SELECTOR_sizeW();
+ *ptrs = 1;
+ *nonptrs = MIN_UPD_SIZE-*ptrs; // weird
+ *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+ return info;
+
+ case ARR_WORDS:
+ /* ToDo: check whether this can be merged with the default case */
+ *size = arr_words_sizeW((StgArrWords *)node);
+ *ptrs = 0;
+ *nonptrs = ((StgArrWords *)node)->words;
+ *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+ return info;
+
+ case PAP:
+ /* ToDo: check whether this can be merged with the default case */
+ *size = pap_sizeW((StgPAP *)node);
+ *ptrs = 0;
+ *nonptrs = 0;
+ *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+ return info;
+
+ case AP_UPD:
+ /* ToDo: check whether this can be merged with the default case */
+ *size = AP_sizeW(((StgAP_UPD *)node)->n_args);
+ *ptrs = 0;
+ *nonptrs = 0;
+ *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+ return info;
+
+ default:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) (info->layout.payload.ptrs);
+ *nonptrs = (nat) (info->layout.payload.nptrs);
+ *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+ info_hdr_type(node, info_hdr_ty);
+#else
+ strcpy(info_hdr_ty, "UNKNOWN");
+#endif
+ return info;
+ }
+}
+
+//@cindex IS_BLACK_HOLE
+rtsBool
+IS_BLACK_HOLE(StgClosure* node)
+{
+ // StgInfoTable *info;
+ ASSERT(LOOKS_LIKE_COOL_CLOSURE(node));
+ switch (get_itbl(node)->type) {
+ case BLACKHOLE:
+ case BLACKHOLE_BQ:
+ case RBH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ return rtsTrue;
+ default:
+ return rtsFalse;
+ }
+//return ((info->type == BLACKHOLE || info->type == RBH) ? rtsTrue : rtsFalse);
+}
+
+//@cindex IS_INDIRECTION
+StgClosure *
+IS_INDIRECTION(StgClosure* node)
+{
+ StgInfoTable *info;
+ ASSERT(LOOKS_LIKE_COOL_CLOSURE(node));
+ info = get_itbl(node);
+ switch (info->type) {
+ case IND:
+ case IND_OLDGEN:
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_STATIC:
+ /* relies on indirectee being at same place for all these closure types */
+ return (((StgInd*)node) -> indirectee);
+#if 0
+ case EVACUATED: // counting as ind to use in GC routines, too
+ // could use the same code as above (evacuee is at same pos as indirectee)
+ return (((StgEvacuated *)node) -> evacuee);
+#endif
+ default:
+ return NULL;
+ }
+}
+
+//@cindex unwindInd
+StgClosure *
+UNWIND_IND (StgClosure *closure)
+{
+ StgClosure *next;
+
+ while ((next = IS_INDIRECTION((StgClosure *)closure)) != NULL)
+ closure = next;
+
+ ASSERT(next==(StgClosure *)NULL);
+ ASSERT(LOOKS_LIKE_COOL_CLOSURE(closure));
+ return closure;
+}
+
+#endif /* GRAN || PAR whole file */
diff --git a/rts/parallel/ParallelDebug.c b/rts/parallel/ParallelDebug.c
new file mode 100644
index 0000000000..b357af6379
--- /dev/null
+++ b/rts/parallel/ParallelDebug.c
@@ -0,0 +1,1955 @@
+/*
+ Time-stamp: <Sun Mar 18 2001 19:32:56 Stardate: [-30]6349.07 hwloidl>
+
+ Various debugging routines for GranSim and GUM
+*/
+
+#if defined(DEBUG) && (defined(GRAN) || defined(PAR)) /* whole file */
+
+//@node Debugging routines for GranSim and GUM, , ,
+//@section Debugging routines for GranSim and GUM
+
+//@menu
+//* Includes::
+//* Constants and Variables::
+//* Closures::
+//* Threads::
+//* Events::
+//* Sparks::
+//* Processors::
+//* Shortcuts::
+//* Printing info type::
+//* Printing Packet Contents::
+//* End of File::
+//@end menu
+//*/
+
+//@node Includes, Prototypes, Debugging routines for GranSim and GUM, Debugging routines for GranSim and GUM
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+#include "StgMiscClosures.h"
+#include "Printer.h"
+# if defined(DEBUG)
+# include "Hash.h"
+# include "Storage.h"
+# include "ParallelDebug.h"
+# endif
+
+//@node Prototypes, Constants and Variables, Includes, Debugging routines for GranSim and GUM
+//@subsection Prototypes
+/*
+rtsBool isOffset(globalAddr *ga);
+rtsBool isFixed(globalAddr *ga);
+*/
+//@node Constants and Variables, Closures, Prototypes, Debugging routines for GranSim and GUM
+//@subsection Constants and Variables
+
+static HashTable *tmpClosureTable; // used in GraphFingerPrint and PrintGraph
+
+#if defined(PAR)
+static char finger_print_char[] = {
+ '/', /* INVALID_OBJECT 0 */
+ 'C', /* CONSTR 1 */
+ 'C', /* CONSTR_1_0 2 */
+ 'C', /* CONSTR_0_1 3 */
+ 'C', /* CONSTR_2_0 4 */
+ 'C', /* CONSTR_1_1 5 */
+ 'C', /* CONSTR_0_2 6 */
+ 'I', /* CONSTR_INTLIKE 7 */
+ 'I', /* CONSTR_CHARLIKE 8 */
+ 'S', /* CONSTR_STATIC 9 */
+ 'S', /* CONSTR_NOCAF_STATIC 10 */
+ 'F', /* FUN 11 */
+ 'F', /* FUN_1_0 12 */
+ 'F', /* FUN_0_1 13 */
+ 'F', /* FUN_2_0 14 */
+ 'F', /* FUN_1_1 15 */
+ 'F', /* FUN_0_2 16 */
+ 'S', /* FUN_STATIC 17 */
+ 'T', /* THUNK 18 */
+ 'T', /* THUNK_1_0 19 */
+ 'T', /* THUNK_0_1 20 */
+ 'T', /* THUNK_2_0 21 */
+ 'T', /* THUNK_1_1 22 */
+ 'T', /* THUNK_0_2 23 */
+ 'S', /* THUNK_STATIC 24 */
+ 'E', /* THUNK_SELECTOR 25 */
+ 'b', /* BCO 26 */
+ 'p', /* AP_UPD 27 */
+ 'p', /* PAP 28 */
+ '_', /* IND 29 */
+ '_', /* IND_OLDGEN 30 */
+ '_', /* IND_PERM 31 */
+ '_', /* IND_OLDGEN_PERM 32 */
+ '_', /* IND_STATIC 33 */
+ '?', /* ***unused*** 34 */
+ '?', /* ***unused*** 35 */
+ '^', /* RET_BCO 36 */
+ '^', /* RET_SMALL 37 */
+ '^', /* RET_VEC_SMALL 38 */
+ '^', /* RET_BIG 39 */
+ '^', /* RET_VEC_BIG 40 */
+ '^', /* RET_DYN 41 */
+ '~', /* UPDATE_FRAME 42 */
+ '~', /* CATCH_FRAME 43 */
+ '~', /* STOP_FRAME 44 */
+ '~', /* SEQ_FRAME 45 */
+ 'o', /* CAF_BLACKHOLE 46 */
+ 'o', /* BLACKHOLE 47 */
+ 'o', /* BLACKHOLE_BQ 48 */
+ 'o', /* SE_BLACKHOLE 49 */
+ 'o', /* SE_CAF_BLACKHOLE 50 */
+ 'm', /* MVAR 51 */
+ 'a', /* ARR_WORDS 52 */
+ 'a', /* MUT_ARR_PTRS 53 */
+ 'a', /* MUT_ARR_PTRS_FROZEN 54 */
+ 'q', /* MUT_VAR 55 */
+ 'w', /* WEAK 56 */
+ 'f', /* FOREIGN 57 */
+ 's', /* STABLE_NAME 58 */
+ '@', /* TSO 59 */
+ '#', /* BLOCKED_FETCH 60 */
+ '>', /* FETCH_ME 61 */
+ '>', /* FETCH_ME_BQ 62 */
+ '$', /* RBH 63 */
+ 'v', /* EVACUATED 64 */
+ '>' /* REMOTE_REF 65 */
+ /* ASSERT(there are N_CLOSURE_TYPES (==66) in this array) */
+};
+#endif /* PAR */
+
+#if defined(GRAN) && defined(GRAN_CHECK)
+//@node Closures, Threads, Constants and Variables, Debugging routines for GranSim and GUM
+//@subsection Closures
+
+void
+G_PRINT_NODE(node)
+StgClosure* node;
+{
+ StgInfoTable *info_ptr;
+ StgTSO* bqe;
+ nat size = 0, ptrs = 0, nonptrs = 0, i, vhs = 0;
+ char info_hdr_ty[80], info_ty[80];
+
+ if (node==NULL) {
+ fprintf(stderr,"NULL\n");
+ return;
+ } else if (node==END_TSO_QUEUE) {
+ fprintf(stderr,"END_TSO_QUEUE\n");
+ return;
+ }
+ /* size_and_ptrs(node,&size,&ptrs); */
+ info_ptr = get_closure_info(node, &size, &ptrs, &nonptrs, &vhs, info_hdr_ty);
+
+ /* vhs = var_hdr_size(node); */
+ display_info_type(info_ptr,info_ty);
+
+ fprintf(stderr,"Node: 0x%lx", node);
+
+#if defined(PAR)
+ fprintf(stderr," [GA: 0x%lx]",GA(node));
+#endif
+
+#if defined(USE_COST_CENTRES)
+ fprintf(stderr," [CC: 0x%lx]",CC_HDR(node));
+#endif
+
+#if defined(GRAN)
+ fprintf(stderr," [Bitmask: 0%lo]",PROCS(node));
+#endif
+
+ if (info_ptr->type==TSO)
+ fprintf(stderr," TSO: 0x%lx (%x) IP: 0x%lx (%s), type %s \n ",
+ (StgTSO*)node, ((StgTSO*)node)->id, info_ptr, info_hdr_ty, info_ty);
+ else
+ fprintf(stderr," IP: 0x%lx (%s), type %s \n VHS: %d, size: %ld, ptrs:%ld, nonptrs: %ld\n ",
+ info_ptr,info_hdr_ty,info_ty,vhs,size,ptrs,nonptrs);
+
+ /* For now, we ignore the variable header */
+
+ fprintf(stderr," Ptrs: ");
+ for(i=0; i < ptrs; ++i)
+ {
+ if ( (i+1) % 6 == 0)
+ fprintf(stderr,"\n ");
+ fprintf(stderr," 0x%lx[P]",node->payload[i]);
+ };
+
+ fprintf(stderr," Data: ");
+ for(i=0; i < nonptrs; ++i)
+ {
+ if( (i+1) % 6 == 0)
+ fprintf(stderr,"\n ");
+ fprintf(stderr," %lu[D]",node->payload[ptrs+i]);
+ }
+ fprintf(stderr, "\n");
+
+
+ switch (info_ptr->type)
+ {
+ case TSO:
+ fprintf(stderr,"\n TSO_LINK: %#lx",
+ ((StgTSO*)node)->link);
+ break;
+
+ case BLACKHOLE:
+ case RBH:
+ bqe = ((StgBlockingQueue*)node)->blocking_queue;
+ fprintf(stderr," BQ of %#lx: ", node);
+ G_PRINT_BQ(bqe);
+ break;
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ printf("Panic: found FETCH_ME or FETCH_ME_BQ Infotable in GrAnSim system.\n");
+ break;
+ default:
+ /* do nothing */
+ }
+}
+
+void
+G_PPN(node) /* Extracted from PrintPacket in Pack.lc */
+StgClosure* node;
+{
+ StgInfoTable *info ;
+ nat size = 0, ptrs = 0, nonptrs = 0, i, vhs = 0, locn = 0;
+ char info_type[80];
+
+ /* size_and_ptrs(node,&size,&ptrs); */
+ info = get_closure_info(node, &size, &ptrs, &nonptrs, &vhs, info_type);
+
+ if (info->type == FETCH_ME || info->type == FETCH_ME_BQ ||
+ info->type == BLACKHOLE || info->type == RBH )
+ size = ptrs = nonptrs = vhs = 0;
+
+ if (closure_THUNK(node)) {
+ if (!closure_UNPOINTED(node))
+ fputs("SHARED ", stderr);
+ else
+ fputs("UNSHARED ", stderr);
+ }
+ if (info->type==BLACKHOLE) {
+ fputs("BLACK HOLE\n", stderr);
+ } else {
+ /* Fixed header */
+ fprintf(stderr, "(%s) FH [%#lx", info_type, node[locn++]);
+ for (i = 1; i < _HS; i++)
+ fprintf(stderr, " %#lx", node[locn++]);
+
+ /* Variable header */
+ if (vhs > 0) {
+ fprintf(stderr, "] VH [%#lx", node->payload[0]);
+
+ for (i = 1; i < vhs; i++)
+ fprintf(stderr, " %#lx", node->payload[i]);
+ }
+
+ fprintf(stderr, "] PTRS %u", ptrs);
+
+ /* Non-pointers */
+ if (nonptrs > 0) {
+ fprintf(stderr, " NPTRS [%#lx", node->payload[ptrs]);
+
+ for (i = 1; i < nonptrs; i++)
+ fprintf(stderr, " %#lx", node->payload[ptrs+i]);
+
+ putc(']', stderr);
+ }
+ putc('\n', stderr);
+ }
+
+}
+
+#if 0
+// ToDo: fix this!! -- HWL
+void
+G_INFO_TABLE(node)
+StgClosure *node;
+{
+ StgInfoTable *info_ptr;
+ nat size = 0, ptrs = 0, nonptrs = 0, vhs = 0;
+ char info_type[80], hdr_type[80];
+
+ info_hdr_type(info_ptr, hdr_type);
+
+ // get_itbl(node);
+ info_ptr = get_closure_info(node, &size, &ptrs, &nonptrs, &vhs, info_type);
+ fprintf(stderr,"%s Info Ptr @0x%lx; Entry: 0x%lx; Size: %lu; Ptrs: %lu\n\n",
+ info_type,info_ptr,(W_) ENTRY_CODE(info_ptr),
+ size, ptrs);
+ // INFO_SIZE(info_ptr),INFO_NoPTRS(info_ptr));
+
+ if (closure_THUNK(node) && !closure_UNPOINTED(node) ) {
+ fprintf(stderr," RBH InfoPtr: %#lx\n",
+ RBH_INFOPTR(info_ptr));
+ }
+
+#if defined(PAR)
+ fprintf(stderr,"Enter Flush Entry: 0x%lx;\tExit Flush Entry: 0x%lx\n",INFO_FLUSHENT(info_ptr),INFO_FLUSH(info_ptr));
+#endif
+
+#if defined(USE_COST_CENTRES)
+ fprintf(stderr,"Cost Centre (?): 0x%lx\n",INFO_CAT(info_ptr));
+#endif
+
+#if defined(_INFO_COPYING)
+ fprintf(stderr,"Evacuate Entry: 0x%lx;\tScavenge Entry: 0x%lx\n",
+ INFO_EVAC_2S(info_ptr),INFO_SCAV_2S(info_ptr));
+#endif
+
+#if defined(_INFO_COMPACTING)
+ fprintf(stderr,"Scan Link: 0x%lx;\tScan Move: 0x%lx\n",
+ (W_) INFO_SCAN_LINK_1S(info_ptr), (W_) INFO_SCAN_MOVE_1S(info_ptr));
+ fprintf(stderr,"Mark: 0x%lx;\tMarked: 0x%lx;\t",
+ (W_) INFO_MARK_1S(info_ptr), (W_) INFO_MARKED_1S(info_ptr));
+#if 0 /* avoid INFO_TYPE */
+ if(BASE_INFO_TYPE(info_ptr)==INFO_SPEC_TYPE)
+ fprintf(stderr,"plus specialised code\n");
+ else
+ fprintf(stderr,"Marking: 0x%lx\n",(W_) INFO_MARKING_1S(info_ptr));
+#endif /* 0 */
+#endif /* _INFO_COMPACTING */
+}
+#endif /* 0 */
+
+//@cindex G_PRINT_BQ
+void
+G_PRINT_BQ(node)
+StgClosure* node;
+{
+ StgInfoTable *info;
+ StgTSO *tso, *last;
+ char str[80], str0[80];
+
+ fprintf(stderr,"\n[PE %d] @ %lu BQ: ",
+ CurrentProc,CurrentTime[CurrentProc]);
+ if ( node == (StgClosure*)NULL ) {
+ fprintf(stderr," NULL.\n");
+ return;
+ }
+ if ( node == END_TSO_QUEUE ) {
+ fprintf(stderr," _|_\n");
+ return;
+ }
+ tso = ((StgBlockingQueue*)node)->blocking_queue;
+ while (node != END_TSO_QUEUE) {
+ PEs proc;
+
+ /* Find where the tso lives */
+ proc = where_is(node);
+ info = get_itbl(node);
+
+ switch (info->type) {
+ case TSO:
+ strcpy(str0,"TSO");
+ break;
+ case BLOCKED_FETCH:
+ strcpy(str0,"BLOCKED_FETCH");
+ break;
+ default:
+ strcpy(str0,"???");
+ break;
+ }
+
+ if(proc == CurrentProc)
+ fprintf(stderr," %#lx (%x) L %s,",
+ node, ((StgBlockingQueue*)node)->blocking_queue, str0);
+ else
+ fprintf(stderr," %#lx (%x) G (PE %d) %s,",
+ node, ((StgBlockingQueue*)node)->blocking_queue, proc, str0);
+
+ last = tso;
+ tso = last->link;
+ }
+ if ( tso == END_TSO_QUEUE )
+ fprintf(stderr," _|_\n");
+}
+
+//@node Threads, Events, Closures, Debugging routines for GranSim and GUM
+//@subsection Threads
+
+void
+G_CURR_THREADQ(verbose)
+StgInt verbose;
+{
+ fprintf(stderr,"Thread Queue on proc %d: ", CurrentProc);
+ G_THREADQ(run_queue_hd, verbose);
+}
+
+void
+G_THREADQ(closure, verbose)
+StgTSO* closure;
+StgInt verbose;
+{
+ StgTSO* x;
+
+ fprintf(stderr,"Thread Queue: ");
+ for (x=closure; x!=END_TSO_QUEUE; x=x->link)
+ if (verbose)
+ G_TSO(x,0);
+ else
+ fprintf(stderr," %#lx",x);
+
+ if (closure==END_TSO_QUEUE)
+ fprintf(stderr,"NIL\n");
+ else
+ fprintf(stderr,"\n");
+}
+
+void
+G_TSO(closure,verbose)
+StgTSO* closure;
+StgInt verbose;
+{
+
+ if (closure==END_TSO_QUEUE) {
+ fprintf(stderr,"TSO at %#lx is END_TSO_QUEUE!\n");
+ return;
+ }
+
+ if ( verbose & 0x08 ) { /* short info */
+ fprintf(stderr,"[TSO @ %#lx, PE %d]: Id: %#lx, Link: %#lx\n",
+ closure,where_is(closure),
+ closure->id,closure->link);
+ return;
+ }
+
+ fprintf(stderr,"TSO at %#lx has the following contents:\n",
+ closure);
+
+ fprintf(stderr,"> Id: \t%#lx",closure->id);
+ // fprintf(stderr,"\tstate: \t%#lx",closure->state);
+ fprintf(stderr,"\twhat_next: \t%#lx",closure->what_next);
+ fprintf(stderr,"\tlink: \t%#lx\n",closure->link);
+ fprintf(stderr,"\twhy_blocked: \t%d", closure->why_blocked);
+ fprintf(stderr,"\tblock_info: \t%p\n", closure->block_info);
+ // fprintf(stderr,"\tType: \t%s\n",type_name[TSO_TYPE(closure)]);
+ fprintf(stderr,">PRI: \t%#lx", closure->gran.pri);
+ fprintf(stderr,"\tMAGIC: \t%#lx %s\n", closure->gran.magic,
+ (closure->gran.magic==TSO_MAGIC ? "it IS a TSO" : "THIS IS NO TSO!!"));
+ if ( verbose & 0x04 ) {
+ fprintf(stderr, "Stack: stack @ %#lx (stack_size: %u; max_stack_size: %u)\n",
+ closure->stack, closure->stack_size, closure->max_stack_size);
+ fprintf(stderr, " sp: %#lx, su: %#lx, splim: %#lx\n",
+ closure->sp, closure->su, closure->splim);
+ }
+ // fprintf(stderr,"\n");
+ if (verbose & 0x01) {
+ // fprintf(stderr,"} LOCKED: \t%#lx",closure->locked);
+ fprintf(stderr,"} SPARKNAME: \t%#lx\n", closure->gran.sparkname);
+ fprintf(stderr,"} STARTEDAT: \t%#lx", closure->gran.startedat);
+ fprintf(stderr,"\tEXPORTED: \t%#lx\n", closure->gran.exported);
+ fprintf(stderr,"} BASICBLOCKS: \t%#lx", closure->gran.basicblocks);
+ fprintf(stderr,"\tALLOCS: \t%#lx\n", closure->gran.allocs);
+ fprintf(stderr,"} EXECTIME: \t%#lx", closure->gran.exectime);
+ fprintf(stderr,"\tFETCHTIME: \t%#lx\n", closure->gran.fetchtime);
+ fprintf(stderr,"} FETCHCOUNT: \t%#lx", closure->gran.fetchcount);
+ fprintf(stderr,"\tBLOCKTIME: \t%#lx\n", closure->gran.blocktime);
+ fprintf(stderr,"} BLOCKCOUNT: \t%#lx", closure->gran.blockcount);
+ fprintf(stderr,"\tBLOCKEDAT: \t%#lx\n", closure->gran.blockedat);
+ fprintf(stderr,"} GLOBALSPARKS:\t%#lx", closure->gran.globalsparks);
+ fprintf(stderr,"\tLOCALSPARKS:\t%#lx\n", closure->gran.localsparks);
+ }
+ if ( verbose & 0x02 ) {
+ fprintf(stderr,"BQ that starts with this TSO: ");
+ G_PRINT_BQ(closure);
+ }
+}
+
+//@node Events, Sparks, Threads, Debugging routines for GranSim and GUM
+//@subsection Events
+
+void
+G_EVENT(event, verbose)
+rtsEventQ event;
+StgInt verbose;
+{
+ if (verbose) {
+ print_event(event);
+ }else{
+ fprintf(stderr," %#lx",event);
+ }
+}
+
+void
+G_EVENTQ(verbose)
+StgInt verbose;
+{
+ extern rtsEventQ EventHd;
+ rtsEventQ x;
+
+ fprintf(stderr,"RtsEventQ (hd @%#lx):\n",EventHd);
+ for (x=EventHd; x!=NULL; x=x->next) {
+ G_EVENT(x,verbose);
+ }
+ if (EventHd==NULL)
+ fprintf(stderr,"NIL\n");
+ else
+ fprintf(stderr,"\n");
+}
+
+void
+G_PE_EQ(pe,verbose)
+PEs pe;
+StgInt verbose;
+{
+ extern rtsEventQ EventHd;
+ rtsEventQ x;
+
+ fprintf(stderr,"RtsEventQ (hd @%#lx):\n",EventHd);
+ for (x=EventHd; x!=NULL; x=x->next) {
+ if (x->proc==pe)
+ G_EVENT(x,verbose);
+ }
+ if (EventHd==NULL)
+ fprintf(stderr,"NIL\n");
+ else
+ fprintf(stderr,"\n");
+}
+
+//@node Sparks, Processors, Events, Debugging routines for GranSim and GUM
+//@subsection Sparks
+
+void
+G_SPARK(spark, verbose)
+rtsSparkQ spark;
+StgInt verbose;
+{
+ if (spark==(rtsSpark*)NULL) {
+ belch("G_SPARK: NULL spark; aborting");
+ return;
+ }
+ if (verbose)
+ print_spark(spark);
+ else
+ fprintf(stderr," %#lx",spark);
+}
+
+void
+G_SPARKQ(spark,verbose)
+rtsSparkQ spark;
+StgInt verbose;
+{
+ rtsSparkQ x;
+
+ if (spark==(rtsSpark*)NULL) {
+ belch("G_SPARKQ: NULL spark; aborting");
+ return;
+ }
+
+ fprintf(stderr,"RtsSparkQ (hd @%#lx):\n",spark);
+ for (x=spark; x!=NULL; x=x->next) {
+ G_SPARK(x,verbose);
+ }
+ if (spark==NULL)
+ fprintf(stderr,"NIL\n");
+ else
+ fprintf(stderr,"\n");
+}
+
+void
+G_CURR_SPARKQ(verbose)
+StgInt verbose;
+{
+ G_SPARKQ(pending_sparks_hd,verbose);
+}
+
+//@node Processors, Shortcuts, Sparks, Debugging routines for GranSim and GUM
+//@subsection Processors
+
+void
+G_PROC(proc,verbose)
+StgInt proc;
+StgInt verbose;
+{
+ extern rtsEventQ EventHd;
+ extern char *proc_status_names[];
+
+ fprintf(stderr,"Status of proc %d at time %d (%#lx): %s (%s)\n",
+ proc,CurrentTime[proc],CurrentTime[proc],
+ (CurrentProc==proc)?"ACTIVE":"INACTIVE",
+ proc_status_names[procStatus[proc]]);
+ G_THREADQ(run_queue_hds[proc],verbose & 0x2);
+ if ( (CurrentProc==proc) )
+ G_TSO(CurrentTSO,1);
+
+ if (EventHd!=NULL)
+ fprintf(stderr,"Next event (%s) is on proc %d\n",
+ event_names[EventHd->evttype],EventHd->proc);
+
+ if (verbose & 0x1) {
+ fprintf(stderr,"\nREQUIRED sparks: ");
+ G_SPARKQ(pending_sparks_hds[proc],1);
+ fprintf(stderr,"\nADVISORY_sparks: ");
+ G_SPARKQ(pending_sparks_hds[proc],1);
+ }
+}
+
+//@node Shortcuts, Printing info type, Processors, Debugging routines for GranSim and GUM
+//@subsection Shortcuts
+
+/* Debug Processor */
+void
+GP(proc)
+StgInt proc;
+{ G_PROC(proc,1);
+}
+
+/* Debug Current Processor */
+void
+GCP(){ G_PROC(CurrentProc,2); }
+
+/* Debug TSO */
+void
+GT(StgPtr tso){
+ G_TSO(tso,1);
+}
+
+/* Debug CurrentTSO */
+void
+GCT(){
+ fprintf(stderr,"Current Proc: %d\n",CurrentProc);
+ G_TSO(CurrentTSO,1);
+}
+
+/* Shorthand for debugging event queue */
+void
+GEQ() { G_EVENTQ(1); }
+
+/* Shorthand for debugging thread queue of a processor */
+void
+GTQ(PEs p) { G_THREADQ(run_queue_hds[p],1); }
+
+/* Shorthand for debugging thread queue of current processor */
+void
+GCTQ() { G_THREADQ(run_queue_hds[CurrentProc],1); }
+
+/* Shorthand for debugging spark queue of a processor */
+void
+GSQ(PEs p) { G_SPARKQ(pending_sparks_hds[p],1); }
+
+/* Shorthand for debugging spark queue of current processor */
+void
+GCSQ() { G_CURR_SPARKQ(1); }
+
+/* Shorthand for printing a node */
+void
+GN(StgPtr node) { G_PRINT_NODE(node); }
+
+/* Shorthand for printing info table */
+#if 0
+// ToDo: fix -- HWL
+void
+GIT(StgPtr node) { G_INFO_TABLE(node); }
+#endif
+
+void
+printThreadQPtrs(void)
+{
+ PEs p;
+ for (p=0; p<RtsFlags.GranFlags.proc; p++) {
+ fprintf(stderr,", PE %d: (hd=%p,tl=%p)",
+ run_queue_hds[p], run_queue_tls[p]);
+ }
+}
+
+void
+printThreadQ(StgTSO *tso) { G_THREADQ(tso, 0); };
+
+void
+printSparkQ(rtsSpark *spark) { G_SPARKQ(spark, 0); };
+
+void
+printThreadQ_verbose(StgTSO *tso) { G_THREADQ(tso, 1); };
+
+void
+printSparkQ_verbose(rtsSpark *spark) { G_SPARKQ(spark, 1); };
+
+/* Shorthand for some of ADRs debugging functions */
+
+#endif /* GRAN && GRAN_CHECK*/
+
+#if 0
+void
+DEBUG_PRINT_NODE(node)
+StgPtr node;
+{
+ W_ info_ptr = INFO_PTR(node);
+ StgInt size = 0, ptrs = 0, i, vhs = 0;
+ char info_type[80];
+
+ info_hdr_type(info_ptr, info_type);
+
+ size_and_ptrs(node,&size,&ptrs);
+ vhs = var_hdr_size(node);
+
+ fprintf(stderr,"Node: 0x%lx", (W_) node);
+
+#if defined(PAR)
+ fprintf(stderr," [GA: 0x%lx]",GA(node));
+#endif
+
+#if defined(PROFILING)
+ fprintf(stderr," [CC: 0x%lx]",CC_HDR(node));
+#endif
+
+#if defined(GRAN)
+ fprintf(stderr," [Bitmask: 0%lo]",PROCS(node));
+#endif
+
+ fprintf(stderr," IP: 0x%lx (%s), size %ld, %ld ptrs\n",
+ info_ptr,info_type,size,ptrs);
+
+ /* For now, we ignore the variable header */
+
+ for(i=0; i < size; ++i)
+ {
+ if(i == 0)
+ fprintf(stderr,"Data: ");
+
+ else if(i % 6 == 0)
+ fprintf(stderr,"\n ");
+
+ if(i < ptrs)
+ fprintf(stderr," 0x%lx[P]",*(node+_HS+vhs+i));
+ else
+ fprintf(stderr," %lu[D]",*(node+_HS+vhs+i));
+ }
+ fprintf(stderr, "\n");
+}
+
+
+#define INFO_MASK 0x80000000
+
+void
+DEBUG_TREE(node)
+StgPtr node;
+{
+ W_ size = 0, ptrs = 0, i, vhs = 0;
+
+ /* Don't print cycles */
+ if((INFO_PTR(node) & INFO_MASK) != 0)
+ return;
+
+ size_and_ptrs(node,&size,&ptrs);
+ vhs = var_hdr_size(node);
+
+ DEBUG_PRINT_NODE(node);
+ fprintf(stderr, "\n");
+
+ /* Mark the node -- may be dangerous */
+ INFO_PTR(node) |= INFO_MASK;
+
+ for(i = 0; i < ptrs; ++i)
+ DEBUG_TREE((StgPtr)node[i+vhs+_HS]);
+
+ /* Unmark the node */
+ INFO_PTR(node) &= ~INFO_MASK;
+}
+
+
+void
+DEBUG_INFO_TABLE(node)
+StgPtr node;
+{
+ W_ info_ptr = INFO_PTR(node);
+ char *iStgPtrtype = info_hdr_type(info_ptr);
+
+ fprintf(stderr,"%s Info Ptr @0x%lx; Entry: 0x%lx; Size: %lu; Ptrs: %lu\n\n",
+ iStgPtrtype,info_ptr,(W_) ENTRY_CODE(info_ptr),INFO_SIZE(info_ptr),INFO_NoPTRS(info_ptr));
+#if defined(PAR)
+ fprintf(stderr,"Enter Flush Entry: 0x%lx;\tExit Flush Entry: 0x%lx\n",INFO_FLUSHENT(info_ptr),INFO_FLUSH(info_ptr));
+#endif
+
+#if defined(PROFILING)
+ fprintf(stderr,"Cost Centre (?): 0x%lx\n",INFO_CAT(info_ptr));
+#endif
+
+#if defined(_INFO_COPYING)
+ fprintf(stderr,"Evacuate Entry: 0x%lx;\tScavenge Entry: 0x%lx\n",
+ INFO_EVAC_2S(info_ptr),INFO_SCAV_2S(info_ptr));
+#endif
+
+#if defined(_INFO_COMPACTING)
+ fprintf(stderr,"Scan Link: 0x%lx;\tScan Move: 0x%lx\n",
+ (W_) INFO_SCAN_LINK_1S(info_ptr), (W_) INFO_SCAN_MOVE_1S(info_ptr));
+ fprintf(stderr,"Mark: 0x%lx;\tMarked: 0x%lx;\t",
+ (W_) INFO_MARK_1S(info_ptr), (W_) INFO_MARKED_1S(info_ptr));
+#if 0 /* avoid INFO_TYPE */
+ if(BASE_INFO_TYPE(info_ptr)==INFO_SPEC_TYPE)
+ fprintf(stderr,"plus specialised code\n");
+ else
+ fprintf(stderr,"Marking: 0x%lx\n",(W_) INFO_MARKING_1S(info_ptr));
+#endif /* 0 */
+#endif /* _INFO_COMPACTING */
+}
+#endif /* 0 */
+
+//@node Printing info type, Printing Packet Contents, Shortcuts, Debugging routines for GranSim and GUM
+//@subsection Printing info type
+
+char *
+display_info_type(closure, str)
+StgClosure *closure;
+char *str;
+{
+ strcpy(str,"");
+ if ( closure_HNF(closure) )
+ strcat(str,"|_HNF ");
+ else if ( closure_BITMAP(closure) )
+ strcat(str,"|_BTM");
+ else if ( !closure_SHOULD_SPARK(closure) )
+ strcat(str,"|_NS");
+ else if ( closure_STATIC(closure) )
+ strcat(str,"|_STA");
+ else if ( closure_THUNK(closure) )
+ strcat(str,"|_THU");
+ else if ( closure_MUTABLE(closure) )
+ strcat(str,"|_MUT");
+ else if ( closure_UNPOINTED(closure) )
+ strcat(str,"|_UPT");
+ else if ( closure_SRT(closure) )
+ strcat(str,"|_SRT");
+
+ return(str);
+}
+
+/*
+ PrintPacket is in Pack.c because it makes use of closure queues
+*/
+
+#if defined(GRAN) || defined(PAR)
+
+/*
+ Print graph rooted at q. The structure of this recursive printing routine
+ should be the same as in the graph traversals when packing a graph in
+ GUM. Thus, it demonstrates the structure of such a generic graph
+ traversal, and in particular, how to extract pointer and non-pointer info
+ from the multitude of different heap objects available.
+
+ {evacuate}Daq ngoqvam nIHlu'pu'!!
+*/
+
+void
+PrintGraph(StgClosure *p, int indent_level)
+{
+ void PrintGraph_(StgClosure *p, int indent_level);
+
+ ASSERT(tmpClosureTable==NULL);
+
+ /* init hash table */
+ tmpClosureTable = allocHashTable();
+
+ /* now do the real work */
+ PrintGraph_(p, indent_level);
+
+ /* nuke hash table */
+ freeHashTable(tmpClosureTable, NULL);
+ tmpClosureTable = NULL;
+}
+
+/*
+  This is the actual worker function.
+ All recursive calls should be made to this function.
+*/
+void
+PrintGraph_(StgClosure *p, int indent_level)
+{
+ StgPtr x, q;
+ rtsBool printed = rtsFalse;
+ nat i, j;
+ const StgInfoTable *info;
+
+ /* check whether we have met this node already to break cycles */
+ if (lookupHashTable(tmpClosureTable, (StgWord)p)) { // ie. already touched
+ /* indentation */
+ for (j=0; j<indent_level; j++)
+ fputs(" ", stderr);
+
+ fprintf(stderr, "#### cylce to %p", p);
+ return;
+ }
+
+ /* record that we are processing this closure */
+ insertHashTable(tmpClosureTable, (StgWord) p, (void *)rtsTrue/*non-NULL*/);
+
+ q = p; /* save ptr to object */
+
+ /* indentation */
+ for (j=0; j<indent_level; j++)
+ fputs(" ", stderr);
+
+ ASSERT(p!=(StgClosure*)NULL);
+ ASSERT(LOOKS_LIKE_STATIC(p) ||
+ LOOKS_LIKE_GHC_INFO(GET_INFO((StgClosure *)p)) ||
+ IS_HUGS_CONSTR_INFO(GET_INFO((StgClosure *)p)));
+
+ printClosure(p); // prints contents of this one closure
+
+ /* indentation */
+ for (j=0; j<indent_level; j++)
+ fputs(" ", stderr);
+
+ info = get_itbl((StgClosure *)p);
+ /* the rest of this fct recursively traverses the graph */
+ switch (info -> type) {
+
+ case BCO:
+ {
+ StgBCO* bco = stgCast(StgBCO*,p);
+ nat i;
+ fprintf(stderr, "BCO (%p)\n", p);
+ /*
+ for (i = 0; i < bco->n_ptrs; i++) {
+ // bcoConstCPtr(bco,i) =
+ PrintGraph_(bcoConstCPtr(bco,i), indent_level+1);
+ }
+ */
+ // p += bco_sizeW(bco);
+ break;
+ }
+
+ case MVAR:
+ /* treat MVars specially, because we don't want to PrintGraph the
+ * mut_link field in the middle of the closure.
+ */
+ {
+ StgMVar *mvar = ((StgMVar *)p);
+ // evac_gen = 0;
+ fprintf(stderr, "MVAR (%p) with 3 pointers (head, tail, value)\n", p);
+ // (StgClosure *)mvar->head =
+ PrintGraph_((StgClosure *)mvar->head, indent_level+1);
+ // (StgClosure *)mvar->tail =
+ PrintGraph_((StgClosure *)mvar->tail, indent_level+1);
+ //(StgClosure *)mvar->value =
+ PrintGraph_((StgClosure *)mvar->value, indent_level+1);
+ // p += sizeofW(StgMVar);
+ // evac_gen = saved_evac_gen;
+ break;
+ }
+
+ case THUNK_2_0:
+ if (!printed) {
+ fprintf(stderr, "THUNK_2_0 (%p) with 2 pointers\n", p);
+ printed = rtsTrue;
+ }
+ case FUN_2_0:
+ if (!printed) {
+ fprintf(stderr, "FUN_2_0 (%p) with 2 pointers\n", p);
+ printed = rtsTrue;
+ }
+ // scavenge_srt(info);
+ case CONSTR_2_0:
+ if (!printed) {
+ fprintf(stderr, "CONSTR_2_0 (%p) with 2 pointers\n", p);
+ printed = rtsTrue;
+ }
+ // ((StgClosure *)p)->payload[0] =
+ PrintGraph_(((StgClosure *)p)->payload[0],
+ indent_level+1);
+ // ((StgClosure *)p)->payload[1] =
+ PrintGraph_(((StgClosure *)p)->payload[1],
+ indent_level+1);
+ // p += sizeofW(StgHeader) + 2;
+ break;
+
+ case THUNK_1_0:
+ // scavenge_srt(info);
+ fprintf(stderr, "THUNK_1_0 (%p) with 1 pointer\n", p);
+ // ((StgClosure *)p)->payload[0] =
+ PrintGraph_(((StgClosure *)p)->payload[0],
+ indent_level+1);
+ // p += sizeofW(StgHeader) + 2; /* MIN_UPD_SIZE */
+ break;
+
+ case FUN_1_0:
+ if (!printed) {
+ fprintf(stderr, "FUN_1_0 (%p) with 1 pointer\n", p);
+ printed = rtsTrue;
+ }
+ // scavenge_srt(info);
+ case CONSTR_1_0:
+ if (!printed) {
+ fprintf(stderr, "CONSTR_2_0 (%p) with 2 pointers\n", p);
+ printed = rtsTrue;
+ }
+ // ((StgClosure *)p)->payload[0] =
+ PrintGraph_(((StgClosure *)p)->payload[0],
+ indent_level+1);
+ // p += sizeofW(StgHeader) + 1;
+ break;
+
+ case THUNK_0_1:
+ fprintf(stderr, "THUNK_0_1 (%p) with 0 pointers\n", p);
+ // scavenge_srt(info);
+ // p += sizeofW(StgHeader) + 2; /* MIN_UPD_SIZE */
+ break;
+
+ case FUN_0_1:
+ fprintf(stderr, "FUN_0_1 (%p) with 0 pointers\n", p);
+ //scavenge_srt(info);
+ case CONSTR_0_1:
+ fprintf(stderr, "CONSTR_0_1 (%p) with 0 pointers\n", p);
+ //p += sizeofW(StgHeader) + 1;
+ break;
+
+ case THUNK_0_2:
+ if (!printed) {
+ fprintf(stderr, "THUNK_0_2 (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ case FUN_0_2:
+ if (!printed) {
+ fprintf(stderr, "FUN_0_2 (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ // scavenge_srt(info);
+ case CONSTR_0_2:
+ if (!printed) {
+ fprintf(stderr, "CONSTR_0_2 (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ // p += sizeofW(StgHeader) + 2;
+ break;
+
+ case THUNK_1_1:
+ if (!printed) {
+ fprintf(stderr, "THUNK_1_1 (%p) with 1 pointer\n", p);
+ printed = rtsTrue;
+ }
+ case FUN_1_1:
+ if (!printed) {
+ fprintf(stderr, "FUN_1_1 (%p) with 1 pointer\n", p);
+ printed = rtsTrue;
+ }
+ // scavenge_srt(info);
+ case CONSTR_1_1:
+ if (!printed) {
+ fprintf(stderr, "CONSTR_1_1 (%p) with 1 pointer\n", p);
+ printed = rtsTrue;
+ }
+ // ((StgClosure *)p)->payload[0] =
+ PrintGraph_(((StgClosure *)p)->payload[0],
+ indent_level+1);
+ // p += sizeofW(StgHeader) + 2;
+ break;
+
+ case FUN:
+ if (!printed) {
+ fprintf(stderr, "FUN (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ /* fall through */
+
+ case THUNK:
+ if (!printed) {
+ fprintf(stderr, "THUNK (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ // scavenge_srt(info);
+ /* fall through */
+
+ case CONSTR:
+ if (!printed) {
+ fprintf(stderr, "CONSTR (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ /* basically same as loop in STABLE_NAME case */
+ for (i=0; i<info->layout.payload.ptrs; i++)
+ PrintGraph_(((StgClosure *)p)->payload[i],
+ indent_level+1);
+ break;
+ /* NOT fall through */
+
+ case WEAK:
+ if (!printed) {
+ fprintf(stderr, "WEAK (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ /* fall through */
+
+ case FOREIGN:
+ if (!printed) {
+ fprintf(stderr, "FOREIGN (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ /* fall through */
+
+ case STABLE_NAME:
+ {
+ StgPtr end;
+
+ if (!printed) {
+ fprintf(stderr, "STABLE_NAME (%p) with %d pointers (not followed!)\n",
+ p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+ for (p = (StgPtr)((StgClosure *)p)->payload; p < end; p++) {
+ // (StgClosure *)*p =
+ //PrintGraph_((StgClosure *)*p, indent_level+1);
+ fprintf(stderr, ", %p", *p);
+ }
+ //fputs("\n", stderr);
+ // p += info->layout.payload.nptrs;
+ break;
+ }
+
+ case IND_PERM:
+ //if (step->gen->no != 0) {
+ // SET_INFO(((StgClosure *)p), &IND_OLDGEN_PERM_info);
+ //}
+ if (!printed) {
+ fprintf(stderr, "IND_PERM (%p) with indirection to\n",
+ p, ((StgIndOldGen *)p)->indirectee);
+ printed = rtsTrue;
+ }
+ /* fall through */
+
+ case IND_OLDGEN_PERM:
+ if (!printed) {
+ fprintf(stderr, "IND_OLDGEN_PERM (%p) with indirection to %p\n",
+ p, ((StgIndOldGen *)p)->indirectee);
+ printed = rtsTrue;
+ }
+ // ((StgIndOldGen *)p)->indirectee =
+ PrintGraph_(((StgIndOldGen *)p)->indirectee,
+ indent_level+1);
+ //if (failed_to_evac) {
+ // failed_to_evac = rtsFalse;
+ // recordOldToNewPtrs((StgMutClosure *)p);
+ //}
+ // p += sizeofW(StgIndOldGen);
+ break;
+
+ case MUT_VAR:
+ /* ignore MUT_CONSs */
+ fprintf(stderr, "MUT_VAR (%p) pointing to %p\n", p, ((StgMutVar *)p)->var);
+ if (((StgMutVar *)p)->header.info != &stg_MUT_CONS_info) {
+ //evac_gen = 0;
+ PrintGraph_(((StgMutVar *)p)->var, indent_level+1);
+ //evac_gen = saved_evac_gen;
+ }
+ //p += sizeofW(StgMutVar);
+ break;
+
+ case CAF_BLACKHOLE:
+ if (!printed) {
+ fprintf(stderr, "CAF_BLACKHOLE (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ case SE_CAF_BLACKHOLE:
+ if (!printed) {
+ fprintf(stderr, "SE_CAF_BLACKHOLE (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ case SE_BLACKHOLE:
+ if (!printed) {
+ fprintf(stderr, "SE_BLACKHOLE (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ case BLACKHOLE:
+ if (!printed) {
+ fprintf(stderr, "BLACKHOLE (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ //p += BLACKHOLE_sizeW();
+ break;
+
+ case BLACKHOLE_BQ:
+ {
+ StgBlockingQueue *bh = (StgBlockingQueue *)p;
+ // (StgClosure *)bh->blocking_queue =
+ fprintf(stderr, "BLACKHOLE_BQ (%p) pointing to %p\n",
+ p, (StgClosure *)bh->blocking_queue);
+ PrintGraph_((StgClosure *)bh->blocking_queue, indent_level+1);
+ //if (failed_to_evac) {
+ // failed_to_evac = rtsFalse;
+ // recordMutable((StgMutClosure *)bh);
+ //}
+ // p += BLACKHOLE_sizeW();
+ break;
+ }
+
+ case THUNK_SELECTOR:
+ {
+ StgSelector *s = (StgSelector *)p;
+ fprintf(stderr, "THUNK_SELECTOR (%p) pointing to %p\n",
+ p, s->selectee);
+ PrintGraph_(s->selectee, indent_level+1);
+ // p += THUNK_SELECTOR_sizeW();
+ break;
+ }
+
+ case IND:
+ fprintf(stderr, "IND (%p) pointing to %p\n", p, ((StgInd*)p)->indirectee);
+ PrintGraph_(((StgInd*)p)->indirectee, indent_level+1);
+ break;
+
+ case IND_OLDGEN:
+ fprintf(stderr, "IND_OLDGEN (%p) pointing to %p\n",
+ p, ((StgIndOldGen*)p)->indirectee);
+ PrintGraph_(((StgIndOldGen*)p)->indirectee, indent_level+1);
+ break;
+
+ case CONSTR_INTLIKE:
+ fprintf(stderr, "CONSTR_INTLIKE (%p) with 0 pointers\n", p);
+ break;
+ case CONSTR_CHARLIKE:
+ fprintf(stderr, "CONSTR_CHARLIKE (%p) with 0 pointers\n", p);
+ break;
+ case CONSTR_STATIC:
+ fprintf(stderr, "CONSTR_STATIC (%p) with 0 pointers\n", p);
+ break;
+ case CONSTR_NOCAF_STATIC:
+ fprintf(stderr, "CONSTR_NOCAF_STATIC (%p) with 0 pointers\n", p);
+ break;
+ case THUNK_STATIC:
+ fprintf(stderr, "THUNK_STATIC (%p) with 0 pointers\n", p);
+ break;
+ case FUN_STATIC:
+ fprintf(stderr, "FUN_STATIC (%p) with 0 pointers\n", p);
+ break;
+ case IND_STATIC:
+ fprintf(stderr, "IND_STATIC (%p) with 0 pointers\n", p);
+ break;
+
+ case RET_BCO:
+ fprintf(stderr, "RET_BCO (%p) with 0 pointers\n", p);
+ break;
+ case RET_SMALL:
+ fprintf(stderr, "RET_SMALL (%p) with 0 pointers\n", p);
+ break;
+ case RET_VEC_SMALL:
+ fprintf(stderr, "RET_VEC_SMALL (%p) with 0 pointers\n", p);
+ break;
+ case RET_BIG:
+ fprintf(stderr, "RET_BIG (%p) with 0 pointers\n", p);
+ break;
+ case RET_VEC_BIG:
+ fprintf(stderr, "RET_VEC_BIG (%p) with 0 pointers\n", p);
+ break;
+ case RET_DYN:
+ fprintf(stderr, "RET_DYN (%p) with 0 pointers\n", p);
+ break;
+ case UPDATE_FRAME:
+ fprintf(stderr, "UPDATE_FRAME (%p) with 0 pointers\n", p);
+ break;
+ case STOP_FRAME:
+ fprintf(stderr, "STOP_FRAME (%p) with 0 pointers\n", p);
+ break;
+ case CATCH_FRAME:
+ fprintf(stderr, "CATCH_FRAME (%p) with 0 pointers\n", p);
+ break;
+ case SEQ_FRAME:
+ fprintf(stderr, "SEQ_FRAME (%p) with 0 pointers\n", p);
+ break;
+
+ case AP_UPD: /* same as PAPs */
+ fprintf(stderr, "AP_UPD (%p) with 0 pointers\n", p);
+ case PAP:
+ /* Treat a PAP just like a section of stack, not forgetting to
+ * PrintGraph_ the function pointer too...
+ */
+ {
+ StgPAP* pap = stgCast(StgPAP*,p);
+
+ fprintf(stderr, "PAP (%p) pointing to %p\n", p, pap->fun);
+ // pap->fun =
+ //PrintGraph_(pap->fun, indent_level+1);
+ //scavenge_stack((P_)pap->payload, (P_)pap->payload + pap->n_args);
+ //p += pap_sizeW(pap);
+ break;
+ }
+
+ case ARR_WORDS:
+ /* an array of (non-mutable) words */
+ fprintf(stderr, "ARR_WORDS (%p) of %d non-ptrs (maybe a string?)\n",
+ p, ((StgArrWords *)q)->words);
+ break;
+
+ case MUT_ARR_PTRS:
+ /* follow everything */
+ {
+ StgPtr next;
+
+ fprintf(stderr, "MUT_ARR_PTRS (%p) with %d pointers (not followed)\n",
+ p, mut_arr_ptrs_sizeW((StgMutArrPtrs*)p));
+ // evac_gen = 0; /* repeatedly mutable */
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ // (StgClosure *)*p =
+ // PrintGraph_((StgClosure *)*p, indent_level+1);
+ fprintf(stderr, ", %p", *p);
+ }
+ fputs("\n", stderr);
+ //evac_gen = saved_evac_gen;
+ break;
+ }
+
+ case MUT_ARR_PTRS_FROZEN:
+ /* follow everything */
+ {
+ StgPtr start = p, next;
+
+ fprintf(stderr, "MUT_ARR_PTRS (%p) with %d pointers (not followed)",
+ p, mut_arr_ptrs_sizeW((StgMutArrPtrs*)p));
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ // (StgClosure *)*p =
+ // PrintGraph_((StgClosure *)*p, indent_level+1);
+ fprintf(stderr, ", %p", *p);
+ }
+ fputs("\n", stderr);
+ //if (failed_to_evac) {
+ /* we can do this easier... */
+ // recordMutable((StgMutClosure *)start);
+ // failed_to_evac = rtsFalse;
+ //}
+ break;
+ }
+
+ case TSO:
+ {
+ StgTSO *tso;
+
+ tso = (StgTSO *)p;
+ fprintf(stderr, "TSO (%p) with link field %p\n", p, (StgClosure *)tso->link);
+ // evac_gen = 0;
+ /* chase the link field for any TSOs on the same queue */
+ // (StgClosure *)tso->link =
+ PrintGraph_((StgClosure *)tso->link, indent_level+1);
+ //if (tso->blocked_on) {
+ // tso->blocked_on = PrintGraph_(tso->blocked_on);
+ //}
+ /* scavenge this thread's stack */
+ //scavenge_stack(tso->sp, &(tso->stack[tso->stack_size]));
+ //evac_gen = saved_evac_gen;
+ //p += tso_sizeW(tso);
+ break;
+ }
+
+#if defined(GRAN) || defined(PAR)
+ case RBH:
+ {
+ StgInfoTable *rip = REVERT_INFOPTR(get_itbl(p));
+ //if (LOOKS_LIKE_GHC_INFO(rip))
+ // fprintf(stderr, "RBH (%p) with 0 pointers (reverted type=%s)\n",
+ // p, info_type_by_ip(rip));
+ //else
+ fprintf(stderr, "RBH (%p) with 0 pointers (reverted IP=%x)\n",
+ p, rip);
+ }
+ break;
+#endif
+#if defined(PAR)
+ case BLOCKED_FETCH:
+ fprintf(stderr, "BLOCKED_FETCH (%p) with 0 pointers (link=%p)\n",
+ p, ((StgBlockedFetch *)p)->link);
+ break;
+ case FETCH_ME:
+ fprintf(stderr, "FETCH_ME (%p) with 0 pointers\n", p);
+ break;
+ case FETCH_ME_BQ:
+ fprintf(stderr, "FETCH_ME_BQ (%p) with 0 pointers (blocking_queue=%p)\n",
+ p, ((StgFetchMeBlockingQueue *)p)->blocking_queue);
+ break;
+#endif
+
+#ifdef DIST
+ case REMOTE_REF:
+ fprintf(stderr, "REMOTE_REF (%p) with 0 pointers\n", p);
+ break;
+#endif
+
+ case EVACUATED:
+ fprintf(stderr, "EVACUATED (%p) with 0 pointers (evacuee=%p)\n",
+ p, ((StgEvacuated *)p)->evacuee);
+ break;
+
+ default:
+ barf("PrintGraph_: unknown closure %d (%s)",
+ info -> type, info_type(info));
+ }
+
+ /* If we didn't manage to promote all the objects pointed to by
+ * the current object, then we have to designate this object as
+ * mutable (because it contains old-to-new generation pointers).
+ */
+ //if (failed_to_evac) {
+ // mkMutCons((StgClosure *)q, &generations[evac_gen]);
+ // failed_to_evac = rtsFalse;
+ //}
+}
+
+# if defined(PAR)
+/*
+ Generate a finger-print for a graph.
+ A finger-print is a string, with each char representing one node;
+ depth-first traversal
+*/
+
+/*
+  Compute a finger-print string for the graph rooted at p.
+  Sets up the file-global visited-table (tmpClosureTable) used for
+  cycle detection, delegates the traversal to GraphFingerPrint_, and
+  tears the table down again.  finger_print must be an empty,
+  caller-allocated buffer of MAX_FINGER_PRINT_LEN characters (see
+  ParallelDebug.h).
+*/
+void
+GraphFingerPrint(StgClosure *p, char *finger_print)
+{
+  /* local prototype of the recursive worker */
+  void GraphFingerPrint_(StgClosure *p, char *finger_print);
+
+  /* the visited-table must not be in use by another traversal */
+  ASSERT(tmpClosureTable==NULL);
+  ASSERT(strlen(finger_print)==0);
+
+  /* init hash table */
+  tmpClosureTable = allocHashTable();
+
+  /* now do the real work */
+  GraphFingerPrint_(p, finger_print);
+
+  /* nuke hash table */
+  freeHashTable(tmpClosureTable, NULL);
+  tmpClosureTable = NULL;
+}
+
+/*
+  This is the actual worker function.
+  All recursive calls should be made to this function, not to
+  GraphFingerPrint, so that the shared visited-table is reused.
+*/
+/*
+  Recursive worker for GraphFingerPrint: appends one character per
+  visited node to finger_print (the character is taken from
+  finger_print_char, indexed by closure type), depth-first, then
+  recurses into the closure's pointer fields.  Cycles are broken via
+  the file-global visited-table tmpClosureTable; a node met twice is
+  marked '#'.  The traversal stops silently once the buffer is nearly
+  full (len+7 >= MAX_FINGER_PRINT_LEN).
+*/
+void
+GraphFingerPrint_(StgClosure *p, char *finger_print)
+{
+  StgPtr x, q;   /* NOTE(review): x, printed and j are never used */
+  rtsBool printed = rtsFalse;
+  nat i, j, len;
+  const StgInfoTable *info;
+
+  q = p;			/* save ptr to object */
+  len = strlen(finger_print);
+  ASSERT(len<=MAX_FINGER_PRINT_LEN);
+  /* at most 7 chars for this node (I think) */
+  if (len+7>=MAX_FINGER_PRINT_LEN)
+    return;
+
+  /* check whether we have met this node already to break cycles */
+  if (lookupHashTable(tmpClosureTable, (StgWord)p)) { // ie. already touched
+    strcat(finger_print, "#");
+    return;
+  }
+
+  /* record that we are processing this closure */
+  insertHashTable(tmpClosureTable, (StgWord) p, (void *)rtsTrue/*non-NULL*/);
+
+  ASSERT(p!=(StgClosure*)NULL);
+  ASSERT(LOOKS_LIKE_STATIC(p) ||
+         LOOKS_LIKE_GHC_INFO(GET_INFO((StgClosure *)p)) ||
+         IS_HUGS_CONSTR_INFO(GET_INFO((StgClosure *)p)));
+
+  info = get_itbl((StgClosure *)p);
+  // append char for this node
+  finger_print[len] = finger_print_char[info->type]; finger_print[len+1] = '\0';
+  /* the rest of this fct recursively traverses the graph */
+  switch (info -> type) {
+
+  case BCO:
+    {
+      StgBCO* bco = stgCast(StgBCO*,p);
+      nat i;
+      //%% fprintf(stderr, "BCO (%p) with %d pointers\n", p, bco->n_ptrs);
+      /*
+      for (i = 0; i < bco->n_ptrs; i++) {
+	// bcoConstCPtr(bco,i) =
+	GraphFingerPrint_(bcoConstCPtr(bco,i), finger_print);
+      }
+      */
+      // p += bco_sizeW(bco);
+      break;
+    }
+
+  case MVAR:
+    break;
+
+  case THUNK_2_0:
+  case FUN_2_0:
+  case CONSTR_2_0:
+    /* two pointers: "22(" then both children, then ")" if it fits */
+    // append char for this node
+    strcat(finger_print, "22(");
+    GraphFingerPrint_(((StgClosure *)p)->payload[0], finger_print);
+    GraphFingerPrint_(((StgClosure *)p)->payload[1], finger_print);
+    if (strlen(finger_print)+2<MAX_FINGER_PRINT_LEN)
+      strcat(finger_print, ")");
+    break;
+
+  case THUNK_1_0:
+  case FUN_1_0:
+  case CONSTR_1_0:
+    // append char for this node
+    strcat(finger_print, "12(");
+    GraphFingerPrint_(((StgClosure *)p)->payload[0], finger_print);
+    if (strlen(finger_print)+2<MAX_FINGER_PRINT_LEN)
+      strcat(finger_print, ")");
+    break;
+
+  case THUNK_0_1:
+  case FUN_0_1:
+  case CONSTR_0_1:
+    // append char for this node
+    strcat(finger_print, "01");
+    break;
+
+  case THUNK_0_2:
+  case FUN_0_2:
+  case CONSTR_0_2:
+    // append char for this node
+    strcat(finger_print, "02");
+    break;
+
+  case THUNK_1_1:
+  case FUN_1_1:
+  case CONSTR_1_1:
+    // append char for this node
+    strcat(finger_print, "11(");
+    GraphFingerPrint_(((StgClosure *)p)->payload[0], finger_print);
+    if (strlen(finger_print)+2<MAX_FINGER_PRINT_LEN)
+      strcat(finger_print, ")");
+    break;
+
+  case FUN:
+  case THUNK:
+  case CONSTR:
+    /* basically same as loop in STABLE_NAME case */
+    {
+      char str[6];
+      /* NOTE(review): str[6] overflows if ptrs needs more than 2 digits
+	 ("%d?(" + NUL); consider snprintf into a larger buffer.  The
+	 strcat calls onto finger_print are also unbounded here. */
+      sprintf(str,"%d?(",info->layout.payload.ptrs);
+      strcat(finger_print,str);
+      for (i=0; i<info->layout.payload.ptrs; i++)
+	GraphFingerPrint_(((StgClosure *)p)->payload[i], finger_print);
+      if (strlen(finger_print)+2<MAX_FINGER_PRINT_LEN)
+	strcat(finger_print, ")");
+    }
+    break;
+
+  case WEAK:
+  case FOREIGN:
+  case STABLE_NAME:
+    {
+      StgPtr end;
+      char str[6];
+      sprintf(str,"%d?", info->layout.payload.ptrs);
+      strcat(finger_print,str);
+
+      //end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+      //for (p = (StgPtr)((StgClosure *)p)->payload; p < end; p++) {
+      //	GraphFingerPrint_((StgClosure *)*p, finger_print);
+      //}
+      break;
+    }
+
+  case IND_PERM:
+  case IND_OLDGEN_PERM:
+    /* follow the indirection */
+    GraphFingerPrint_(((StgIndOldGen *)p)->indirectee, finger_print);
+    break;
+
+  case MUT_VAR:
+    /* ignore MUT_CONSs */
+    if (((StgMutVar *)p)->header.info != &stg_MUT_CONS_info) {
+      GraphFingerPrint_(((StgMutVar *)p)->var, finger_print);
+    }
+    break;
+
+  case CAF_BLACKHOLE:
+  case SE_CAF_BLACKHOLE:
+  case SE_BLACKHOLE:
+  case BLACKHOLE:
+    break;
+
+  case BLACKHOLE_BQ:
+    {
+      StgBlockingQueue *bh = (StgBlockingQueue *)p;
+      // GraphFingerPrint_((StgClosure *)bh->blocking_queue, finger_print);
+      break;
+    }
+
+  case THUNK_SELECTOR:
+    {
+      StgSelector *s = (StgSelector *)p;
+      GraphFingerPrint_(s->selectee, finger_print);
+      break;
+    }
+
+  case IND:
+    GraphFingerPrint_(((StgInd*)p)->indirectee, finger_print);
+    break;
+
+  case IND_OLDGEN:
+    GraphFingerPrint_(((StgIndOldGen*)p)->indirectee, finger_print);
+    break;
+
+  case IND_STATIC:
+    GraphFingerPrint_(((StgIndOldGen*)p)->indirectee, finger_print);
+    break;
+
+  case CONSTR_INTLIKE:
+  case CONSTR_CHARLIKE:
+  case CONSTR_STATIC:
+  case CONSTR_NOCAF_STATIC:
+  case THUNK_STATIC:
+  case FUN_STATIC:
+    /* static closures: nothing to follow */
+    break;
+
+  case RET_BCO:
+  case RET_SMALL:
+  case RET_VEC_SMALL:
+  case RET_BIG:
+  case RET_VEC_BIG:
+  case RET_DYN:
+  case UPDATE_FRAME:
+  case STOP_FRAME:
+  case CATCH_FRAME:
+  case SEQ_FRAME:
+    /* stack frames: not followed */
+    break;
+
+  case AP_UPD: /* same as PAPs */
+  case PAP:
+    /* Treat a PAP just like a section of stack, not forgetting to
+     * GraphFingerPrint_ the function pointer too...
+     */
+    {
+      StgPAP* pap = stgCast(StgPAP*,p);
+      char str[6];
+      sprintf(str,"%d",pap->n_args);
+      strcat(finger_print,str);
+      //GraphFingerPrint_(pap->fun, finger_print); // ??
+      break;
+    }
+
+  case ARR_WORDS:
+    {
+      char str[6];
+      /* NOTE(review): str[6] overflows for word counts >= 6 digits */
+      sprintf(str,"%d",((StgArrWords*)p)->words);
+      strcat(finger_print,str);
+    }
+    break;
+
+  case MUT_ARR_PTRS:
+    /* follow everything */
+    {
+      char str[6];
+      sprintf(str,"%d",((StgMutArrPtrs*)p)->ptrs);
+      strcat(finger_print,str);
+    }
+    {
+      StgPtr next;
+      //next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+      //for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+      //	GraphFingerPrint_((StgClosure *)*p, finger_print);
+      //}
+      break;
+    }
+
+  case MUT_ARR_PTRS_FROZEN:
+    /* follow everything */
+    {
+      char str[6];
+      sprintf(str,"%d",((StgMutArrPtrs*)p)->ptrs);
+      strcat(finger_print,str);
+    }
+    {
+      StgPtr start = p, next;
+      //next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+      //for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+      //	GraphFingerPrint_((StgClosure *)*p, finger_print);
+      //}
+      break;
+    }
+
+  case TSO:
+    /* only the thread id is folded in; the link field is not chased */
+    {
+      StgTSO *tso = (StgTSO *)p;
+      char str[6];
+      sprintf(str,"%d",tso->id);
+      strcat(finger_print,str);
+    }
+    //GraphFingerPrint_((StgClosure *)tso->link, indent_level+1);
+    break;
+
+#if defined(GRAN) || defined(PAR)
+  case RBH:
+    {
+      // use this
+      //  StgInfoTable *rip = REVERT_INFOPTR(get_itbl(p));
+    }
+    break;
+#endif
+#if defined(PAR)
+  case BLOCKED_FETCH:
+    break;
+  case FETCH_ME:
+    break;
+  case FETCH_ME_BQ:
+    break;
+#endif
+#ifdef DIST
+  case REMOTE_REF:
+    break;
+#endif
+  case EVACUATED:
+    break;
+
+  default:
+    barf("GraphFingerPrint_: unknown closure %d (%s)",
+	 info -> type, info_type(info));
+  }
+
+}
+# endif /* PAR */
+
+/*
+  Do a sanity check on the whole graph, down to a recursion depth of
+  rec_level.  Same traversal structure as PrintGraph.
+*/
+/*
+  Do a bounded sanity check on the graph rooted at p: check the root
+  closure itself (via checkClosure; see Sanity.c for what is actually
+  checked) and recurse into its pointer fields, decrementing rec_level
+  on every step and stopping when it reaches 0.  The traversal mirrors
+  the case structure of PrintGraph / GraphFingerPrint_.
+
+  Fixes over the previous version:
+   - the empty `default:' label (a label must be followed by a
+     statement; this did not compile) now barfs on unknown closure
+     types, consistent with PrintGraph_ and GraphFingerPrint_;
+   - the payload/array loops now iterate with a local StgPtr cursor,
+     so the stride is one word rather than sizeof(StgClosure);
+   - unused locals (x, j, and the dead q = p save) removed.
+*/
+void
+checkGraph(StgClosure *p, int rec_level)
+{
+  nat i;
+  const StgInfoTable *info;
+
+  if (rec_level==0)          /* recursion budget exhausted */
+    return;
+
+  /* First, the obvious generic checks */
+  ASSERT(p!=(StgClosure*)NULL);
+  checkClosure(p);           /* see Sanity.c for what's actually checked */
+
+  info = get_itbl((StgClosure *)p);
+  /* the rest of this fct recursively traverses the graph */
+  switch (info -> type) {
+
+  case BCO:
+    /* the constant pointers of a BCO are not followed (see the
+       commented-out loop in the other traversal routines) */
+    break;
+
+  case MVAR:
+    /* treat MVars specially: follow head, tail and value, but not the
+     * mut_link field in the middle of the closure
+     */
+    {
+      StgMVar *mvar = ((StgMVar *)p);
+      checkGraph((StgClosure *)mvar->head, rec_level-1);
+      checkGraph((StgClosure *)mvar->tail, rec_level-1);
+      checkGraph((StgClosure *)mvar->value, rec_level-1);
+      break;
+    }
+
+  case THUNK_2_0:
+  case FUN_2_0:
+  case CONSTR_2_0:
+    checkGraph(((StgClosure *)p)->payload[0], rec_level-1);
+    checkGraph(((StgClosure *)p)->payload[1], rec_level-1);
+    break;
+
+  case THUNK_1_0:
+  case FUN_1_0:
+  case CONSTR_1_0:
+    checkGraph(((StgClosure *)p)->payload[0], rec_level-1);
+    break;
+
+  case THUNK_0_1:
+  case FUN_0_1:
+  case CONSTR_0_1:
+  case THUNK_0_2:
+  case FUN_0_2:
+  case CONSTR_0_2:
+    /* no pointers in the payload */
+    break;
+
+  case THUNK_1_1:
+  case FUN_1_1:
+  case CONSTR_1_1:
+    checkGraph(((StgClosure *)p)->payload[0], rec_level-1);
+    break;
+
+  case FUN:
+  case THUNK:
+  case CONSTR:
+    /* generic case: follow every pointer field */
+    for (i=0; i<info->layout.payload.ptrs; i++)
+      checkGraph(((StgClosure *)p)->payload[i], rec_level-1);
+    break;
+
+  case WEAK:
+  case FOREIGN:
+  case STABLE_NAME:
+    {
+      StgPtr q, end;
+
+      end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+      for (q = (StgPtr)((StgClosure *)p)->payload; q < end; q++) {
+	checkGraph(*(StgClosure **)q, rec_level-1);
+      }
+      break;
+    }
+
+  case IND_PERM:
+  case IND_OLDGEN_PERM:
+    checkGraph(((StgIndOldGen *)p)->indirectee, rec_level-1);
+    break;
+
+  case MUT_VAR:
+    /* ignore MUT_CONSs */
+    if (((StgMutVar *)p)->header.info != &stg_MUT_CONS_info) {
+      checkGraph(((StgMutVar *)p)->var, rec_level-1);
+    }
+    break;
+
+  case CAF_BLACKHOLE:
+  case SE_CAF_BLACKHOLE:
+  case SE_BLACKHOLE:
+  case BLACKHOLE:
+  case BLACKHOLE_BQ:
+    break;
+
+  case THUNK_SELECTOR:
+    checkGraph(((StgSelector *)p)->selectee, rec_level-1);
+    break;
+
+  case IND:
+    checkGraph(((StgInd*)p)->indirectee, rec_level-1);
+    break;
+
+  case IND_OLDGEN:
+    checkGraph(((StgIndOldGen*)p)->indirectee, rec_level-1);
+    break;
+
+  case CONSTR_INTLIKE:
+  case CONSTR_CHARLIKE:
+  case CONSTR_STATIC:
+  case CONSTR_NOCAF_STATIC:
+  case THUNK_STATIC:
+  case FUN_STATIC:
+  case IND_STATIC:
+    /* static closures: nothing to follow */
+    break;
+
+  case RET_BCO:
+  case RET_SMALL:
+  case RET_VEC_SMALL:
+  case RET_BIG:
+  case RET_VEC_BIG:
+  case RET_DYN:
+  case UPDATE_FRAME:
+  case STOP_FRAME:
+  case CATCH_FRAME:
+  case SEQ_FRAME:
+    /* stack frames: not followed here */
+    break;
+
+  case AP_UPD: /* same as PAPs */
+  case PAP:
+    /* Treat a PAP just like a section of stack, not forgetting to
+     * checkGraph the function pointer too...
+     */
+    checkGraph(stgCast(StgPAP*,p)->fun, rec_level-1);
+    break;
+
+  case ARR_WORDS:
+    /* non-pointer words only */
+    break;
+
+  case MUT_ARR_PTRS:
+  case MUT_ARR_PTRS_FROZEN:
+    /* follow every element of the (possibly frozen) array; both cases
+       had identical bodies, so they are merged here */
+    {
+      StgPtr q, next;
+
+      next = (StgPtr)p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+      for (q = (P_)((StgMutArrPtrs *)p)->payload; q < next; q++) {
+	checkGraph(*(StgClosure **)q, rec_level-1);
+      }
+      break;
+    }
+
+  case TSO:
+    /* chase the link field for any TSOs on the same queue */
+    checkGraph((StgClosure *)((StgTSO *)p)->link, rec_level-1);
+    break;
+
+#if defined(GRAN) || defined(PAR)
+  case RBH:
+    break;
+#endif
+#if defined(PAR)
+  case BLOCKED_FETCH:
+  case FETCH_ME:
+  case FETCH_ME_BQ:
+    break;
+#endif
+  case EVACUATED:
+    /* an EVACUATED closure must never be reachable from live data */
+    barf("checkGraph: found EVACUATED closure %p (%s)",
+	 p, info_type(p));
+    break;
+
+  default:
+    /* previously an empty label (a syntax error); bail out loudly on
+       unknown closure types, as the sibling traversal routines do */
+    barf("checkGraph: unknown closure %d (%s)",
+	 info -> type, info_type(p));
+  }
+}
+
+#endif /* GRAN */
+
+#endif /* GRAN || PAR */
+
+//@node End of File, , Printing Packet Contents, Debugging routines for GranSim and GUM
+//@subsection End of File
diff --git a/rts/parallel/ParallelDebug.h b/rts/parallel/ParallelDebug.h
new file mode 100644
index 0000000000..f8aaeb85d4
--- /dev/null
+++ b/rts/parallel/ParallelDebug.h
@@ -0,0 +1,79 @@
+/*
+ Time-stamp: <Tue Mar 06 2001 00:25:14 Stardate: [-30]6285.08 hwloidl>
+
+ Prototypes of all parallel debugging functions.
+*/
+
+#ifndef PARALLEL_DEBUG_H
+#define PARALLEL_DEBUG_H
+
+#if defined(DEBUG) && (defined(GRAN) || defined(PAR))
+/* max length of the string holding a finger-print for a graph */
+#define MAX_FINGER_PRINT_LEN 10000
+// (10*RtsFlags.ParFlags.packBufferSize)
+#endif
+
+#if defined(DEBUG) && defined(GRAN)
+/* GranSim-only debugging entry points; the terse GP/GT/GN/... names
+   are presumably shorthands for interactive use from a debugger —
+   TODO confirm */
+void G_PRINT_NODE(StgClosure* node);
+void G_PPN(StgClosure* node);
+void G_INFO_TABLE(StgClosure* node);
+void G_CURR_THREADQ(StgInt verbose);
+void G_THREADQ(StgTSO* closure, StgInt verbose);
+void G_TSO(StgTSO* closure, StgInt verbose);
+void G_EVENT(rtsEventQ event, StgInt verbose);
+void G_EVENTQ(StgInt verbose);
+void G_PE_EQ(PEs pe, StgInt verbose);
+void G_SPARK(rtsSparkQ spark, StgInt verbose);
+void G_SPARKQ(rtsSparkQ spark, StgInt verbose);
+void G_CURR_SPARKQ(StgInt verbose);
+void G_PROC(StgInt proc, StgInt verbose);
+void GP(StgInt proc);
+void GCP(void);
+void GT(StgPtr tso);
+void GCT(void);
+void GEQ(void);
+void GTQ(PEs p);
+void GCTQ(void);
+void GSQ(PEs p);
+void GCSQ(void);
+void GN(StgPtr node);
+void GIT(StgPtr node);
+#endif
+
+#if defined(GRAN) || defined(PAR)
+
+/* closure-inspection helpers: render a closure's type/header as text */
+char *display_info_type(StgClosure *closure, char *str);
+void info_hdr_type(StgClosure *closure, char *res);
+char *info_type(StgClosure *closure);
+char *info_type_by_ip(StgInfoTable *ip);
+
+/* graph / packet dumping and checking routines */
+void PrintPacket(rtsPackBuffer *buffer);
+void PrintGraph(StgClosure *p, int indent_level);
+void GraphFingerPrint(StgClosure *p, char *finger_print);
+void checkGraph(StgClosure *p, int rec_level);
+
+void checkPacket(rtsPackBuffer *packBuffer);
+
+#endif /* GRAN || PAR */
+
+#if defined(PAR)
+
+/* don't want to import Schedule.h and Sanity.h everywhere */
+extern void print_bq (StgClosure *node);
+extern void checkBQ (StgBlockingQueueElement *bqe, StgClosure *closure);
+
+/* global-address (GALA/LAGA) table debugging helpers */
+void checkGAGAMap(globalAddr *gagamap, int nGAs);
+extern rtsBool isOnLiveIndTable(globalAddr *ga);
+extern void rebuildGAtables(rtsBool full);
+extern void rebuildLAGAtable(void);
+extern void checkLAGAtable(rtsBool check_closures);
+extern void checkHeapChunk(StgPtr start, StgPtr end);
+extern void printGA (globalAddr *ga);
+extern void printGALA (GALA *gala);
+extern void printLiveIndTable(void);
+extern void printRemoteGATable(void);
+extern void printLAGAtable(void);
+
+#endif
+
+#endif /* PARALLEL_DEBUG_H */
diff --git a/rts/parallel/ParallelRts.h b/rts/parallel/ParallelRts.h
new file mode 100644
index 0000000000..d421296d19
--- /dev/null
+++ b/rts/parallel/ParallelRts.h
@@ -0,0 +1,253 @@
+/* --------------------------------------------------------------------------
+ Time-stamp: <Tue Mar 06 2001 00:25:50 Stardate: [-30]6285.08 hwloidl>
+
+ Variables and functions specific to the parallel RTS (i.e. GUM or GranSim)
+ ----------------------------------------------------------------------- */
+
+#ifndef PARALLEL_RTS_H
+#define PARALLEL_RTS_H
+
+#include "ParTicky.h"
+
+/* HWL HACK: compile time sanity checks; shouldn't be necessary at all */
+#if defined(PAR) && defined(GRAN)
+# error "Both PAR and GRAN defined"
+#endif
+
+#if defined(DEBUG)
+/* Paranoia debugging: we add an end-of-buffer marker to every pack buffer
+ (only when sanity checking RTS is enabled, of course) */
+#define DEBUG_HEADROOM 1
+#define END_OF_BUFFER_MARKER 0x1111bbbb
+#define GARBAGE_MARKER 0x1111eeee
+#else
+#define DEBUG_HEADROOM 0
+#endif /* DEBUG */
+
+#if defined(GRAN) || defined(PAR)
+
+#if defined(GRAN)
+
+/* Statistics info */
+extern nat tot_packets, tot_packet_size, tot_cuts, tot_thunks;
+
+/* Pack.c */
+rtsPackBuffer *PackNearbyGraph(StgClosure* closure, StgTSO* tso,
+ nat *packBufferSize, GlobalTaskId dest);
+rtsPackBuffer *PackOneNode(StgClosure* closure, StgTSO* tso,
+ nat *packBufferSize);
+rtsPackBuffer *PackTSO(StgTSO *tso, nat *packBufferSize);
+rtsPackBuffer *PackStkO(StgPtr stko, nat *packBufferSize);
+void PackFetchMe(StgClosure *closure);
+
+/* Unpack.c */
+StgClosure* UnpackGraph(rtsPackBuffer* buffer);
+void InitPendingGABuffer(nat size);
+
+/* RBH.c */
+StgClosure *convertToRBH(StgClosure *closure);
+void convertFromRBH(StgClosure *closure);
+
+/* HLComms.c */
+rtsFetchReturnCode blockFetch(StgTSO* tso, PEs proc, StgClosure* bh);
+void blockThread(StgTSO *tso);
+
+#endif
+#if defined(PAR)
+
+/* Statistics info */
+
+/* Global structure for collecting run-time statistics of the parallel
+   system: GA-table management, spark/thread queues, packing and
+   messages.  Field-name prefixes: tot_* are running totals and cnt_*
+   event counts; res_* / rec_* presumably record per-interval or
+   received-side values — TODO confirm against the code that fills
+   this structure. */
+typedef struct GlobalParStats_ {
+  /* GALA and LAGA table info */
+  nat tot_mark_GA, tot_rebuild_GA, tot_free_GA,
+      res_mark_GA, res_rebuild_GA, res_free_GA,
+      cnt_mark_GA, cnt_rebuild_GA, cnt_free_GA,
+      res_size_GA, tot_size_GA, local_alloc_GA, tot_global, tot_local;
+
+  /* time spent managing the GAs */
+  double time_mark_GA, time_rebuild_GA;
+
+  /* spark queue stats */
+  nat res_sp, tot_sp, cnt_sp, emp_sp;
+  // nat tot_sq_len, tot_sq_probes, tot_sparks;
+  /* thread queue stats */
+  nat res_tp, tot_tp, cnt_tp, emp_tp;
+  //nat tot_add_threads, tot_tq_len, non_end_add_threads;
+
+  /* packet statistics */
+  nat tot_packets, tot_packet_size, tot_thunks,
+      res_packet_size, res_thunks,
+      rec_packets, rec_packet_size, rec_thunks,
+      rec_res_packet_size, rec_res_thunks;
+  /* time spent packing stuff */
+  double time_pack, time_unpack;
+
+  /* thread stats */
+  nat tot_threads_created;
+
+  /* spark stats */
+  //nat pruned_sparks, withered_sparks;
+  nat tot_sparks_created, tot_sparks_ignored, tot_sparks_marked,
+      res_sparks_created, res_sparks_ignored, res_sparks_marked; // , sparks_created_on_PE[MAX_PROC];
+  double time_sparks;
+
+  /* scheduling stats */
+  nat tot_yields, tot_stackover, tot_heapover;
+
+  /* message statistics */
+  nat tot_fish_mess, tot_fetch_mess, tot_resume_mess, tot_schedule_mess;
+  nat rec_fish_mess, rec_fetch_mess, rec_resume_mess, rec_schedule_mess;
+#if defined(DIST)
+  nat tot_reval_mess;
+  nat rec_reval_mess;
+#endif
+
+  /* blocking queue statistics
+     rtsTime tot_bq_processing_time;
+     nat tot_bq_len, tot_bq_len_local, tot_awbq, tot_FMBQs;
+  */
+
+  /* specialised info on arrays (for GPH/Maple mainly) */
+  nat tot_arrs, tot_arr_size;
+} GlobalParStats;
+
+extern GlobalParStats globalParStats;
+
+void globalParStat_exit(void);
+
+/* Pack.c */
+rtsBool InitPackBuffer(void);
+rtsPackBuffer *PackNearbyGraph(StgClosure* closure, StgTSO* tso,
+ nat *packBufferSize, GlobalTaskId dest);
+
+/* Unpack.c */
+void CommonUp(StgClosure *src, StgClosure *dst);
+StgClosure *UnpackGraph(rtsPackBuffer *buffer, globalAddr **gamap,
+ nat *nGAs);
+
+/* RBH.c */
+StgClosure *convertToRBH(StgClosure *closure);
+void convertToFetchMe(StgRBH *rbh, globalAddr *ga);
+
+/* HLComms.c */
+void blockFetch(StgBlockedFetch *bf, StgClosure *bh);
+void blockThread(StgTSO *tso);
+
+/* Global.c */
+void GALAdeprecate(globalAddr *ga);
+
+/* HLComms.c */
+nat pending_fetches_len(void);
+
+/* ParInit.c */
+void initParallelSystem(void);
+void shutdownParallelSystem(StgInt n);
+void synchroniseSystem(void);
+void par_exit(I_);
+
+#endif
+
+/* this routine should be moved to a more general module; currently in Pack.c
+StgInfoTable* get_closure_info(StgClosure* node,
+ nat *size, nat *ptrs, nat *nonptrs, nat *vhs,
+ char *info_hdr_ty);
+*/
+void doGlobalGC(void);
+
+//@node GC routines, Debugging routines, Spark handling routines
+//@subsection GC routines
+
+#if defined(PAR)
+/* HLComms.c */
+void freeRemoteGA(int pe, globalAddr *ga);
+void sendFreeMessages(void);
+void markPendingFetches(rtsBool major_gc);
+
+/* Global.c */
+void markLocalGAs(rtsBool full);
+void RebuildGAtables(rtsBool full);
+void RebuildLAGAtable(void);
+#endif
+
+//@node Debugging routines, Generating .gr profiles, GC routines
+//@subsection Debugging routines
+
+#if defined(PAR)
+void printGA (globalAddr *ga);
+void printGALA (GALA *gala);
+void printLAGAtable(void);
+
+rtsBool isOnLiveIndTable(globalAddr *ga);
+rtsBool isOnRemoteGATable(globalAddr *ga);
+void checkFreeGALAList(void);
+void checkFreeIndirectionsList(void);
+#endif
+
+//@node Generating .gr profiles, Index, Debugging routines
+//@subsection Generating .gr profiles
+
+#define STATS_FILENAME_MAXLEN 128
+
+/* Where to write the log file */
+//@cindex gr_file
+//@cindex gr_filename
+extern FILE *gr_file;
+extern char gr_filename[STATS_FILENAME_MAXLEN];
+
+//@cindex init_gr_stats
+//@cindex init_gr_simulation
+//@cindex end_gr_simulation
+void init_gr_stats (void);
+void init_gr_simulation(int rts_argc, char *rts_argv[],
+ int prog_argc, char *prog_argv[]);
+void end_gr_simulation(void);
+
+// TODO: move fcts in here (as static inline)
+StgInfoTable* get_closure_info(StgClosure* node, nat *size, nat *ptrs, nat *nonptrs, nat *vhs, char *info_hdr_ty);
+rtsBool IS_BLACK_HOLE(StgClosure* node);
+StgClosure *IS_INDIRECTION(StgClosure* node) ;
+StgClosure *UNWIND_IND (StgClosure *closure);
+
+
+#endif /* defined(PAR) || defined(GRAN) */
+
+//@node Common macros, Index, Generating .gr profiles
+//@subsection Common macros
+
+#define LOOKS_LIKE_PTR(r) \
+ (LOOKS_LIKE_STATIC_CLOSURE(r) || \
+ ((HEAP_ALLOCED(r) && Bdescr((P_)r)->free != (void *)-1)))
+
+/* see Sanity.c for this kind of test; doing this in these basic fcts
+ is paranoid (nuke it after debugging!)
+*/
+
+/* pathetic version of the check whether p can be a closure */
+#define LOOKS_LIKE_COOL_CLOSURE(p) 1
+
+//LOOKS_LIKE_GHC_INFO(get_itbl(p))
+
+ /* Is it a static closure (i.e. in the data segment)? */ \
+ /*
+#define LOOKS_LIKE_COOL_CLOSURE(p) \
+ ((LOOKS_LIKE_STATIC(p)) ? \
+ closure_STATIC(p) \
+ : !closure_STATIC(p) && LOOKS_LIKE_PTR(p))
+ */
+
+#endif /* PARALLEL_RTS_H */
+
+//@node Index, , Index
+//@subsection Index
+
+//@index
+//* IS_BLACK_HOLE:: @cindex\s-+IS_BLACK_HOLE
+//* IS_INDIRECTION:: @cindex\s-+IS_INDIRECTION
+//* end_gr_simulation:: @cindex\s-+end_gr_simulation
+//* get_closure_info:: @cindex\s-+get_closure_info
+//* gr_file:: @cindex\s-+gr_file
+//* gr_filename:: @cindex\s-+gr_filename
+//* init_gr_simulation:: @cindex\s-+init_gr_simulation
+//* unwindInd:: @cindex\s-+unwindInd
+//@end index
diff --git a/rts/parallel/RBH.c b/rts/parallel/RBH.c
new file mode 100644
index 0000000000..1612209027
--- /dev/null
+++ b/rts/parallel/RBH.c
@@ -0,0 +1,337 @@
+/*
+ Time-stamp: <Tue Mar 13 2001 19:07:13 Stardate: [-30]6323.98 hwloidl>
+
+ Revertible Black Hole Manipulation.
+ Used in GUM and GranSim during the packing of closures. These black holes
+ must be revertible because a GC might occur while the packet is being
+ transmitted. In this case all RBHs have to be reverted.
+ */
+
+#if defined(PAR) || defined(GRAN) /* whole file */
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+# if defined(DEBUG)
+# include "ParallelDebug.h"
+# endif
+#include "Storage.h" // for recordMutable
+#include "StgMacros.h" // inlined IS_... fcts
+
+/*
+ Turn a closure into a revertible black hole. After the conversion, the
+ first two words of the closure (after the fixed header, of course) will
+ be a link to the mutables list (if appropriate for the garbage
+ collector), and a pointer to the blocking queue. The blocking queue is
+ terminated by a 2-word SPEC closure which holds the original contents of
+ the first two words of the closure.
+*/
+
+//@menu
+//* Externs and prototypes::
+//* Conversion Functions::
+//* Index::
+//@end menu
+
+//@node Externs and prototypes, Conversion Functions
+//@section Externs and prototypes
+
+EXTFUN(stg_RBH_Save_0_info);
+EXTFUN(stg_RBH_Save_1_info);
+EXTFUN(stg_RBH_Save_2_info);
+
+//@node Conversion Functions, Index, Externs and prototypes
+//@section Conversion Functions
+
+/*
+ A closure is turned into an RBH upon packing it (see PackClosure in Pack.c).
+ This is needed in case we have to do a GC before the packet is turned
+ into a graph on the PE receiving the packet.
+*/
+//@cindex convertToRBH
+/*
+  Turn a (thunk) closure into a revertible black hole, in place.
+  The first two payload words of the closure (blocking_queue and
+  mut_link, once viewed as an RBH) are saved in a freshly allocated
+  RBH_Save_N closure hung off the RBH's blocking queue, so the
+  original closure can be reconstructed later.  Returns NULL if the
+  allocation of the RBH_Save closure fails — the caller must then
+  trigger a GC and retry; otherwise returns the converted closure.
+*/
+StgClosure *
+convertToRBH(closure)
+StgClosure *closure;
+{
+  StgRBHSave *rbh_save;
+  StgInfoTable *info_ptr, *rbh_info_ptr, *old_info;
+  nat size, ptrs, nonptrs, vhs;
+  char str[80];
+
+  /*
+     Closure layout before this routine runs amuck:
+       +-------------------
+       |    HEADER   | DATA ...
+       +-------------------
+       | FIXED_HS |
+  */
+  /*
+     Turn closure into an RBH.  This is done by modifying the info_ptr,
+     grabbing the info_ptr of the RBH for this closure out of its
+     ITBL.  Additionally, we have to save the words from the closure, which
+     will hold the link to the blocking queue.  For this purpose we use the
+     RBH_Save_N closures, with N being the number of pointers for this
+     closure.  */
+  IF_GRAN_DEBUG(pack,
+		belch("*>:: %p (%s): Converting closure into an RBH",
+		      closure, info_type(closure)));
+  IF_PAR_DEBUG(pack,
+	       belch("*>:: %p (%s): Converting closure into an RBH",
+		     closure, info_type(closure)));
+
+  /* only thunks may be blackholed this way */
+  ASSERT(closure_THUNK(closure));
+
+  IF_GRAN_DEBUG(pack,
+		old_info = get_itbl(closure));
+
+  /* Allocate a new closure for the holding data ripped out of closure */
+  if ((rbh_save = (StgRBHSave *)allocate(_HS + 2)) == NULL)
+    return NULL;  /* have to Garbage Collect; check that in the caller! */
+
+  info_ptr = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+  ASSERT(size >= _HS+MIN_UPD_SIZE);
+
+  /* Fill in the RBH_Save closure with the original data from closure */
+  rbh_save->payload[0] = (StgPtr) ((StgRBH *)closure)->blocking_queue;
+  rbh_save->payload[1] = (StgPtr) ((StgRBH *)closure)->mut_link;
+
+  /* Set the info_ptr for the rbh_Save closure according to the number of
+     pointers in the original */
+
+  rbh_info_ptr = (StgInfoTable *) (ptrs == 0 ? &stg_RBH_Save_0_info :
+			           ptrs == 1 ? &stg_RBH_Save_1_info :
+			                       &stg_RBH_Save_2_info);
+  SET_INFO(rbh_save, rbh_info_ptr);
+  /* same bitmask as the original closure */
+  SET_GRAN_HDR(rbh_save, PROCS(closure));
+
+  /* Init the blocking queue of the RBH and have it point to the saved data */
+  ((StgRBH *)closure)->blocking_queue = (StgBlockingQueueElement *)rbh_save;
+
+  ASSERT(LOOKS_LIKE_GHC_INFO(RBH_INFOPTR(get_itbl(closure))));
+  /* Turn the closure into a RBH;  a great system, indeed! */
+  SET_INFO(closure, RBH_INFOPTR(get_itbl(closure)));
+
+  /*
+    add closure to the mutable list!
+    do this after having turned the closure into an RBH, because an
+    RBH is mutable but the closure it was before wasn't mutable
+  */
+  recordMutable((StgMutClosure *)closure);
+
+  //IF_GRAN_DEBUG(pack,
+		/* sanity check; make sure that reverting the RBH yields the
+		   orig closure, again */
+		//ASSERT(REVERT_INFOPTR(get_itbl(closure))==old_info));
+
+  /*
+    Closure layout after this routine has run amuck:
+       +---------------------
+       | RBH-HEADER   |   |   | ...
+       +--------------|---|--
+       | FIXED_HS     |   v
+       |         Mutable-list ie another StgMutClosure
+       v
+       +---------
+       | RBH_SAVE with 0-2 words of DATA
+       +---------
+  */
+
+  return closure;
+}
+
+/*
+  An RBH closure is turned into a FETCH_ME when receiving an ACK message
+ indicating that the transferred closure has been unpacked on the other PE
+ (see processAck in HLComms.c). The ACK also contains the new GA of the
+ closure to which the FETCH_ME closure has to point.
+
+ Converting a closure to a FetchMe is trivial, unless the closure has
+ acquired a blocking queue. If that has happened, we first have to awaken
+ the blocking queue. What a nuisance! Fortunately, @AwakenBlockingQueue@
+ should now know what to do.
+
+ A note on GrAnSim: In GrAnSim we don't have FetchMe closures. However,
+ we have to turn a RBH back to its original form when the simulated
+ transfer of the closure has been finished. Therefore we need the
+ @convertFromRBH@ routine below. After converting the RBH back to its
+ original form and awakening all TSOs, the first TSO will reenter the
+ closure which is now local and carry on merrily reducing it (the other
+ TSO will be less merrily blocked on the now local closure; we're costing
+ the difference between local and global blocks in the BQ code). -- HWL
+*/
+
+# if defined(PAR)
+
+EXTFUN(stg_FETCH_ME_info);
+
+//@cindex convertToFetchMe
+/*
+  Turn an RBH into a FETCH_ME closure pointing at the global address
+  ga; called on receipt of an ACK message (see processAck in
+  HLComms.c).  The RBH's blocking queue is captured *before* the info
+  pointer is overwritten, and any TSOs / BLOCKED_FETCHes found on it
+  are woken up after the conversion.
+*/
+void
+convertToFetchMe(rbh, ga)
+StgRBH *rbh;
+globalAddr *ga;
+{
+  // StgInfoTable *ip = get_itbl(rbh);
+  StgBlockingQueueElement *bqe = rbh->blocking_queue;
+
+  ASSERT(get_itbl(rbh)->type==RBH);
+
+  IF_PAR_DEBUG(pack,
+	       belch("**:: Converting RBH %p (%s) into a FETCH_ME for GA ((%x, %d, %x))",
+		     rbh, info_type(rbh),
+		     ga->payload.gc.gtid, ga->payload.gc.slot, ga->weight));
+
+  /* put closure on mutables list, while it is still a RBH */
+  recordMutable((StgMutClosure *)rbh);
+
+  /* actually turn it into a FETCH_ME */
+  SET_INFO((StgClosure *)rbh, &stg_FETCH_ME_info);
+
+  /* set the global pointer in the FETCH_ME closure to the given value */
+  ((StgFetchMe *)rbh)->ga = ga;
+
+  IF_PAR_DEBUG(pack,
+	       if (get_itbl(bqe)->type==TSO || get_itbl(bqe)->type==BLOCKED_FETCH)
+	         belch("**:: Awakening non-empty BQ of RBH closure %p (first TSO is %d (%p)",
+		     rbh, ((StgTSO *)bqe)->id, ((StgTSO *)bqe)));
+
+  /* awaken all TSOs and BLOCKED_FETCHES on the blocking queue */
+  if (get_itbl(bqe)->type==TSO || get_itbl(bqe)->type==BLOCKED_FETCH)
+    awakenBlockedQueue(bqe, (StgClosure *)rbh);
+}
+# else /* GRAN */
+/* Prototype */
+// void UnlinkFromMUT(StgPtr closure);
+
+/*
+ This routine in fact reverts the RBH into its original form; this code
+ should be of interest for GUM, too, but is not needed in the current version.
+ convertFromRBH is called where GUM uses convertToFetchMe.
+*/
+void
+convertFromRBH(closure)
+StgClosure *closure;
+{
+ StgBlockingQueueElement *bqe = ((StgRBH*)closure)->blocking_queue;
+ char str[NODE_STR_LEN]; // debugging only
+ StgInfoTable *rip = REVERT_INFOPTR(get_itbl(closure)); // debugging only
+
+ IF_GRAN_DEBUG(pack,
+ if (get_itbl(bqe)->type==TSO)
+ sprintf(str, "%d (%p)",
+ ((StgTSO *)bqe)->id, ((StgTSO *)bqe));
+ else
+ strcpy(str, "empty");
+ belch("*<:: Reverting RBH %p (%s) into a ??? closure again; BQ start: %s",
+ closure, info_type(closure), str));
+
+ ASSERT(get_itbl(closure)->type==RBH);
+
+ /* awakenBlockedQueue also restores the RBH_Save closure
+ (have to call it even if there are no TSOs in the queue!) */
+ awakenBlockedQueue(bqe, closure);
+
+ /* Put back old info pointer (grabbed from the RBH's info table).
+ We do that *after* awakening the BQ to be sure node is an RBH when
+ calling awakenBlockedQueue (different in GUM!)
+ */
+ SET_INFO(closure, REVERT_INFOPTR(get_itbl(closure)));
+
+ /* put closure on mutables list */
+ recordMutable((StgMutClosure *)closure);
+
+# if 0 /* rest of this fct */
+ /* ngoq ngo' */
+ /* FETCHME_GA(closure) = ga; */
+ if (IS_MUTABLE(INFO_PTR(bqe))) {
+ PROC old_proc = CurrentProc, /* NB: For AwakenBlockingQueue, */
+ new_proc = where_is(closure); /* CurentProc must be where */
+ /* closure lives. */
+ CurrentProc = new_proc;
+
+# if defined(GRAN_CHECK)
+ if (RTSflags.GranFlags.debug & 0x100)
+ fprintf(stderr,"===== AwBQ of node 0x%lx (%s) [PE %2u]\n",
+ closure, (isSpec ? "SPEC_RBH" : "GEN_RBH"), new_proc);
+# endif
+
+ rbh_save = AwakenBlockingQueue(bqe); /* AwakenBlockingQueue(bqe); */
+ CurrentProc = old_proc;
+ } else {
+ rbh_save = bqe;
+ }
+
+ /* Put data from special RBH save closures back into the closure */
+ if ( rbh_save == NULL ) {
+ fprintf(stderr,"convertFromRBH: No RBH_Save_? closure found at end of BQ!\n");
+ EXIT(EXIT_FAILURE);
+ } else {
+ closure[isSpec ? SPEC_HS : GEN_HS] = rbh_save[SPEC_HS];
+ closure[(isSpec ? SPEC_HS : GEN_HS) + 1] = rbh_save[SPEC_HS + 1];
+ }
+# endif /* 0 */
+
+# if 0 && (defined(GCap) || defined(GCgn))
+ /* ngoq ngo' */
+ /* If we convert from an RBH in the old generation,
+ we have to make sure it goes on the mutables list */
+
+ if(closure <= StorageMgrInfo.OldLim) {
+ if (IS_MUTABLE(INFO_PTR(closure)) && MUT_LINK(closure) == MUT_NOT_LINKED) {
+ MUT_LINK(closure) = (StgWord) StorageMgrInfo.OldMutables;
+ StorageMgrInfo.OldMutables = closure;
+ }
+ }
+# endif /* 0 */
+}
+#endif /* PAR */
+
+/* Remove closure from the mutables list */
+#if 0
+/* ngoq ngo' */
+void
+UnlinkFromMUT(StgPtr closure)
+{
+ StgPtr curr = StorageMgrInfo.OldMutables, prev = NULL;
+
+ while (curr != NULL && curr != closure) {
+ ASSERT(MUT_LINK(curr)!=MUT_NOT_LINKED);
+ prev=curr;
+ curr=MUT_LINK(curr);
+ }
+ if (curr==closure) {
+ if (prev==NULL)
+ StorageMgrInfo.OldMutables = MUT_LINK(curr);
+ else
+ MUT_LINK(prev) = MUT_LINK(curr);
+ MUT_LINK(curr) = MUT_NOT_LINKED;
+ }
+
+# if 0 && (defined(GCap) || defined(GCgn))
+ {
+ closq newclos;
+ extern closq ex_RBH_q;
+
+ newclos = (closq) stgMallocBytes(sizeof(struct clos), "UnlinkFromMUT");
+ CLOS_CLOSURE(newclos) = closure;
+ CLOS_PREV(newclos) = NULL;
+ CLOS_NEXT(newclos) = ex_RBH_q;
+ if (ex_RBH_q!=NULL)
+ CLOS_PREV(ex_RBH_q) = newclos;
+ ex_RBH_q = newclos;
+ }
+# endif
+}
+#endif /* PAR */
+
+#endif /* PAR || GRAN -- whole file */
+
+//@node Index, , Conversion Functions
+//@section Index
+
+//@index
+//* convertToFetchMe:: @cindex\s-+convertToFetchMe
+//* convertToRBH:: @cindex\s-+convertToRBH
+//@end index
diff --git a/rts/parallel/SysMan.c b/rts/parallel/SysMan.c
new file mode 100644
index 0000000000..40bcf6a19e
--- /dev/null
+++ b/rts/parallel/SysMan.c
@@ -0,0 +1,650 @@
+/* ----------------------------------------------------------------------------
+ Time-stamp: <Wed Mar 21 2001 17:16:28 Stardate: [-30]6363.59 hwloidl>
+
+ GUM System Manager Program
+ Handles startup, shutdown and global synchronisation of the parallel system.
+
+ The Parade/AQUA Projects, Glasgow University, 1994-1995.
+ GdH/APART Projects, Heriot-Watt University, Edinburgh, 1997-2000.
+
+ ------------------------------------------------------------------------- */
+
+//@node GUM System Manager Program, , ,
+//@section GUM System Manager Program
+
+//@menu
+//* General docu::
+//* Includes::
+//* Macros etc::
+//* Variables::
+//* Prototypes::
+//* Aux startup and shutdown fcts::
+//* Main fct::
+//* Message handlers::
+//* Auxiliary fcts::
+//* Index::
+//@end menu
+
+//@node General docu, Includes, GUM System Manager Program, GUM System Manager Program
+//@subsection General docu
+
+/*
+The Sysman task currently controls initiation, termination, of a
+parallel Haskell program running under GUM. In the future it may
+control global GC synchronisation and statistics gathering. Based on
+K. Hammond's SysMan.lc in Graph for PVM. SysMan is unusual in that it
+is not part of the executable produced by ghc: it is a free-standing
+program that spawns PVM tasks (logical PEs) to evaluate the
+program. After initialisation it runs in parallel with the PE tasks,
+awaiting messages.
+
+OK children, buckle down for some serious weirdness, it works like this ...
+
+o The argument vector (argv) for SysMan has one the following 2 shapes:
+
+-------------------------------------------------------------------------------
+| SysMan path | debug flag | pvm-executable path | Num. PEs | Program Args ...|
+-------------------------------------------------------------------------------
+
+-------------------------------------------------------------------
+| SysMan path | pvm-executable path | Num. PEs | Program Args ... |
+-------------------------------------------------------------------
+
+The "pvm-executable path" is an absolute path of where PVM stashes the
+code for each PE. The arguments passed on to each PE-executable
+spawned by PVM are:
+
+-------------------------------
+| Num. PEs | Program Args ... |
+-------------------------------
+
+The arguments passed to the Main-thread PE-executable are
+
+-------------------------------------------------------------------
+| main flag | pvm-executable path | Num. PEs | Program Args ... |
+-------------------------------------------------------------------
+
+o SysMan's algorithm is as follows.
+
+o use PVM to spawn (nPE-1) PVM tasks
+o fork SysMan to create the main-thread PE. This permits the main-thread to
+ read and write to stdin and stdout.
+o Wait for all the PE-tasks to reply back saying they are ready and if they were the
+ main thread or not.
+o Broadcast an array of the PE task-ids out to all of the PE-tasks.
+o Enter a loop awaiting incoming messages, e.g. failure, Garbage-collection,
+ termination.
+
+The forked Main-thread algorithm, in SysMan, is as follows.
+
+o disconnects from PVM.
+o sets a flag in argv to indicate that it is the main thread.
+o `exec's a copy of the pvm-executable (i.e. the program being run)
+
+
+The pvm-executable run by each PE-task, is initialised as follows.
+
+o Registers with PVM, obtaining a task-id.
+o If it was main it gets SysMan's task-id from argv otherwise it can use pvm_parent.
+oSends a ready message to SysMan together with a flag indicating if it was main or not.
+o Receives from SysMan the array of task-ids of the other PEs.
+o If the number of task-ids sent was larger than expected then it must have been a task
+ generated after the rest of the program had started, so it sends its own task-id message
+ to all the tasks it was told about.
+o Begins execution.
+
+*/
+
+//@node Includes, Macros etc, General docu, GUM System Manager Program
+//@subsection Includes
+
+/* Evidently not Posix */
+/* #include "PosixSource.h" */
+
+#include "Rts.h"
+#include "ParTypes.h"
+#include "LLC.h"
+#include "Parallel.h"
+#include "ParallelRts.h" // stats only
+
+//@node Macros etc, Variables, Includes, GUM System Manager Program
+//@subsection Macros etc
+
+/* SysMan is put on top of the GHC routine that does the RtsFlags handling.
+ So, we cannot use the standard macros. For the time being we use a macro
+ that is fixed at compile time.
+*/
+
+#ifdef IF_PAR_DEBUG
+#undef IF_PAR_DEBUG
+#endif
+
+/* debugging enabled */
+//#define IF_PAR_DEBUG(c,s) { s; }
+/* debugging disabled */
+#define IF_PAR_DEBUG(c,s) /* nothing */
+
+void *stgMallocBytes (int n, char *msg);
+
+//@node Variables, Prototypes, Macros etc, GUM System Manager Program
+//@subsection Variables
+
+/*
+ The following definitions included so that SysMan can be linked with Low
+ Level Communications module (LLComms). They are not used in SysMan.
+*/
+GlobalTaskId mytid;
+
+static unsigned PEsArrived = 0;
+static GlobalTaskId gtids[MAX_PES];
+static GlobalTaskId sysman_id, sender_id;
+static unsigned PEsTerminated = 0;
+static rtsBool Finishing = rtsFalse;
+static long PEbuffer[MAX_PES];
+nat nSpawn = 0; // current no. of spawned tasks (see gtids)
+nat nPEs = 0; // number of PEs specified on startup
+nat nextPE;
+/* PVM-ish variables */
+char *petask, *pvmExecutable;
+char **pargv;
+int cc, spawn_flag = PvmTaskDefault;
+
+#if 0 && defined(PAR_TICKY)
+/* ToDo: use allGlobalParStats to collect stats of all PEs */
+GlobalParStats *allGlobalParStats[MAX_PES];
+#endif
+
+//@node Prototypes, Aux startup and shutdown fcts, Variables, GUM System Manager Program
+//@subsection Prototypes
+
+/* prototypes for message handlers called from the main loop of SysMan */
+void newPE(int nbytes, int opcode, int sender_id);
+void readyPE(int nbytes, int opcode, int sender_id);
+void finishPE(int nbytes, int opcode, int sender_id, int exit_code);
+
+//@node Aux startup and shutdown fcts, Main fct, Prototypes, GUM System Manager Program
+//@subsection Aux startup and shutdown fcts
+
+/*
+ Create the PE Tasks. We spawn (nPEs-1) pvm threads: the Main Thread
+ (which starts execution and performs IO) is created by forking SysMan
+*/
+static int
+createPEs(int total_nPEs) {
+ int i, spawn_nPEs, iSpawn = 0, nArch, nHost;
+ struct pvmhostinfo *hostp;
+ int sysman_host;
+
+ spawn_nPEs = total_nPEs-1;
+ if (spawn_nPEs > 0) {
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "==== [%x] Spawning %d PEs(%s) ...\n",
+ sysman_id, spawn_nPEs, petask);
+ fprintf(stderr, " args: ");
+ for (i = 0; pargv[i]; ++i)
+ fprintf(stderr, "%s, ", pargv[i]);
+ fprintf(stderr, "\n"));
+
+ pvm_config(&nHost,&nArch,&hostp);
+ sysman_host=pvm_tidtohost(sysman_id);
+
+ /* create PEs on the specific machines in the specified order! */
+ for (i=0; (iSpawn<spawn_nPEs) && (i<nHost); i++)
+ if (hostp[i].hi_tid != sysman_host) {
+ checkComms(pvm_spawn(petask, pargv, spawn_flag+PvmTaskHost,
+ hostp[i].hi_name, 1, gtids+iSpawn),
+ "SysMan startup");
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "==== [%x] Spawned PE %d onto %s\n",
+ sysman_id, i, hostp[i].hi_name));
+ iSpawn++;
+ }
+
+ /* create additional PEs anywhere you like */
+ if (iSpawn<spawn_nPEs) {
+ checkComms(pvm_spawn(petask, pargv, spawn_flag, "",
+ spawn_nPEs-iSpawn, gtids+iSpawn),
+ "SysMan startup");
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Spawned %d additional PEs anywhere\n",
+ sysman_id, spawn_nPEs-iSpawn));
+ }
+ }
+
+#if 0
+ /* old code with random placement of PEs; make that a variant? */
+# error "Broken startup in SysMan"
+ { /* let pvm place the PEs anywhere; not used anymore */
+ checkComms(pvm_spawn(petask, pargv, spawn_flag, "", spawn_nPEs, gtids),"SysMan startup");
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Spawned\n", sysman_id));
+
+ }
+#endif
+
+ // iSpawn=spawn_nPEs;
+
+ return iSpawn;
+}
+
+/*
+ Check if this pvm task is in the list of tasks we spawned and are waiting
+ on, if so then remove it.
+*/
+
+static rtsBool
+alreadySpawned (GlobalTaskId g) {
+ unsigned int i;
+
+ for (i=0; i<nSpawn; i++)
+ if (g==gtids[i]) {
+ nSpawn--;
+ gtids[i] = gtids[nSpawn]; //the last takes its place
+ return rtsTrue;
+ }
+ return rtsFalse;
+}
+
+static void
+broadcastFinish(void) {
+ int i,j;
+ int tids[MAX_PES]; /* local buffer of all surviving PEs */
+
+ for (i=0, j=0; i<nPEs; i++)
+ if (PEbuffer[i])
+ tids[j++]=PEbuffer[i]; //extract valid tids
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Broadcasting Finish to %d PEs; initiating shutdown\n",
+ sysman_id, j));
+
+ /* ToDo: move into LLComms.c */
+ pvm_initsend(PvmDataDefault);
+ pvm_mcast(tids,j,PP_FINISH);
+}
+
+static void
+broadcastPEtids (void) {
+ nat i;
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan sending PE table to all PEs\n", sysman_id);
+ /* debugging */
+ fprintf(stderr,"++++ [%x] PE table as seen by SysMan:\n", mytid);
+ for (i = 0; i < nPEs; i++) {
+ fprintf(stderr,"++++ PEbuffer[%d] = %x\n", i, PEbuffer[i]);
+ }
+ )
+
+ broadcastOpN(PP_PETIDS, PEGROUP, nPEs, &PEbuffer);
+}
+
+//@node Main fct, Message handlers, Aux startup and shutdown fcts, GUM System Manager Program
+//@subsection Main fct
+
+//@cindex main
+int
+main (int argc, char **argv) {
+ int rbufid;
+ int opcode, nbytes, nSpawn;
+ unsigned int i;
+
+ setbuf(stdout, NULL); // disable buffering of stdout
+ setbuf(stderr, NULL); // disable buffering of stderr
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,
+ "==== RFP: GdH enabled SysMan reporting for duty\n"));
+
+ if (argc > 1) {
+ if (*argv[1] == '-') {
+ spawn_flag = PvmTaskDebug;
+ argv[1] = argv[0];
+ argv++; argc--;
+ }
+ sysman_id = pvm_mytid(); /* This must be the first PVM call */
+
+ if (sysman_id<0) {
+ fprintf(stderr, "==== PVM initialisation failure\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ Get the full path and filename of the pvm executable (stashed in some
+ PVM directory), and the number of PEs from the command line.
+ */
+ pvmExecutable = argv[1];
+ nPEs = atoi(argv[2]);
+
+ if (nPEs==0) {
+ /* as usual 0 means infinity: use all PEs specified in PVM config */
+ int nArch, nHost;
+ struct pvmhostinfo *hostp;
+
+ /* get info on PVM config */
+ pvm_config(&nHost,&nArch,&hostp);
+ nPEs=nHost;
+ sprintf(argv[2],"%d",nPEs); /* ToCheck: does this work on all archs */
+ }
+
+ /* get the name of the binary to execute */
+ if ((petask = getenv(PETASK)) == NULL) // PETASK set by driver
+ petask = PETASK;
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] nPEs: %d; executable: |%s|\n",
+ sysman_id, nPEs, petask));
+
+ /* Check that we can create the number of PE and IMU tasks requested.
+ ^^^
+ This comment is most entertaining since we haven't been using IMUs
+ for the last 10 years or so -- HWL */
+ if ((nPEs > MAX_PES) || (nPEs<1)) {
+ fprintf(stderr,"==** SysMan: No more than %d PEs allowed (%d requested)\n Reconfigure GUM setting MAX_PE in ghc/includes/Parallel.h to a higher value\n",
+ MAX_PES, nPEs);
+ exit(EXIT_FAILURE);
+ }
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] is SysMan Task\n", sysman_id));
+
+ /* Initialise the PE task arguments from Sysman's arguments */
+ pargv = argv + 2;
+
+ /* Initialise list of all PE identifiers */
+ PEsArrived=0;
+ nextPE=1;
+ for (i=0; i<nPEs; i++)
+ PEbuffer[i]=0;
+
+ /* start up the required number of PEs */
+ nSpawn = createPEs(nPEs);
+
+ /*
+ Create the MainThread PE by forking SysMan. This arcane coding
+ is required to allow MainThread to read stdin and write to stdout.
+ PWT 18/1/96
+ */
+ //nPEs++; /* Record that the number of PEs is increasing */
+ if ((cc = fork())) {
+ checkComms(cc,"SysMan fork"); /* Parent continues as SysMan */
+
+ PEbuffer[0]=0; /* we accept the first main and assume its valid. */
+ PEsArrived=1; /* assume you've got main */
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Sysman successfully initialized!\n",
+ sysman_id));
+
+//@cindex message handling loop
+ /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
+ /* Main message handling loop */
+ /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
+ /* Process incoming messages */
+ while (1) {
+ if ((rbufid = pvm_recv(ANY_TASK, ANY_OPCODE)) < 0) {
+ pvm_perror("==** Sysman: Receiving Message (pvm_recv)");
+ /* never reached */
+ }
+
+ pvm_bufinfo(rbufid, &nbytes, &opcode, &sender_id);
+
+ /* very low level debugging
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"== [%x] SysMan: Message received by SysMan: rbufid=%x, nbytes = %d, opcode = %x, sender_id = %x\n",
+ sysman_id, rbufid, nbytes, opcode, sender_id));
+ */
+
+ switch (opcode) {
+
+ case PP_NEWPE: /* a new PE is registering for work */
+ newPE(nbytes, opcode, sender_id);
+ break;
+
+ case PP_READY: /* startup complete; let PEs start working */
+ readyPE(nbytes, opcode, sender_id);
+ break;
+
+
+ case PP_GC_INIT: /* start global GC */
+ /* This Function not yet implemented for GUM */
+ fprintf(stderr,"==** Global GC requested by PE %x. Not yet implemented for GUM!\n",
+ sender_id);
+ break;
+
+ case PP_STATS_ON: /* enable statistics gathering */
+ fprintf(stderr,"==** PP_STATS_ON requested by %x. Not yet implemented for GUM!\n",
+ sender_id);
+ break;
+
+ case PP_STATS_OFF: /* disable statistics gathering */
+ fprintf(stderr,"==** PP_STATS_OFF requested by %x. Not yet implemented for GUM!\n",
+ sender_id);
+ break;
+
+ case PP_FINISH:
+ {
+ int exit_code = getExitCode(nbytes, &sender_id);
+ finishPE(nbytes, opcode, sender_id, exit_code);
+ break;
+
+ default:
+ {
+ /*
+ char *opname = GetOpName(opcode);
+ fprintf(stderr,"Sysman: Unrecognised opcode %s (%x)\n",
+ opname,opcode); */
+ fprintf(stderr,"==** Qagh: Sysman: Unrecognised opcode (%x)\n",
+ opcode);
+ }
+ break;
+ } /* switch */
+ } /* else */
+ } /* while 1 */
+ /* end of SysMan!! */
+ } else {
+ /* forked main thread begins here */
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "==== Main Thread PE has been forked; doing an execv(%s,...)\n",
+ pvmExecutable));
+ pvmendtask(); // Disconnect from PVM to avoid confusion:
+ // executable reconnects
+
+ // RFP: assumes that length(arvv[0])>=9 !!!
+ sprintf(argv[0],"-%08X",sysman_id); /*flag that its the Main Thread PE and include sysman's id*/
+ execv(pvmExecutable,argv); /* Parent task becomes Main Thread PE */
+ } /* else */
+ } /* argc > 1 */
+} /* main */
+
+//@node Message handlers, Auxiliary fcts, Main fct, GUM System Manager Program
+//@subsection Message handlers
+
+/*
+ Received PP_NEWPE:
+ A new PE has been added to the configuration.
+*/
+void
+newPE(int nbytes, int opcode, int sender_id) {
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan detected a new host\n",
+ sysman_id));
+
+ /* Determine the new machine... assume its the last on the config list? */
+ if (nSpawn < MAX_PES) {
+ int nArch,nHost;
+ struct pvmhostinfo *hostp;
+
+ /* get conmfiguration of PVM machine */
+ pvm_config(&nHost,&nArch,&hostp);
+ nHost--;
+ checkComms(pvm_spawn(petask, pargv, spawn_flag+PvmTaskHost,
+ hostp[nHost].hi_name, 1, gtids+nSpawn),
+ "SysMan loop");
+ nSpawn++;
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "==== [%x] Spawned onto %s\n",
+ sysman_id, hostp[nHost].hi_name));
+ }
+}
+
+/*
+ Received PP_READY:
+ Let it be known that PE @sender_id@ participates in the computation.
+*/
+void
+readyPE(int nbytes, int opcode, int sender_id) {
+ int i = 0, flag = 1;
+ long isMain;
+ int nArch, nHost;
+ struct pvmhostinfo *hostp;
+
+ //ASSERT(opcode==PP_READY);
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan received PP_READY message from %x\n",
+ sysman_id, sender_id));
+
+ pvm_config(&nHost,&nArch,&hostp);
+
+ GetArg1(isMain);
+
+ //if ((isMain && (PEbuffer[0]==0)) || alreadySpawned(sender_id)) {
+ if (nPEs >= MAX_PES) {
+ fprintf(stderr,"==== [%x] SysMan doesn't need PE %d (max %d PEs allowed)\n",
+ sysman_id, sender_id, MAX_PES);
+ pvm_kill(sender_id);
+ } else {
+ if (isMain) {
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan found Main PE %x\n",
+ sysman_id, sender_id));
+ PEbuffer[0]=sender_id;
+ } else {
+ /* search for PE in list of PEs */
+ for(i=1; i<nPEs; i++)
+ if (PEbuffer[i]==sender_id) {
+ flag=0;
+ break;
+ }
+ /* it's a new PE: add it to the list of PEs */
+ if (flag)
+ PEbuffer[nextPE++] = sender_id;
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan: found PE %d as [%x] on host %s\n",
+ sysman_id, PEsArrived, sender_id, hostp[PEsArrived].hi_name));
+
+ PEbuffer[PEsArrived++] = sender_id;
+ }
+
+
+ /* enable better handling of unexpected terminations */
+ checkComms( pvm_notify(PvmTaskExit, PP_FINISH, 1, &sender_id),
+ "SysMan loop");
+
+ /* finished registration of all PEs => enable notification */
+ if ((PEsArrived==nPEs) && PEbuffer[0]) {
+ checkComms( pvm_notify(PvmHostAdd, PP_NEWPE, -1, 0),
+ "SysMan startup");
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan initialising notificaton for new hosts\n", sysman_id));
+ }
+
+ /* finished notification => send off the PE ids */
+ if ((PEsArrived>=nPEs) && PEbuffer[0]) {
+ if (PEsArrived>nPEs) {
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Weird: %d PEs registered, but we only asked for %d\n", sysman_id, PEsArrived, nPEs));
+ // nPEs=PEsArrived;
+ }
+ broadcastPEtids();
+ }
+ }
+}
+
+/*
+ Received PP_FINISH:
+ Shut down the corresponding PE. Check whether it is a regular shutdown
+ or an uncontrolled termination.
+*/
+void
+finishPE(int nbytes, int opcode, int sender_id, int exitCode) {
+ int i;
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan received PP_FINISH message from %x (exit code: %d)\n",
+ sysman_id, sender_id, exitCode));
+
+ /* Is it relevant to us? Count the first message */
+ for (i=0; i<nPEs; i++)
+ if (PEbuffer[i] == sender_id) {
+ PEsTerminated++;
+ PEbuffer[i]=0;
+
+ /* handle exit code */
+ if (exitCode<0) { /* a task exit before a controlled finish? */
+ fprintf(stderr,"==== [%x] Termination at %x with exit(%d)\n",
+ sysman_id, sender_id, exitCode);
+ } else if (exitCode>0) { /* an abnormal exit code? */
+ fprintf(stderr,"==== [%x] Uncontrolled termination at %x with exit(%d)\n",
+ sysman_id, sender_id, exitCode);
+ } else if (!Finishing) { /* exitCode==0 which is good news */
+ if (i!=0) { /* someone other than main PE terminated first? */
+ fprintf(stderr,"==== [%x] Unexpected early termination at %x\n",
+ sysman_id, sender_id);
+ } else {
+ /* start shutdown by broadcasting FINISH to other PEs */
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Initiating shutdown (requested by [%x] RIP) (exit code: %d)\n", sysman_id, sender_id, exitCode));
+ Finishing = rtsTrue;
+ broadcastFinish();
+ }
+ } else {
+ /* we are in a shutdown already */
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Finish from %x during shutdown (%d PEs terminated so far; %d total)\n",
+ sysman_id, sender_id, PEsTerminated, nPEs));
+ }
+
+ if (PEsTerminated >= nPEs) {
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Global Shutdown, Goodbye!! (SysMan has received FINISHes from all PEs)\n", sysman_id));
+ //broadcastFinish();
+ /* received finish from everybody; now, we can exit, too */
+ exit(EXIT_SUCCESS); /* Qapla'! */
+ }
+ }
+}
+
+//@node Auxiliary fcts, Index, Message handlers, GUM System Manager Program
+//@subsection Auxiliary fcts
+
+/* Needed here because its used in loads of places like LLComms etc */
+
+//@cindex stg_exit
+
+/*
+ * called from STG-land to exit the program
+ */
+
+void
+stg_exit(I_ n)
+{
+ fprintf(stderr, "==// [%x] %s in SysMan code; sending PP_FINISH to all PEs ...\n",
+ mytid,(n!=0)?"FAILURE":"FINISH");
+ broadcastFinish();
+ //broadcastFinish();
+ pvm_exit();
+ exit(n);
+}
+
+//@node Index, , Auxiliary fcts, GUM System Manager Program
+//@subsection Index
+
+//@index
+//* main:: @cindex\s-+main
+//* message handling loop:: @cindex\s-+message handling loop
+//* stgMallocBytes:: @cindex\s-+stgMallocBytes
+//* stg_exit:: @cindex\s-+stg_exit
+//@end index
diff --git a/rts/posix/GetTime.c b/rts/posix/GetTime.c
new file mode 100644
index 0000000000..3a0764cb91
--- /dev/null
+++ b/rts/posix/GetTime.c
@@ -0,0 +1,141 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2005
+ *
+ * Machine-dependent time measurement functions
+ *
+ * ---------------------------------------------------------------------------*/
+
+// Not POSIX, due to use of ru_majflt in getPageFaults()
+// #include "PosixSource.h"
+
+#include "Rts.h"
+#include "GetTime.h"
+
+#ifdef HAVE_TIME_H
+# include <time.h>
+#endif
+
+#ifdef HAVE_SYS_TIME_H
+# include <sys/time.h>
+#endif
+
+#if HAVE_SYS_RESOURCE_H
+# include <sys/resource.h>
+#endif
+
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif
+
+#ifdef HAVE_SYS_TIMES_H
+# include <sys/times.h>
+#endif
+
+#if ! ((defined(HAVE_GETRUSAGE) && !irix_HOST_OS) || defined(HAVE_TIMES))
+#error No implementation for getProcessCPUTime() available.
+#endif
+
+#if defined(HAVE_GETTIMEOFDAY) && defined(HAVE_GETRUSAGE) && !irix_HOST_OS
+// we'll implement getProcessCPUTime() and getProcessElapsedTime()
+// separately, using getrusage() and gettimeofday() respectively
+
+Ticks getProcessCPUTime(void)
+{
+ struct rusage t;
+ getrusage(RUSAGE_SELF, &t);
+ return (t.ru_utime.tv_sec * TICKS_PER_SECOND +
+ ((Ticks)t.ru_utime.tv_usec * TICKS_PER_SECOND)/1000000);
+}
+
+Ticks getProcessElapsedTime(void)
+{
+ struct timeval tv;
+ gettimeofday(&tv, (struct timezone *) NULL);
+ return (tv.tv_sec * TICKS_PER_SECOND +
+ ((Ticks)tv.tv_usec * TICKS_PER_SECOND)/1000000);
+}
+
+void getProcessTimes(Ticks *user, Ticks *elapsed)
+{
+ *user = getProcessCPUTime();
+ *elapsed = getProcessElapsedTime();
+}
+
+#elif defined(HAVE_TIMES)
+
+// we'll use the old times() API.
+
+Ticks getProcessCPUTime(void)
+{
+ Ticks user, elapsed;
+ getProcessTimes(&user,&elapsed);
+ return user;
+}
+
+Ticks getProcessElapsedTime(void)
+{
+ Ticks user, elapsed;
+ getProcessTimes(&user,&elapsed);
+ return elapsed;
+}
+
+void getProcessTimes(Ticks *user, Ticks *elapsed)
+{
+ static nat ClockFreq = 0;
+
+ if (ClockFreq == 0) {
+#if defined(HAVE_SYSCONF)
+ long ticks;
+ ticks = sysconf(_SC_CLK_TCK);
+ if ( ticks == -1 ) {
+ errorBelch("sysconf\n");
+ stg_exit(EXIT_FAILURE);
+ }
+ ClockFreq = ticks;
+#elif defined(CLK_TCK) /* defined by POSIX */
+ ClockFreq = CLK_TCK;
+#elif defined(HZ)
+ ClockFreq = HZ;
+#elif defined(CLOCKS_PER_SEC)
+ ClockFreq = CLOCKS_PER_SEC;
+#else
+ errorBelch("can't get clock resolution");
+ stg_exit(EXIT_FAILURE);
+#endif
+ }
+
+ struct tms t;
+ clock_t r = times(&t);
+ *user = (((Ticks)t.tms_utime * TICKS_PER_SECOND) / ClockFreq);
+ *elapsed = (((Ticks)r * TICKS_PER_SECOND) / ClockFreq);
+}
+
+#endif // HAVE_TIMES
+
+Ticks getThreadCPUTime(void)
+{
+#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
+ // clock_gettime() gives us per-thread CPU time. It isn't
+ // reliable on Linux, but it's the best we have.
+ struct timespec ts;
+ clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+ return (ts.tv_sec * TICKS_PER_SECOND +
+ ((Ticks)ts.tv_nsec * TICKS_PER_SECOND) / 1000000000);
+#else
+ return getProcessCPUTime();
+#endif
+}
+
+nat
+getPageFaults(void)
+{
+#if !defined(HAVE_GETRUSAGE) || irix_HOST_OS
+ return 0;
+#else
+ struct rusage t;
+ getrusage(RUSAGE_SELF, &t);
+ return(t.ru_majflt);
+#endif
+}
+
diff --git a/rts/posix/Itimer.c b/rts/posix/Itimer.c
new file mode 100644
index 0000000000..83ed84d6ef
--- /dev/null
+++ b/rts/posix/Itimer.c
@@ -0,0 +1,226 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1995-1999
+ *
+ * Interval timer for profiling and pre-emptive scheduling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/*
+ * The interval timer is used for profiling and for context switching in the
+ * threaded build. Though POSIX 1003.1b includes a standard interface for
+ * such things, no one really seems to be implementing them yet. Even
+ * Solaris 2.3 only seems to provide support for @CLOCK_REAL@, whereas we're
+ * keen on getting access to @CLOCK_VIRTUAL@.
+ *
+ * Hence, we use the old-fashioned @setitimer@ that just about everyone seems
+ * to support. So much for standards.
+ */
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "Timer.h"
+#include "Ticker.h"
+#include "posix/Itimer.h"
+#include "Proftimer.h"
+#include "Schedule.h"
+#include "posix/Select.h"
+
+/* As recommended in the autoconf manual */
+# ifdef TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+# else
+# ifdef HAVE_SYS_TIME_H
+# include <sys/time.h>
+# else
+# include <time.h>
+# endif
+# endif
+
+#ifdef HAVE_SIGNAL_H
+# include <signal.h>
+#endif
+
+/* Major bogosity:
+ *
+ * In the threaded RTS, we can't set the virtual timer because the
+ * thread which has the virtual timer might be sitting waiting for a
+ * capability, and the virtual timer only ticks in CPU time.
+ *
+ * So, possible solutions:
+ *
+ * (1) tick in realtime. Not very good, because this ticker is used for
+ * profiling, and this will give us unreliable time profiling
+ * results. Furthermore, this requires picking a single OS thread
+ * to be the timekeeper, which is a bad idea because the thread in
+ * question might just be making a temporary call into Haskell land.
+ *
+ * (2) save/restore the virtual timer around excursions into STG land.
+ * Sounds great, but I tried it and the resolution of the virtual timer
+ * isn't good enough (on Linux) - most of our excursions fall
+ * within the timer's resolution and we never make any progress.
+ *
+ * (3) have a virtual timer in every OS thread. Might be reasonable,
+ * because most of the time there is only ever one of these
+ * threads running, so it approximates a single virtual timer.
+ * But still quite bogus (and I got crashes when I tried this).
+ *
+ * For now, we're using (1), but this needs a better solution. --SDM
+ */
+#ifdef THREADED_RTS
+#define ITIMER_FLAVOUR ITIMER_REAL
+#define ITIMER_SIGNAL SIGALRM
+#else
+#define ITIMER_FLAVOUR ITIMER_VIRTUAL
+#define ITIMER_SIGNAL SIGVTALRM
+#endif
+
+static
+int
+install_vtalrm_handler(TickProc handle_tick)
+{
+ struct sigaction action;
+
+ action.sa_handler = handle_tick;
+
+ sigemptyset(&action.sa_mask);
+
+#ifdef SA_RESTART
+ // specify SA_RESTART. One consequence if we don't do this is
+ // that readline gets confused by the -threaded RTS. It seems
+ // that if a SIGALRM handler is installed without SA_RESTART,
+ // readline installs its own SIGALRM signal handler (see
+ // readline's signals.c), and this somehow causes readline to go
+ // wrong when the input exceeds a single line (try it).
+ action.sa_flags = SA_RESTART;
+#else
+ action.sa_flags = 0;
+#endif
+
+ return sigaction(ITIMER_SIGNAL, &action, NULL);
+}
+
+int
+startTicker(nat ms, TickProc handle_tick)
+{
+# ifndef HAVE_SETITIMER
+ /* debugBelch("No virtual timer on this system\n"); */
+ return -1;
+# else
+ struct itimerval it;
+
+ install_vtalrm_handler(handle_tick);
+
+#if !defined(THREADED_RTS)
+ timestamp = getourtimeofday();
+#endif
+
+ it.it_value.tv_sec = ms / 1000;
+ it.it_value.tv_usec = 1000 * (ms - (1000 * it.it_value.tv_sec));
+ it.it_interval = it.it_value;
+ return (setitimer(ITIMER_FLAVOUR, &it, NULL));
+# endif
+}
+
+int
+stopTicker()
+{
+# ifndef HAVE_SETITIMER
+ /* debugBelch("No virtual timer on this system\n"); */
+ return -1;
+# else
+ struct itimerval it;
+
+ it.it_value.tv_sec = 0;
+ it.it_value.tv_usec = 0;
+ it.it_interval = it.it_value;
+ return (setitimer(ITIMER_FLAVOUR, &it, NULL));
+# endif
+}
+
+# if 0
+/* This is a potential POSIX version */
+int
+startTicker(nat ms)
+{
+ struct sigevent se;
+ struct itimerspec it;
+ timer_t tid;
+
+#if !defined(THREADED_RTS)
+ timestamp = getourtimeofday();
+#endif
+
+ se.sigev_notify = SIGEV_SIGNAL;
+ se.sigev_signo = ITIMER_SIGNAL;
+ se.sigev_value.sival_int = ITIMER_SIGNAL;
+ if (timer_create(CLOCK_VIRTUAL, &se, &tid)) {
+ barf("can't create virtual timer");
+ }
+ it.it_value.tv_sec = ms / 1000;
+ it.it_value.tv_nsec = 1000000 * (ms - 1000 * it.it_value.tv_sec);
+ it.it_interval = it.it_value;
+ return timer_settime(tid, TIMER_RELTIME, &it, NULL);
+}
+
+int
+stopTicker()
+{
+ struct sigevent se;
+ struct itimerspec it;
+ timer_t tid;
+
+#if !defined(THREADED_RTS)
+ timestamp = getourtimeofday();
+#endif
+
+ se.sigev_notify = SIGEV_SIGNAL;
+ se.sigev_signo = ITIMER_SIGNAL;
+ se.sigev_value.sival_int = ITIMER_SIGNAL;
+ if (timer_create(CLOCK_VIRTUAL, &se, &tid)) {
+ barf("can't create virtual timer");
+ }
+ it.it_value.tv_sec = 0;
+ it.it_value.tv_nsec = 0;
+ it.it_interval = it.it_value;
+ return timer_settime(tid, TIMER_RELTIME, &it, NULL);
+}
+# endif
+
+#if 0
+/* Currently unused */
+void
+block_vtalrm_signal(void)
+{
+ sigset_t signals;
+
+ sigemptyset(&signals);
+ sigaddset(&signals, ITIMER_SIGNAL);
+
+ (void) sigprocmask(SIG_BLOCK, &signals, NULL);
+}
+
+void
+unblock_vtalrm_signal(void)
+{
+ sigset_t signals;
+
+ sigemptyset(&signals);
+ sigaddset(&signals, ITIMER_SIGNAL);
+
+ (void) sigprocmask(SIG_UNBLOCK, &signals, NULL);
+}
+#endif
+
+/* gettimeofday() takes around 1us on our 500MHz PIII. Since we're
+ * only calling it 50 times/s, it shouldn't have any great impact.
+ */
+lnat
+getourtimeofday(void)
+{
+ struct timeval tv;
+ gettimeofday(&tv, (struct timezone *) NULL);
+ // cast to lnat because nat may be 64 bit when int is only 32 bit
+ return ((lnat)tv.tv_sec * TICK_FREQUENCY +
+ (lnat)tv.tv_usec * TICK_FREQUENCY / 1000000);
+}
diff --git a/rts/posix/Itimer.h b/rts/posix/Itimer.h
new file mode 100644
index 0000000000..09d01bde54
--- /dev/null
+++ b/rts/posix/Itimer.h
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2005
+ *
+ * Interval timer for profiling and pre-emptive scheduling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef ITIMER_H
+#define ITIMER_H
+
+extern lnat getourtimeofday ( void );
+#if 0
+/* unused */
+extern void block_vtalrm_signal ( void );
+extern void unblock_vtalrm_signal ( void );
+#endif
+
+#endif /* ITIMER_H */
diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c
new file mode 100644
index 0000000000..07bd762130
--- /dev/null
+++ b/rts/posix/OSThreads.c
@@ -0,0 +1,166 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001-2005
+ *
+ * Accessing OS threads functionality in a (mostly) OS-independent
+ * manner.
+ *
+ * --------------------------------------------------------------------------*/
+
+#if defined(DEBUG) && defined(__linux__)
+/* We want GNU extensions in DEBUG mode for mutex error checking */
+#define _GNU_SOURCE
+#endif
+
+#include "Rts.h"
+#if defined(THREADED_RTS)
+#include "OSThreads.h"
+#include "RtsUtils.h"
+
+#if HAVE_STRING_H
+#include <string.h>
+#endif
+
+#if !defined(HAVE_PTHREAD_H)
+#error pthreads.h is required for the threaded RTS on Posix platforms
+#endif
+
+/*
+ * This (allegedly) OS threads independent layer was initially
+ * abstracted away from code that used Pthreads, so the functions
+ * provided here are mostly just wrappers to the Pthreads API.
+ *
+ */
+
+/* Initialise a condition variable with default attributes. */
+void
+initCondition( Condition* pCond )
+{
+    pthread_cond_init(pCond, NULL);
+    return;
+}
+
+/* Destroy a condition variable; no thread may still be waiting on it. */
+void
+closeCondition( Condition* pCond )
+{
+    pthread_cond_destroy(pCond);
+    return;
+}
+
+/* Wake all threads waiting on the condition; rtsTrue on success. */
+rtsBool
+broadcastCondition ( Condition* pCond )
+{
+    return (pthread_cond_broadcast(pCond) == 0);
+}
+
+/* Wake at least one thread waiting on the condition; rtsTrue on success. */
+rtsBool
+signalCondition ( Condition* pCond )
+{
+    return (pthread_cond_signal(pCond) == 0);
+}
+
+/* Atomically release *pMut and block on *pCond; the mutex is held
+ * again when this returns.  rtsTrue on success. */
+rtsBool
+waitCondition ( Condition* pCond, Mutex* pMut )
+{
+    return (pthread_cond_wait(pCond,pMut) == 0);
+}
+
+/* Offer the remainder of this thread's timeslice to the OS scheduler. */
+void
+yieldThread()
+{
+    sched_yield();
+    return;
+}
+
+/* Terminate the calling OS thread; does not return. */
+void
+shutdownThread()
+{
+    pthread_exit(NULL);
+}
+
+/* Create a new OS thread running startProc(param), storing its id in
+ * *pId.  The thread is detached so its resources are reclaimed
+ * automatically when it exits.  Returns 0 on success, otherwise the
+ * pthread_create error code. */
+int
+createOSThread (OSThreadId* pId, OSThreadProc *startProc, void *param)
+{
+    int result = pthread_create(pId, NULL, (void *(*)(void *))startProc, param);
+    if(!result)
+        pthread_detach(*pId);
+    return result;
+}
+
+/* Identifier of the calling OS thread. */
+OSThreadId
+osThreadId()
+{
+    return pthread_self();
+}
+
+/* Initialise a mutex.  In DEBUG builds on Linux we ask for an
+ * error-checking mutex so misuse (e.g. relocking by the owner) is
+ * reported instead of deadlocking.
+ * NOTE(review): this guard tests linux_HOST_OS while the _GNU_SOURCE
+ * definition at the top of the file tests __linux__ — confirm both
+ * fire together, otherwise PTHREAD_MUTEX_ERRORCHECK_NP may be used
+ * without _GNU_SOURCE being defined. */
+void
+initMutex(Mutex* pMut)
+{
+#if defined(DEBUG) && defined(linux_HOST_OS)
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_ERRORCHECK_NP);
+    pthread_mutex_init(pMut,&attr);
+#else
+    pthread_mutex_init(pMut,NULL);
+#endif
+    return;
+}
+
+/* Allocate a fresh thread-local storage key; barfs on failure. */
+void
+newThreadLocalKey (ThreadLocalKey *key)
+{
+    int r;
+    if ((r = pthread_key_create(key, NULL)) != 0) {
+        barf("newThreadLocalKey: %s", strerror(r));
+    }
+}
+
+/* Read the calling thread's value for the given key (NULL if unset). */
+void *
+getThreadLocalVar (ThreadLocalKey *key)
+{
+    return pthread_getspecific(*key);
+    // Note: a return value of NULL can indicate that either the key
+    // is not valid, or the key is valid and the data value has not
+    // yet been set.  We need to use the latter case, so we cannot
+    // detect errors here.
+}
+
+/* Store 'value' under 'key' for the calling thread; barfs on failure. */
+void
+setThreadLocalVar (ThreadLocalKey *key, void *value)
+{
+    int r;
+    if ((r = pthread_setspecific(*key,value)) != 0) {
+        barf("setThreadLocalVar: %s", strerror(r));
+    }
+}
+
+/* Thread entry point used by forkOS_createThread: acquires a
+ * Capability, evaluates the Haskell IO action referenced by the
+ * stable pointer 'entry', and releases the Capability again. */
+static void *
+forkOS_createThreadWrapper ( void * entry )
+{
+    Capability *cap;
+    cap = rts_lock();
+    cap = rts_evalStableIO(cap, (HsStablePtr) entry, NULL);
+    rts_unlock(cap);
+    return NULL;
+}
+
+/* Spawn a detached OS thread to run the Haskell computation referenced
+ * by 'entry'.  Returns 0 on success, otherwise the pthread_create
+ * error code. */
+int
+forkOS_createThread ( HsStablePtr entry )
+{
+    pthread_t tid;
+    int result = pthread_create(&tid, NULL,
+                                forkOS_createThreadWrapper, (void*)entry);
+    if(!result)
+        pthread_detach(tid);
+    return result;
+}
+
+#else /* !defined(THREADED_RTS) */
+
+/* forkOS is not supported by the non-threaded RTS: always fails. */
+int
+forkOS_createThread ( HsStablePtr entry STG_UNUSED )
+{
+    return -1;
+}
+
+#endif /* !defined(THREADED_RTS) */
diff --git a/rts/posix/Select.c b/rts/posix/Select.c
new file mode 100644
index 0000000000..e21ced03ab
--- /dev/null
+++ b/rts/posix/Select.c
@@ -0,0 +1,279 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1995-2002
+ *
+ * Support for concurrent non-blocking I/O and thread waiting.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/* we're outside the realms of POSIX here... */
+/* #include "PosixSource.h" */
+
+#include "Rts.h"
+#include "Schedule.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "Timer.h"
+#include "Itimer.h"
+#include "Signals.h"
+#include "Capability.h"
+#include "posix/Select.h"
+
+# ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+# endif
+
+# ifdef HAVE_SYS_TIME_H
+# include <sys/time.h>
+# endif
+
+#include <errno.h>
+#include <string.h>
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#if !defined(THREADED_RTS)
+/* last timestamp */
+lnat timestamp = 0;
+
+/*
+ * The threaded RTS uses an IO-manager thread in Haskell instead (see GHC.Conc)
+ */
+
+/* There's a clever trick here to avoid problems when the time wraps
+ * around. Since our maximum delay is smaller than 31 bits of ticks
+ * (it's actually 31 bits of microseconds), we can safely check
+ * whether a timer has expired even if our timer will wrap around
+ * before the target is reached, using the following formula:
+ *
+ * (int)((uint)current_time - (uint)target_time) < 0
+ *
+ * if this is true, then our time has expired.
+ * (idea due to Andy Gill).
+ */
+/* Move every thread at the front of sleeping_queue whose wake-up
+ * target has passed (using the wrap-safe comparison described above)
+ * onto the main run queue.  The queue is kept sorted by wake-up time,
+ * so we can stop at the first thread that is still sleeping.
+ * Returns rtsTrue iff at least one thread was woken. */
+static rtsBool
+wakeUpSleepingThreads(lnat ticks)
+{
+    StgTSO *tso;
+    rtsBool flag = rtsFalse;
+
+    while (sleeping_queue != END_TSO_QUEUE &&
+           (int)(ticks - sleeping_queue->block_info.target) > 0) {
+        tso = sleeping_queue;
+        sleeping_queue = tso->link;
+        tso->why_blocked = NotBlocked;
+        tso->link = END_TSO_QUEUE;
+        IF_DEBUG(scheduler,debugBelch("Waking up sleeping thread %d\n", tso->id));
+        // MainCapability: this code is !THREADED_RTS
+        pushOnRunQueue(&MainCapability,tso);
+        flag = rtsTrue;
+    }
+    return flag;
+}
+
+/* Argument 'wait' says whether to wait for I/O to become available,
+ * or whether to just check and return immediately. If there are
+ * other threads ready to run, we normally do the non-waiting variety,
+ * otherwise we wait (see Schedule.c).
+ *
+ * SMP note: must be called with sched_mutex locked.
+ *
+ * Windows: select only works on sockets, so this doesn't really work,
+ * though it makes things better than before. MsgWaitForMultipleObjects
+ * should really be used, though it only seems to work for read handles,
+ * not write handles.
+ *
+ */
+void
+awaitEvent(rtsBool wait)
+{
+    StgTSO *tso, *prev, *next;
+    rtsBool ready;
+    fd_set rfd,wfd;
+    int numFound;
+    int maxfd = -1;
+    rtsBool select_succeeded = rtsTrue;
+    rtsBool unblock_all = rtsFalse;
+    struct timeval tv;
+    lnat min, ticks;
+
+    tv.tv_sec = 0;
+    tv.tv_usec = 0;
+
+    IF_DEBUG(scheduler,
+             debugBelch("scheduler: checking for threads blocked on I/O");
+             if (wait) {
+                 debugBelch(" (waiting)");
+             }
+             debugBelch("\n");
+             );
+
+    /* loop until we've woken up some threads.  This loop is needed
+     * because the select timing isn't accurate, we sometimes sleep
+     * for a while but not long enough to wake up a thread in
+     * a threadDelay.
+     */
+    do {
+
+        ticks = timestamp = getourtimeofday();
+        if (wakeUpSleepingThreads(ticks)) {
+            return;
+        }
+
+        /* Decide how long select() may block: 0 when just polling;
+         * otherwise the time (in microseconds) until the earliest
+         * sleeper should wake; or effectively forever when nothing
+         * is sleeping. */
+        if (!wait) {
+            min = 0;
+        } else if (sleeping_queue != END_TSO_QUEUE) {
+            min = (sleeping_queue->block_info.target - ticks)
+                * TICK_MILLISECS * 1000;
+        } else {
+            min = 0x7ffffff;
+        }
+
+        /*
+         * Collect all of the fd's that we're interested in
+         */
+        FD_ZERO(&rfd);
+        FD_ZERO(&wfd);
+
+        for(tso = blocked_queue_hd; tso != END_TSO_QUEUE; tso = next) {
+            next = tso->link;
+
+            switch (tso->why_blocked) {
+            case BlockedOnRead:
+            {
+                int fd = tso->block_info.fd;
+                if (fd >= FD_SETSIZE) {
+                    barf("awaitEvent: descriptor out of range");
+                }
+                maxfd = (fd > maxfd) ? fd : maxfd;
+                FD_SET(fd, &rfd);
+                continue;
+            }
+
+            case BlockedOnWrite:
+            {
+                int fd = tso->block_info.fd;
+                if (fd >= FD_SETSIZE) {
+                    barf("awaitEvent: descriptor out of range");
+                }
+                maxfd = (fd > maxfd) ? fd : maxfd;
+                FD_SET(fd, &wfd);
+                continue;
+            }
+
+            default:
+                barf("AwaitEvent");
+            }
+        }
+
+        /* Check for any interesting events */
+
+        tv.tv_sec = min / 1000000;
+        tv.tv_usec = min % 1000000;
+
+        /* Retry select() while it is interrupted by signals (EINTR);
+         * any other failure is either EBADF (handled by unblocking
+         * everybody, see below) or fatal. */
+        while ((numFound = select(maxfd+1, &rfd, &wfd, NULL, &tv)) < 0) {
+            if (errno != EINTR) {
+                /* Handle bad file descriptors by unblocking all the
+                   waiting threads. Why? Because a thread might have been
+                   a bit naughty and closed a file descriptor while another
+                   was blocked waiting. This is less-than-good programming
+                   practice, but having the RTS as a result fall over isn't
+                   acceptable, so we simply unblock all the waiting threads
+                   should we see a bad file descriptor & give the threads
+                   a chance to clean up their act.
+
+                   Note: assume here that threads becoming unblocked
+                   will try to read/write the file descriptor before trying
+                   to issue a threadWaitRead/threadWaitWrite again (==> an
+                   IOError will result for the thread that's got the bad
+                   file descriptor.) Hence, there's no danger of a bad
+                   file descriptor being repeatedly select()'ed on, so
+                   the RTS won't loop.
+                */
+                if ( errno == EBADF ) {
+                    unblock_all = rtsTrue;
+                    break;
+                } else {
+                    perror("select");
+                    barf("select failed");
+                }
+            }
+
+            /* We got a signal; could be one of ours.  If so, we need
+             * to start up the signal handler straight away, otherwise
+             * we could block for a long time before the signal is
+             * serviced.
+             */
+#if defined(RTS_USER_SIGNALS)
+            if (signals_pending()) {
+                startSignalHandlers(&MainCapability);
+                return; /* still hold the lock */
+            }
+#endif
+
+            /* we were interrupted, return to the scheduler immediately.
+             */
+            if (sched_state >= SCHED_INTERRUPTING) {
+                return; /* still hold the lock */
+            }
+
+            /* check for threads that need waking up
+             */
+            wakeUpSleepingThreads(getourtimeofday());
+
+            /* If new runnable threads have arrived, stop waiting for
+             * I/O and run them.
+             */
+            if (!emptyRunQueue(&MainCapability)) {
+                return; /* still hold the lock */
+            }
+        }
+
+        /* Step through the waiting queue, unblocking every thread that now has
+         * a file descriptor in a ready state.
+         */
+
+        /* Rebuild blocked_queue_hd/tl in place: ready threads go to
+         * the run queue, the rest are relinked with 'prev' tracking
+         * the tail of the kept threads. */
+        prev = NULL;
+        if (select_succeeded || unblock_all) {
+            for(tso = blocked_queue_hd; tso != END_TSO_QUEUE; tso = next) {
+                next = tso->link;
+                switch (tso->why_blocked) {
+                case BlockedOnRead:
+                    ready = unblock_all || FD_ISSET(tso->block_info.fd, &rfd);
+                    break;
+                case BlockedOnWrite:
+                    ready = unblock_all || FD_ISSET(tso->block_info.fd, &wfd);
+                    break;
+                default:
+                    barf("awaitEvent");
+                }
+
+                if (ready) {
+                    IF_DEBUG(scheduler,debugBelch("Waking up blocked thread %d\n", tso->id));
+                    tso->why_blocked = NotBlocked;
+                    tso->link = END_TSO_QUEUE;
+                    pushOnRunQueue(&MainCapability,tso);
+                } else {
+                    if (prev == NULL)
+                        blocked_queue_hd = tso;
+                    else
+                        prev->link = tso;
+                    prev = tso;
+                }
+            }
+
+            if (prev == NULL)
+                blocked_queue_hd = blocked_queue_tl = END_TSO_QUEUE;
+            else {
+                prev->link = END_TSO_QUEUE;
+                blocked_queue_tl = prev;
+            }
+        }
+
+    } while (wait && sched_state == SCHED_RUNNING
+             && emptyRunQueue(&MainCapability));
+}
+
+#endif /* THREADED_RTS */
diff --git a/rts/posix/Select.h b/rts/posix/Select.h
new file mode 100644
index 0000000000..8825562974
--- /dev/null
+++ b/rts/posix/Select.h
@@ -0,0 +1,26 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2005
+ *
+ * Prototypes for functions in Select.c
+ *
+ * -------------------------------------------------------------------------*/
+
+#ifndef SELECT_H
+#define SELECT_H
+
+#if !defined(THREADED_RTS)
+/* In Select.c */
+extern lnat RTS_VAR(timestamp);
+
+/* awaitEvent(rtsBool wait)
+ *
+ * Checks for blocked threads that need to be woken.
+ *
+ * Called from STG : NO
+ * Locks assumed : sched_mutex
+ */
+void awaitEvent(rtsBool wait); /* In Select.c */
+#endif
+
+#endif /* SELECT_H */
diff --git a/rts/posix/Signals.c b/rts/posix/Signals.c
new file mode 100644
index 0000000000..5f5f77fd39
--- /dev/null
+++ b/rts/posix/Signals.c
@@ -0,0 +1,510 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Signal processing / handling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/* This is non-Posix-compliant.
+ #include "PosixSource.h"
+*/
+#include "Rts.h"
+#include "SchedAPI.h"
+#include "Schedule.h"
+#include "RtsSignals.h"
+#include "posix/Signals.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+
+#ifdef alpha_HOST_ARCH
+# if defined(linux_HOST_OS)
+# include <asm/fpu.h>
+# else
+# include <machine/fpu.h>
+# endif
+#endif
+
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif
+
+#ifdef HAVE_SIGNAL_H
+# include <signal.h>
+#endif
+
+#include <stdlib.h>
+
+/* This curious flag is provided for the benefit of the Haskell binding
+ * to POSIX.1 to control whether or not to include SA_NOCLDSTOP when
+ * installing a SIGCHLD handler.
+ */
+StgInt nocldstop = 0;
+
+/* -----------------------------------------------------------------------------
+ * The table of signal handlers
+ * -------------------------------------------------------------------------- */
+
+#if defined(RTS_USER_SIGNALS)
+
+/* SUP: The type of handlers is a little bit, well, doubtful... */
+StgInt *signal_handlers = NULL; /* Dynamically grown array of signal handlers */
+static StgInt nHandlers = 0; /* Size of handlers array */
+
+static nat n_haskell_handlers = 0;
+
+/* -----------------------------------------------------------------------------
+ * Allocate/resize the table of signal handlers.
+ * -------------------------------------------------------------------------- */
+
+/* Ensure signal_handlers[] has a slot for 'sig', (re)allocating the
+ * table and initialising any new slots to STG_SIG_DFL. */
+static void
+more_handlers(I_ sig)
+{
+    StgInt i;
+
+    if (sig < nHandlers)
+        return;
+
+    if (signal_handlers == NULL)
+        signal_handlers = (StgInt *)stgMallocBytes((sig + 1) * sizeof(StgInt), "more_handlers");
+    else
+        signal_handlers = (StgInt *)stgReallocBytes(signal_handlers, (sig + 1) * sizeof(StgInt), "more_handlers");
+
+    for(i = nHandlers; i <= sig; i++)
+        // Fill in the new slots with default actions
+        signal_handlers[i] = STG_SIG_DFL;
+
+    nHandlers = sig + 1;
+}
+
+/* -----------------------------------------------------------------------------
+ * Pending Handlers
+ *
+ * The mechanism for starting handlers differs between the threaded
+ * (THREADED_RTS) and non-threaded versions of the RTS.
+ *
+ * When the RTS is single-threaded, we just write the pending signal
+ * handlers into a buffer, and start a thread for each one in the
+ * scheduler loop.
+ *
+ * When THREADED_RTS, the problem is that signals might be
+ * delivered to multiple threads, so we would need to synchronise
+ * access to pending_handler_buf somehow. Using thread
+ * synchronisation from a signal handler isn't possible in general
+ * (some OSs support it, eg. MacOS X, but not all). So instead:
+ *
+ * - the signal handler writes the signal number into the pipe
+ * managed by the IO manager thread (see GHC.Conc).
+ * - the IO manager picks up the signal number and calls
+ * startSignalHandler() to start the thread.
+ *
+ * This also has the nice property that we don't need to arrange to
+ * wake up a worker task to start the signal handler: the IO manager
+ * wakes up when we write into the pipe.
+ *
+ * -------------------------------------------------------------------------- */
+
+// Here's the pipe into which we will send our signals
+static int io_manager_pipe = -1;
+
+/* Record the write end of the IO manager's pipe, to which
+ * generic_handler() forwards signal numbers (THREADED_RTS). */
+void
+setIOManagerPipe (int fd)
+{
+    // only called when THREADED_RTS, but unconditionally
+    // compiled here because GHC.Conc depends on it.
+    io_manager_pipe = fd;
+}
+
+#if !defined(THREADED_RTS)
+
+#define N_PENDING_HANDLERS 16
+
+StgPtr pending_handler_buf[N_PENDING_HANDLERS];
+StgPtr *next_pending_handler = pending_handler_buf;
+
+#endif /* THREADED_RTS */
+
+/* -----------------------------------------------------------------------------
+ * SIGCONT handler
+ *
+ * It seems that shells tend to put stdin back into blocking mode
+ * following a suspend/resume of the process. Here we arrange to put
+ * it back into non-blocking mode. We don't do anything to
+ * stdout/stderr because these handles don't get put into non-blocking
+ * mode at all - see the comments on stdout/stderr in PrelHandle.hsc.
+ * -------------------------------------------------------------------------- */
+
+/* SIGCONT handler: restore stdin to non-blocking mode after a
+ * suspend/resume (see the comment block above). */
+static void
+cont_handler(int sig STG_UNUSED)
+{
+    setNonBlockingFd(0);
+}
+
+/* -----------------------------------------------------------------------------
+ * Low-level signal handler
+ *
+ * Places the requested handler on a stack of pending handlers to be
+ * started up at the next context switch.
+ * -------------------------------------------------------------------------- */
+
+/* Generic handler installed for every signal that has a Haskell
+ * handler.  THREADED_RTS: forward the signal number down the IO
+ * manager's pipe (see "Pending Handlers" above).  Non-threaded: push
+ * the handler's stable pointer onto the pending-handler stack for the
+ * scheduler to start later.  Either way, request a context switch so
+ * the handler is serviced promptly. */
+static void
+generic_handler(int sig)
+{
+    sigset_t signals;
+
+#if defined(THREADED_RTS)
+
+    if (io_manager_pipe != -1)
+    {
+        // Write the signal number into the pipe as a single byte.  We
+        // hope that signals fit into a byte...
+        StgWord8 csig = (StgWord8)sig;
+        write(io_manager_pipe, &csig, 1);
+    }
+    // If the IO manager hasn't told us what the FD of the write end
+    // of its pipe is, there's not much we can do here, so just ignore
+    // the signal..
+
+#else /* not THREADED_RTS */
+
+    /* Can't call allocate from here.  Probably can't call malloc
+       either.  However, we have to schedule a new thread somehow.
+
+       It's probably ok to request a context switch and allow the
+       scheduler to  start the handler thread, but how do we
+       communicate this to the scheduler?
+
+       We need some kind of locking, but with low overhead (i.e. no
+       blocking signals every time around the scheduler).
+
+       Signal Handlers are atomic (i.e. they can't be interrupted), and
+       we can make use of this.  We just need to make sure the
+       critical section of the scheduler can't be interrupted - the
+       only way to do this is to block signals.  However, we can lower
+       the overhead by only blocking signals when there are any
+       handlers to run, i.e. the set of pending handlers is
+       non-empty.
+    */
+
+    /* We use a stack to store the pending signals.  We can't
+       dynamically grow this since we can't allocate any memory from
+       within a signal handler.
+
+       Hence unfortunately we have to bomb out if the buffer
+       overflows.  It might be acceptable to carry on in certain
+       circumstances, depending on the signal.
+    */
+
+    *next_pending_handler++ = deRefStablePtr((StgStablePtr)signal_handlers[sig]);
+
+    // stack full?
+    if (next_pending_handler == &pending_handler_buf[N_PENDING_HANDLERS]) {
+        errorBelch("too many pending signals");
+        stg_exit(EXIT_FAILURE);
+    }
+
+#endif /* THREADED_RTS */
+
+    // re-establish the signal handler, and carry on
+    sigemptyset(&signals);
+    sigaddset(&signals, sig);
+    sigprocmask(SIG_UNBLOCK, &signals, NULL);
+
+    // *always* do the SIGCONT handler, even if the user overrides it.
+    if (sig == SIGCONT) {
+        cont_handler(sig);
+    }
+
+    context_switch = 1;
+}
+
+/* -----------------------------------------------------------------------------
+ * Blocking/Unblocking of the user signals
+ * -------------------------------------------------------------------------- */
+
+static sigset_t userSignals;
+static sigset_t savedSignals;
+
+/* Reset the set of Haskell-handled ("user") signals to empty. */
+void
+initUserSignals(void)
+{
+    sigemptyset(&userSignals);
+}
+
+/* Block all user signals, saving the previous mask in savedSignals
+ * for unblockUserSignals() to restore. */
+void
+blockUserSignals(void)
+{
+    sigprocmask(SIG_BLOCK, &userSignals, &savedSignals);
+}
+
+/* Restore the signal mask saved by the last blockUserSignals(). */
+void
+unblockUserSignals(void)
+{
+    sigprocmask(SIG_SETMASK, &savedSignals, NULL);
+}
+
+/* rtsTrue iff any Haskell signal handlers are currently installed. */
+rtsBool
+anyUserHandlers(void)
+{
+    return n_haskell_handlers != 0;
+}
+
+#if !defined(THREADED_RTS)
+/* Sleep until a user signal arrives (or the RTS starts shutting
+ * down); used when the run queue is empty and only signal handlers
+ * can produce new work. */
+void
+awaitUserSignals(void)
+{
+    while (!signals_pending() && sched_state == SCHED_RUNNING) {
+        pause();
+    }
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Install a Haskell signal handler.
+ * -------------------------------------------------------------------------- */
+
+/* Install (or remove) a Haskell handler for 'sig'.
+ *
+ * spi      one of STG_SIG_{IGN,DFL,HAN,RST}; for HAN/RST, *handler is
+ *          a stable pointer to the Haskell handler closure.
+ * mask     optional sigset_t to use as sa_mask while the handler runs.
+ *
+ * Returns the previous disposition (STG_SIG_DFL/IGN), or STG_SIG_HAN
+ * with the previous handler written back through *handler, or
+ * STG_SIG_ERR on failure. */
+int
+stg_sig_install(int sig, int spi, StgStablePtr *handler, void *mask)
+{
+    sigset_t signals, osignals;
+    struct sigaction action;
+    StgInt previous_spi;
+
+    // Block the signal until we figure out what to do
+    // Count on this to fail if the signal number is invalid
+    if (sig < 0 || sigemptyset(&signals) ||
+        sigaddset(&signals, sig) || sigprocmask(SIG_BLOCK, &signals, &osignals)) {
+        return STG_SIG_ERR;
+    }
+
+    more_handlers(sig);
+
+    previous_spi = signal_handlers[sig];
+
+    action.sa_flags = 0;
+
+    switch(spi) {
+    case STG_SIG_IGN:
+        signal_handlers[sig] = STG_SIG_IGN;
+        sigdelset(&userSignals, sig);
+        action.sa_handler = SIG_IGN;
+        break;
+
+    case STG_SIG_DFL:
+        signal_handlers[sig] = STG_SIG_DFL;
+        sigdelset(&userSignals, sig);
+        action.sa_handler = SIG_DFL;
+        break;
+
+    case STG_SIG_HAN:
+    case STG_SIG_RST:
+        // SA_RESETHAND makes the OS revert to SIG_DFL after one delivery
+        signal_handlers[sig] = (StgInt)*handler;
+        sigaddset(&userSignals, sig);
+        action.sa_handler = generic_handler;
+        if (spi == STG_SIG_RST) {
+            action.sa_flags = SA_RESETHAND;
+        }
+        // NOTE(review): incremented even when replacing an existing
+        // Haskell handler for this signal — confirm the count cannot
+        // drift relative to anyUserHandlers()'s expectations.
+        n_haskell_handlers++;
+        break;
+
+    default:
+        barf("stg_sig_install: bad spi");
+    }
+
+    if (mask != NULL)
+        action.sa_mask = *(sigset_t *)mask;
+    else
+        sigemptyset(&action.sa_mask);
+
+    action.sa_flags |= sig == SIGCHLD && nocldstop ? SA_NOCLDSTOP : 0;
+
+    if (sigaction(sig, &action, NULL) ||
+        sigprocmask(SIG_SETMASK, &osignals, NULL))
+    {
+        // need to return an error code, so avoid a stable pointer leak
+        // by freeing the previous handler if there was one.
+        if (previous_spi >= 0) {
+            freeStablePtr(stgCast(StgStablePtr,signal_handlers[sig]));
+            n_haskell_handlers--;
+        }
+        return STG_SIG_ERR;
+    }
+
+    if (previous_spi == STG_SIG_DFL || previous_spi == STG_SIG_IGN
+        || previous_spi == STG_SIG_ERR) {
+        return previous_spi;
+    } else {
+        // previous disposition was a Haskell handler: hand its stable
+        // pointer back to the caller, who now owns it.
+        *handler = (StgStablePtr)previous_spi;
+        return STG_SIG_HAN;
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ * Creating new threads for signal handlers.
+ * -------------------------------------------------------------------------- */
+
+#if !defined(THREADED_RTS)
+/* Drain the pending-handler stack, creating one Haskell IO thread per
+ * pending handler closure.  User signals are blocked while the stack
+ * is manipulated to avoid racing with generic_handler(). */
+void
+startSignalHandlers(Capability *cap)
+{
+    blockUserSignals();
+
+    while (next_pending_handler != pending_handler_buf) {
+
+        next_pending_handler--;
+
+        scheduleThread (cap,
+                        createIOThread(cap,
+                                       RtsFlags.GcFlags.initialStkSize,
+                                       (StgClosure *) *next_pending_handler));
+    }
+
+    unblockUserSignals();
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Mark signal handlers during GC.
+ *
+ * We do this rather than trying to start all the signal handlers
+ * prior to GC, because that requires extra heap for the new threads.
+ * Signals must be blocked (see blockUserSignals() above) during GC to
+ * avoid race conditions.
+ * -------------------------------------------------------------------------- */
+
+#if !defined(THREADED_RTS)
+void
+markSignalHandlers (evac_fn evac)
+{
+    StgPtr *p;
+
+    /* evacuate each pending handler closure so the GC keeps it alive */
+    p = next_pending_handler;
+    while (p != pending_handler_buf) {
+        p--;
+        evac((StgClosure **)p);
+    }
+}
+#else
+/* THREADED_RTS: pending handlers live with the IO manager, nothing to mark. */
+void
+markSignalHandlers (evac_fn evac STG_UNUSED)
+{
+}
+#endif
+
+#else /* !RTS_USER_SIGNALS */
+/* Stub for platforms without RTS user-signal support: pretend the
+ * previous disposition was the default. */
+StgInt
+stg_sig_install(StgInt sig STG_UNUSED,
+                StgInt spi STG_UNUSED,
+                StgStablePtr* handler STG_UNUSED,
+                void* mask STG_UNUSED)
+{
+    //barf("User signals not supported");
+    return STG_SIG_DFL;
+}
+
+#endif
+
+#if defined(RTS_USER_SIGNALS)
+/* -----------------------------------------------------------------------------
+ * SIGINT handler.
+ *
+ * We like to shutdown nicely after receiving a SIGINT, write out the
+ * stats, write profiling info, close open files and flush buffers etc.
+ * -------------------------------------------------------------------------- */
+#ifdef SMP
+pthread_t startup_guy;
+#endif
+
+/* SIGINT handler: the first ^C requests an orderly shutdown via
+ * interruptStgRts(); a second ^C while already interrupting kills the
+ * process immediately. */
+static void
+shutdown_handler(int sig STG_UNUSED)
+{
+#ifdef SMP
+    // if I'm a worker thread, send this signal to the guy who
+    // originally called startupHaskell().  Since we're handling
+    // the signal, it won't be a "send to all threads" type of signal
+    // (according to the POSIX threads spec).
+    if (pthread_self() != startup_guy) {
+        pthread_kill(startup_guy, sig);
+        return;
+    }
+#endif
+
+    // If we're already trying to interrupt the RTS, terminate with
+    // extreme prejudice.  So the first ^C tries to exit the program
+    // cleanly, and the second one just kills it.
+    if (sched_state >= SCHED_INTERRUPTING) {
+        stg_exit(EXIT_INTERRUPTED);
+    } else {
+        interruptStgRts();
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ * Install default signal handlers.
+ *
+ * The RTS installs a default signal handler for catching
+ * SIGINT, so that we can perform an orderly shutdown.
+ *
+ * Haskell code may install their own SIGINT handler, which is
+ * fine, provided they're so kind as to put back the old one
+ * when they de-install.
+ *
+ * In addition to handling SIGINT, the RTS also handles SIGFPE
+ * by ignoring it. Apparently IEEE requires floating-point
+ * exceptions to be ignored by default, but alpha-dec-osf3
+ * doesn't seem to do so.
+ * -------------------------------------------------------------------------- */
+/* Install the RTS's default handlers: shutdown_handler for SIGINT and
+ * cont_handler for SIGCONT (see the comment block above for rationale
+ * and the SIGFPE history). */
+void
+initDefaultHandlers()
+{
+    struct sigaction action,oact;
+
+#ifdef SMP
+    /* remember which thread called startupHaskell(), so worker
+     * threads can forward SIGINT to it (see shutdown_handler) */
+    startup_guy = pthread_self();
+#endif
+
+    // install the SIGINT handler
+    action.sa_handler = shutdown_handler;
+    sigemptyset(&action.sa_mask);
+    action.sa_flags = 0;
+    if (sigaction(SIGINT, &action, &oact) != 0) {
+        errorBelch("warning: failed to install SIGINT handler");
+    }
+
+#if defined(HAVE_SIGINTERRUPT)
+    siginterrupt(SIGINT, 1);    // isn't this the default? --SDM
+#endif
+
+    // install the SIGCONT handler
+    action.sa_handler = cont_handler;
+    sigemptyset(&action.sa_mask);
+    action.sa_flags = 0;
+    if (sigaction(SIGCONT, &action, &oact) != 0) {
+        errorBelch("warning: failed to install SIGCONT handler");
+    }
+
+    // install the SIGFPE handler
+
+    // In addition to handling SIGINT, also handle SIGFPE by ignoring it.
+    // Apparently IEEE requires floating-point exceptions to be ignored by
+    // default, but alpha-dec-osf3 doesn't seem to do so.
+
+    // Commented out by SDM 2/7/2002: this causes an infinite loop on
+    // some architectures when an integer division by zero occurs: we
+    // don't recover from the floating point exception, and the
+    // program just generates another one immediately.
+#if 0
+    action.sa_handler = SIG_IGN;
+    sigemptyset(&action.sa_mask);
+    action.sa_flags = 0;
+    if (sigaction(SIGFPE, &action, &oact) != 0) {
+        errorBelch("warning: failed to install SIGFPE handler");
+    }
+#endif
+
+#ifdef alpha_HOST_ARCH
+    ieee_set_fp_control(0);
+#endif
+}
+
+#endif /* RTS_USER_SIGNALS */
diff --git a/rts/posix/Signals.h b/rts/posix/Signals.h
new file mode 100644
index 0000000000..39477f8c6a
--- /dev/null
+++ b/rts/posix/Signals.h
@@ -0,0 +1,26 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Signal processing / handling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef POSIX_SIGNALS_H
+#define POSIX_SIGNALS_H
+
+extern rtsBool anyUserHandlers(void);
+
+#if !defined(THREADED_RTS)
+
+extern StgPtr pending_handler_buf[];
+extern StgPtr *next_pending_handler;
+#define signals_pending() (next_pending_handler != pending_handler_buf)
+void startSignalHandlers(Capability *cap);
+
+#endif
+
+extern StgInt *signal_handlers;
+
+#endif /* POSIX_SIGNALS_H */
+
diff --git a/rts/win32/AsyncIO.c b/rts/win32/AsyncIO.c
new file mode 100644
index 0000000000..7bcf571cf8
--- /dev/null
+++ b/rts/win32/AsyncIO.c
@@ -0,0 +1,345 @@
+/* AsyncIO.c
+ *
+ * Integrating Win32 asynchronous I/O with the GHC RTS.
+ *
+ * (c) sof, 2002-2003.
+ */
+#include "Rts.h"
+#include "RtsUtils.h"
+#include <windows.h>
+#include <stdio.h>
+#include "Schedule.h"
+#include "RtsFlags.h"
+#include "Capability.h"
+#include "win32/AsyncIO.h"
+#include "win32/IOManager.h"
+
+/*
+ * Overview:
+ *
+ * Haskell code issues asynchronous I/O requests via the
+ * async{Read,Write,DoOp}# primops. These cause addIORequest()
+ * to be invoked, which forwards the request to the underlying
+ * asynchronous I/O subsystem. Each request is tagged with a unique
+ * ID.
+ *
+ * addIORequest() returns this ID, so that when the blocked CH
+ * thread is added onto blocked_queue, its TSO is annotated with
+ * it. Upon completion of an I/O request, the async I/O handling
+ * code makes a back-call to signal its completion; the local
+ * onIOComplete() routine. It adds the IO request ID (along with
+ * its result data) to a queue of completed requests before returning.
+ *
+ * The queue of completed IO request is read by the thread operating
+ * the RTS scheduler. It de-queues the CH threads corresponding
+ * to the request IDs, making them runnable again.
+ *
+ */
+
+/* A completed I/O request: filled in by onIOComplete() on a worker
+ * thread, drained by awaitRequests() on the scheduler thread.
+ */
+typedef struct CompletedReq {
+    unsigned int reqID;    /* unique request ID assigned by the IO manager */
+    int          len;      /* bytes transferred (or op-specific result) */
+    int          errCode;  /* 0 on success; errno/WSAGetLastError otherwise */
+} CompletedReq;
+
+#define MAX_REQUESTS 200
+
+static CRITICAL_SECTION queue_lock;  /* guards completedTable, completed_hw, issued_reqs */
+static HANDLE completed_req_event;   /* manual-reset: "completedTable is non-empty" */
+static HANDLE abandon_req_wait;      /* auto-reset: tells a waiting scheduler to give up */
+static HANDLE wait_handles[2];       /* {completed_req_event, abandon_req_wait} */
+static CompletedReq completedTable[MAX_REQUESTS];
+static int completed_hw;             /* number of entries currently in completedTable */
+static HANDLE completed_table_sema;  /* counts free slots in completedTable */
+static int issued_reqs;              /* requests issued but not yet completed */
+
+/* Completion callback passed to the IOManager (see AddIORequest() etc.
+ * below).  Runs on a worker thread: blocks on completed_table_sema until
+ * a table slot is free, deposits the result of request 'reqID' into
+ * completedTable under queue_lock, and signals completed_req_event when
+ * the table transitions from empty to non-empty.
+ */
+static void
+onIOComplete(unsigned int reqID,
+             int fd STG_UNUSED,
+             int len,
+             void* buf STG_UNUSED,
+             int errCode)
+{
+    DWORD dwRes;
+    /* Deposit result of request in queue/table..when there's room. */
+    dwRes = WaitForSingleObject(completed_table_sema, INFINITE);
+    switch (dwRes) {
+    case WAIT_OBJECT_0:
+        break;
+    default:
+        /* Not likely */
+        fprintf(stderr, "onIOComplete: failed to grab table semaphore, dropping request 0x%x\n", reqID);
+        fflush(stderr);
+        return;
+    }
+    EnterCriticalSection(&queue_lock);
+    if (completed_hw == MAX_REQUESTS) {
+        /* Shouldn't happen -- the semaphore admits at most MAX_REQUESTS depositors */
+        fprintf(stderr, "onIOComplete: ERROR -- Request table overflow (%d); dropping.\n", reqID);
+        fflush(stderr);
+    } else {
+#if 0
+        fprintf(stderr, "onCompl: %d %d %d %d %d\n",
+                reqID, len, errCode, issued_reqs, completed_hw);
+        fflush(stderr);
+#endif
+        completedTable[completed_hw].reqID = reqID;
+        completedTable[completed_hw].len = len;
+        completedTable[completed_hw].errCode = errCode;
+        completed_hw++;
+        issued_reqs--;
+        if (completed_hw == 1) {
+            /* The event is used to wake up the scheduler thread should it
+             * be blocked waiting for requests to complete. The event resets once
+             * that thread has cleared out the request queue/table.
+             */
+            SetEvent(completed_req_event);
+        }
+    }
+    LeaveCriticalSection(&queue_lock);
+}
+
+/* Issue an asynchronous read/write on 'fd' via the IOManager.
+ * Bumps issued_reqs under queue_lock and returns the unique request ID
+ * (the result is later delivered through onIOComplete()).
+ */
+unsigned int
+addIORequest(int fd,
+             int forWriting,
+             int isSock,
+             int len,
+             char* buf)
+{
+    EnterCriticalSection(&queue_lock);
+    issued_reqs++;
+    LeaveCriticalSection(&queue_lock);
+#if 0
+    fprintf(stderr, "addIOReq: %d %d %d\n", fd, forWriting, len); fflush(stderr);
+#endif
+    return AddIORequest(fd,forWriting,isSock,len,buf,onIOComplete);
+}
+
+/* Issue an asynchronous delay (threadDelay) request; returns its request ID.
+ * NOTE(review): the argument is named 'msecs' but IOWorkerProc divides the
+ * delay by 1000 before calling Sleep(), which suggests it is actually in
+ * microseconds -- confirm against the primop caller.
+ */
+unsigned int
+addDelayRequest(int msecs)
+{
+    EnterCriticalSection(&queue_lock);
+    issued_reqs++;
+    LeaveCriticalSection(&queue_lock);
+#if 0
+    fprintf(stderr, "addDelayReq: %d\n", msecs); fflush(stderr);
+#endif
+    return AddDelayRequest(msecs,onIOComplete);
+}
+
+/* Ask a worker thread to run 'proc(param)' on behalf of a Haskell thread
+ * (the asyncDoProc# primop); returns the request ID.
+ */
+unsigned int
+addDoProcRequest(void* proc, void* param)
+{
+    EnterCriticalSection(&queue_lock);
+    issued_reqs++;
+    LeaveCriticalSection(&queue_lock);
+#if 0
+    fprintf(stderr, "addProcReq: %p %p\n", proc, param); fflush(stderr);
+#endif
+    return AddProcRequest(proc,param,onIOComplete);
+}
+
+
+/* Initialise the async I/O subsystem: start the IO manager, create the
+ * synchronisation objects used by onIOComplete()/awaitRequests().
+ * Returns non-zero on success.
+ */
+int
+startupAsyncIO()
+{
+    if (!StartIOManager()) {
+        return 0;
+    }
+    InitializeCriticalSection(&queue_lock);
+    /* Create a pair of events:
+     *
+     *    - completed_req_event  -- signals the deposit of request result; manual reset.
+     *    - abandon_req_wait     -- external OS thread tells current RTS/Scheduler
+     *                              thread to abandon wait for IO request completion.
+     *                              Auto reset.
+     */
+    completed_req_event = CreateEvent (NULL, TRUE,  FALSE, NULL);
+    abandon_req_wait    = CreateEvent (NULL, FALSE, FALSE, NULL);
+    wait_handles[0] = completed_req_event;
+    wait_handles[1] = abandon_req_wait;
+    completed_hw = 0;
+    if ( !(completed_table_sema = CreateSemaphore (NULL, MAX_REQUESTS, MAX_REQUESTS, NULL)) ) {
+        DWORD rc = GetLastError();
+        /* %lx: a DWORD is an unsigned long, so %x was the wrong specifier */
+        fprintf(stderr, "startupAsyncIO: CreateSemaphore failed 0x%lx\n", rc);
+        fflush(stderr);
+    }
+
+    /* CreateEvent()/CreateSemaphore() return NULL on failure, not
+     * INVALID_HANDLE_VALUE, so the success test must compare against NULL.
+     */
+    return ( completed_req_event  != NULL &&
+             abandon_req_wait     != NULL &&
+             completed_table_sema != NULL );
+}
+
+/* Shut down the async I/O subsystem.  Closes both scheduler-side event
+ * handles (previously abandon_req_wait was leaked) before telling the
+ * IOManager to stop.  completed_table_sema and queue_lock are deliberately
+ * left alone: worker threads signalled by ShutdownIOManager() may still
+ * touch them on their way out; they are reclaimed at process exit.
+ */
+void
+shutdownAsyncIO()
+{
+    CloseHandle(completed_req_event);
+    CloseHandle(abandon_req_wait);
+    ShutdownIOManager();
+}
+
+/*
+ * Function: awaitRequests(wait)
+ *
+ * Check for the completion of external IO work requests. Worker
+ * threads signal completion of IO requests by depositing them
+ * in a table (completedTable). awaitRequests() matches up
+ * requests in that table with threads on the blocked_queue,
+ * making the threads whose IO requests have completed runnable
+ * again.
+ *
+ * awaitRequests() is called by the scheduler periodically _or_ if
+ * it is out of work, and needs to wait for the completion of IO
+ * requests to make further progress. In the latter scenario,
+ * awaitRequests() will simply block waiting for worker threads
+ * to complete if the 'completedTable' is empty.
+ */
+/* Returns 1 when at least one completed request was matched up with a
+ * blocked thread (which is then made runnable), 0 when nothing was
+ * delivered (not waiting, told to abandon the wait, or a wait error).
+ */
+int
+awaitRequests(rtsBool wait)
+{
+#ifndef THREADED_RTS
+  // none of this is actually used in the threaded RTS
+
+start:
+#if 0
+    fprintf(stderr, "awaitRequests(): %d %d %d\n", issued_reqs, completed_hw, wait);
+    fflush(stderr);
+#endif
+    EnterCriticalSection(&queue_lock);
+    /* Nothing immediately available & we won't wait */
+    if ((!wait && completed_hw == 0)
+#if 0
+        // If we just return when wait==rtsFalse, we'll go into a busy
+        // wait loop, so I disabled this condition --SDM 18/12/2003
+        (issued_reqs == 0 && completed_hw == 0)
+#endif
+        ) {
+        LeaveCriticalSection(&queue_lock);
+        return 0;
+    }
+    if (completed_hw == 0) {
+        /* empty table, drop lock and wait */
+        LeaveCriticalSection(&queue_lock);
+        if ( wait && sched_state == SCHED_RUNNING ) {
+            DWORD dwRes = WaitForMultipleObjects(2, wait_handles, FALSE, INFINITE);
+            switch (dwRes) {
+            case WAIT_OBJECT_0:
+                /* a request was completed */
+                break;
+            case WAIT_OBJECT_0 + 1:
+            case WAIT_TIMEOUT:
+                /* timeout (unlikely) or told to abandon waiting */
+                return 0;
+            case WAIT_FAILED: {
+                DWORD dw = GetLastError();
+                fprintf(stderr, "awaitRequests: wait failed -- error code: %lu\n", dw); fflush(stderr);
+                return 0;
+            }
+            default:
+                fprintf(stderr, "awaitRequests: unexpected wait return code %lu\n", dwRes); fflush(stderr);
+                return 0;
+            }
+        } else {
+            return 0;
+        }
+        goto start;
+    } else {
+        int i;
+        StgTSO *tso, *prev;
+
+        for (i=0; i < completed_hw; i++) {
+            /* For each of the completed requests, match up their Ids
+             * with those of the threads on the blocked_queue. If the
+             * thread that made the IO request has been subsequently
+             * killed (and removed from blocked_queue), no match will
+             * be found for that request Id.
+             *
+             * i.e., killing a Haskell thread doesn't attempt to cancel
+             * the IO request it is blocked on.
+             *
+             */
+            unsigned int rID = completedTable[i].reqID;
+
+            prev = NULL;
+            for(tso = blocked_queue_hd ; tso != END_TSO_QUEUE; prev = tso, tso = tso->link) {
+
+                switch(tso->why_blocked) {
+                case BlockedOnRead:
+                case BlockedOnWrite:
+                case BlockedOnDoProc:
+                    if (tso->block_info.async_result->reqID == rID) {
+                        /* Found the thread blocked waiting on request; stodgily fill
+                         * in its result block.
+                         */
+                        tso->block_info.async_result->len = completedTable[i].len;
+                        tso->block_info.async_result->errCode = completedTable[i].errCode;
+
+                        /* Drop the matched TSO from blocked_queue */
+                        if (prev) {
+                            prev->link = tso->link;
+                        } else {
+                            blocked_queue_hd = tso->link;
+                        }
+                        if (blocked_queue_tl == tso) {
+                            blocked_queue_tl = prev ? prev : END_TSO_QUEUE;
+                        }
+
+                        /* Terminates the run queue + this inner for-loop. */
+                        tso->link = END_TSO_QUEUE;
+                        tso->why_blocked = NotBlocked;
+                        pushOnRunQueue(&MainCapability, tso);
+                        break;
+                    }
+                    break;
+                default:
+                    if (tso->why_blocked != NotBlocked) {
+                        barf("awaitRequests: odd thread state");
+                    }
+                    break;
+                }
+            }
+            /* Signal that there's completed table slots available */
+            if ( !ReleaseSemaphore(completed_table_sema, 1, NULL) ) {
+                DWORD dw = GetLastError();
+                /* %lx: a DWORD is an unsigned long, so %x was the wrong specifier */
+                fprintf(stderr, "awaitRequests: failed to signal semaphore (error code=0x%lx)\n", dw);
+                fflush(stderr);
+            }
+        }
+        completed_hw = 0;
+        ResetEvent(completed_req_event);
+        LeaveCriticalSection(&queue_lock);
+        return 1;
+    }
+#endif /* !THREADED_RTS */
+    /* Unreachable in the non-threaded RTS (every path above returns or
+     * loops); sole return for THREADED_RTS builds, where the function body
+     * is otherwise empty -- previously a non-void function had no return
+     * statement at all in that configuration. */
+    return 0;
+}
+
+/*
+ * Function: abandonRequestWait()
+ *
+ * Wake up a thread that's blocked waiting for new IO requests
+ * to complete (via awaitRequests().)
+ */
+void
+abandonRequestWait( void )
+{
+    /* the event is auto-reset, but in case there's no thread
+     * already waiting on the event, we want to return it to
+     * a non-signalled state.
+     *
+     * Careful!  There is no synchronisation between
+     * abandonRequestWait and awaitRequest, which means that
+     * abandonRequestWait might be called just before a thread
+     * goes into a wait, and we miss the abandon signal.  So we
+     * must SetEvent() here rather than PulseEvent() to ensure
+     * that the event isn't lost.  We can re-optimise by resetting
+     * the event somewhere safe if we know the event has been
+     * properly serviced (see resetAbandon() below).  --SDM 18/12/2003
+     */
+    SetEvent(abandon_req_wait);  /* wakes WaitForMultipleObjects in awaitRequests() */
+}
+
+/* Return abandon_req_wait to the non-signalled state once the caller
+ * knows any pending abandon has been serviced (see abandonRequestWait()).
+ */
+void
+resetAbandonRequestWait( void )
+{
+    ResetEvent(abandon_req_wait);
+}
+
diff --git a/rts/win32/AsyncIO.h b/rts/win32/AsyncIO.h
new file mode 100644
index 0000000000..2077ea0cf7
--- /dev/null
+++ b/rts/win32/AsyncIO.h
@@ -0,0 +1,25 @@
+/* AsyncIO.h
+ *
+ * Integrating Win32 asynchronous I/O with the GHC RTS.
+ *
+ * (c) sof, 2002-2003.
+ */
+#ifndef __ASYNCHIO_H__
+#define __ASYNCHIO_H__
+extern unsigned int
+addIORequest(int fd,
+ int forWriting,
+ int isSock,
+ int len,
+ char* buf);
+extern unsigned int addDelayRequest(int msecs);
+extern unsigned int addDoProcRequest(void* proc, void* param);
+extern int startupAsyncIO(void);
+extern void shutdownAsyncIO(void);
+
+extern int awaitRequests(rtsBool wait);
+
+extern void abandonRequestWait(void);
+extern void resetAbandonRequestWait(void);
+
+#endif /* __ASYNCHIO_H__ */
diff --git a/rts/win32/AwaitEvent.c b/rts/win32/AwaitEvent.c
new file mode 100644
index 0000000000..43e188fb34
--- /dev/null
+++ b/rts/win32/AwaitEvent.c
@@ -0,0 +1,51 @@
+#if !defined(THREADED_RTS) /* to the end */
+/*
+ * Wait/check for external events. Periodically, the
+ * Scheduler checks for the completion of external operations,
+ * like the expiration of timers, completion of I/O requests
+ * issued by Haskell threads.
+ *
+ * If the Scheduler is otherwise out of work, it'll block
+ * herein waiting for external events to occur.
+ *
+ * This file mirrors the select()-based functionality
+ * for POSIX / Unix platforms in rts/Select.c, but for
+ * Win32.
+ *
+ */
+#include "Rts.h"
+#include "Schedule.h"
+#include "AwaitEvent.h"
+#include <windows.h>
+#include "win32/AsyncIO.h"
+
+// Used to avoid calling abandonRequestWait() if we don't need to.
+// Protected by sched_mutex.
+static nat workerWaitingForRequests = 0;
+
+/* Scheduler entry point: poll (wait==rtsFalse) or block (wait==rtsTrue)
+ * for completed I/O requests, looping until either a request completion
+ * made a thread runnable, we were interrupted, or the run queue is
+ * non-empty.  Win32 counterpart of the select()-based awaitEvent.
+ */
+void
+awaitEvent(rtsBool wait)
+{
+    int ret;
+
+    do {
+        /* Try to de-queue completed IO requests
+         */
+        workerWaitingForRequests = 1;
+        ret = awaitRequests(wait);
+        workerWaitingForRequests = 0;
+        if (!ret) {
+            return; /* still hold the lock */
+        }
+
+        // Return to the scheduler if:
+        //
+        //  - we were interrupted
+        //  - new threads have arrived
+
+    } while (wait
+             && sched_state == SCHED_RUNNING
+             && emptyRunQueue(&MainCapability)
+        );
+}
+#endif
diff --git a/rts/win32/ConsoleHandler.c b/rts/win32/ConsoleHandler.c
new file mode 100644
index 0000000000..d7096db632
--- /dev/null
+++ b/rts/win32/ConsoleHandler.c
@@ -0,0 +1,313 @@
+/*
+ * Console control handler support.
+ *
+ */
+#include "Rts.h"
+#include <windows.h>
+#include "ConsoleHandler.h"
+#include "SchedAPI.h"
+#include "Schedule.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "AsyncIO.h"
+#include "RtsSignals.h"
+
+extern int stg_InstallConsoleEvent(int action, StgStablePtr *handler);
+
+static BOOL WINAPI shutdown_handler(DWORD dwCtrlType);
+static BOOL WINAPI generic_handler(DWORD dwCtrlType);
+
+static rtsBool deliver_event = rtsTrue;
+static StgInt console_handler = STG_SIG_DFL;
+
+static HANDLE hConsoleEvent = INVALID_HANDLE_VALUE;
+
+#define N_PENDING_EVENTS 16
+StgInt stg_pending_events = 0; /* number of undelivered events */
+DWORD stg_pending_buf[N_PENDING_EVENTS]; /* their associated event numbers. */
+
+/*
+ * Function: initUserSignals()
+ *
+ * Initialize the console handling substrate.
+ */
+void
+initUserSignals(void)
+{
+    stg_pending_events = 0;
+    console_handler = STG_SIG_DFL;
+    /* hConsoleEvent is created once and reused across re-initialisations;
+     * it is signalled by rts_ConsoleHandlerDone() to release worker
+     * threads whose stdin reads were aborted by Ctrl+C. */
+    if (hConsoleEvent == INVALID_HANDLE_VALUE) {
+        hConsoleEvent =
+            CreateEvent ( NULL,  /* default security attributes */
+                          TRUE,  /* manual-reset event */
+                          FALSE, /* initially non-signalled */
+                          NULL); /* no name */
+    }
+    return;
+}
+
+/*
+ * Function: shutdown_handler()
+ *
+ * Local function that performs the default handling of Ctrl+C kind
+ * events; gently shutting down the RTS
+ *
+ * To repeat Signals.c remark -- user code may choose to override the
+ * default handler. Which is fine, assuming they put back the default
+ * handler when/if they de-install the custom handler.
+ *
+ */
+static BOOL WINAPI shutdown_handler(DWORD dwCtrlType)
+{
+    /* Returning TRUE tells Windows the event was handled; FALSE passes
+     * it on to the next handler in the chain. */
+    switch (dwCtrlType) {
+
+    case CTRL_CLOSE_EVENT:
+        /* see generic_handler() comment re: this event */
+        return FALSE;
+    case CTRL_C_EVENT:
+    case CTRL_BREAK_EVENT:
+
+        // If we're already trying to interrupt the RTS, terminate with
+        // extreme prejudice.  So the first ^C tries to exit the program
+        // cleanly, and the second one just kills it.
+        if (sched_state >= SCHED_INTERRUPTING) {
+            stg_exit(EXIT_INTERRUPTED);
+        } else {
+            interruptStgRts();
+            /* Cheesy pulsing of an event to wake up a waiting RTS thread, if any */
+            abandonRequestWait();
+            resetAbandonRequestWait();
+        }
+        return TRUE;
+
+        /* shutdown + logoff events are not handled here. */
+    default:
+        return FALSE;
+    }
+}
+
+
+/*
+ * Function: initDefaultHandlers()
+ *
+ * Install any default signal/console handlers. Currently we install a
+ * Ctrl+C handler that shuts down the RTS in an orderly manner.
+ */
+/* Install the default Ctrl+C/Ctrl+Break handler (shutdown_handler above),
+ * which shuts the RTS down in an orderly manner. */
+void initDefaultHandlers(void)
+{
+    if ( !SetConsoleCtrlHandler(shutdown_handler, TRUE) ) {
+        errorBelch("warning: failed to install default console handler");
+    }
+}
+
+
+/*
+ * Function: blockUserSignals()
+ *
+ * Temporarily block the delivery of further console events. Needed to
+ * avoid race conditions when GCing the stack of outstanding handlers or
+ * when emptying the stack by running the handlers.
+ *
+ */
+void
+blockUserSignals(void)
+{
+    /* generic_handler() checks this flag and drops events while it is
+     * rtsFalse; no lock needed for a single flag write. */
+    deliver_event = rtsFalse;
+}
+
+
+/*
+ * Function: unblockUserSignals()
+ *
+ * The inverse of blockUserSignals(); re-enable the deliver of console events.
+ */
+void
+unblockUserSignals(void)
+{
+    /* re-enable delivery previously disabled by blockUserSignals() */
+    deliver_event = rtsTrue;
+}
+
+
+/*
+ * Function: awaitUserSignals()
+ *
+ * Wait for the next console event. Currently a NOP (returns immediately.)
+ */
+/* Deliberate no-op on Win32: console events are pushed at us by the OS
+ * on a separate thread, so there is nothing to wait for here. */
+void awaitUserSignals(void)
+{
+    return;
+}
+
+
+/*
+ * Function: startSignalHandlers()
+ *
+ * Run the handlers associated with the stacked up console events. Console
+ * event delivery is blocked for the duration of this call.
+ */
+void startSignalHandlers(Capability *cap)
+{
+    StgStablePtr handler;
+
+    /* console_handler < 0 means STG_SIG_DFL/STG_SIG_IGN: no Haskell handler */
+    if (console_handler < 0) {
+        return;
+    }
+
+    /* Block event delivery so generic_handler() cannot mutate
+     * stg_pending_events/stg_pending_buf underneath us. */
+    blockUserSignals();
+    ACQUIRE_LOCK(&sched_mutex);
+
+    /* console_handler holds a StgStablePtr smuggled through an StgInt */
+    handler = deRefStablePtr((StgStablePtr)console_handler);
+    /* Spawn one IO thread per queued event, applying the Haskell handler
+     * to the event code. */
+    while (stg_pending_events > 0) {
+        stg_pending_events--;
+        scheduleThread(cap,
+            createIOThread(cap,
+                           RtsFlags.GcFlags.initialStkSize,
+                           rts_apply(cap,
+                                     (StgClosure *)handler,
+                                     rts_mkInt(cap,
+                                               stg_pending_buf[stg_pending_events]))));
+    }
+
+    RELEASE_LOCK(&sched_mutex);
+    unblockUserSignals();
+}
+
+/*
+ * Function: markSignalHandlers()
+ *
+ * Evacuate the handler stack. _Assumes_ that console event delivery
+ * has already been blocked.
+ */
+void markSignalHandlers (evac_fn evac)
+{
+    if (console_handler >= 0) {
+        /* NOTE(review): evac updates the local copy 'p' only and the result
+         * is discarded; this presumably relies on the stable-pointer table
+         * itself being a GC root that keeps the handler closure alive --
+         * confirm. */
+        StgPtr p = deRefStablePtr((StgStablePtr)console_handler);
+        evac((StgClosure**)(void *)&p);
+    }
+}
+
+
+/*
+ * Function: generic_handler()
+ *
+ * Local function which handles incoming console event (done in a sep OS thread),
+ * recording the event in stg_pending_events.
+ */
+/* Runs on an OS thread created by Windows for the event.  Records the
+ * event in stg_pending_buf (unless delivery is blocked) and pulses
+ * abandon_req_wait to wake a waiting scheduler thread.
+ */
+static BOOL WINAPI generic_handler(DWORD dwCtrlType)
+{
+    BOOL rc;
+
+    ACQUIRE_LOCK(&sched_mutex);
+
+    /* Ultra-simple -- up the counter + signal a switch. */
+    switch(dwCtrlType) {
+    case CTRL_CLOSE_EVENT:
+        /* Don't support the delivery of this event; if we
+         * indicate that we've handled it here and the Haskell handler
+         * doesn't take proper action (e.g., terminate the OS process),
+         * the user of the app will be unable to kill/close it. Not
+         * good, so disable the delivery for now.
+         */
+        rc = FALSE;
+        break;
+    default:
+        if (!deliver_event) {
+            rc = TRUE;
+            break;
+        }
+
+        if ( stg_pending_events < N_PENDING_EVENTS ) {
+            stg_pending_buf[stg_pending_events] = dwCtrlType;
+            stg_pending_events++;
+        }
+        /* Cheesy pulsing of an event to wake up a waiting RTS thread, if any */
+        abandonRequestWait();
+        resetAbandonRequestWait();
+        rc = TRUE;
+        break;
+    }
+
+    /* Previously every switch arm returned before the trailing
+     * RELEASE_LOCK, so the handler thread exited still holding
+     * sched_mutex (a deadlock in THREADED_RTS builds).  Release it on
+     * every path before returning. */
+    RELEASE_LOCK(&sched_mutex);
+    return rc;
+}
+
+
+/*
+ * Function: rts_InstallConsoleEvent()
+ *
+ * Install/remove a console event handler.
+ */
+/* Install/remove a console event handler on behalf of Haskell code.
+ * Returns the previous disposition: STG_SIG_DFL, STG_SIG_IGN, or
+ * STG_SIG_HAN (in which case *handler is overwritten with the previous
+ * stable pointer, transferring ownership back to the caller).
+ */
+int
+rts_InstallConsoleEvent(int action, StgStablePtr *handler)
+{
+    StgInt previous_hdlr = console_handler;
+
+    switch (action) {
+    case STG_SIG_IGN:
+        console_handler = STG_SIG_IGN;
+        if ( !SetConsoleCtrlHandler(NULL, TRUE) ) {
+            errorBelch("warning: unable to ignore console events");
+        }
+        break;
+    case STG_SIG_DFL:
+        /* NOTE(review): this records STG_SIG_IGN even though the action is
+         * STG_SIG_DFL, so a subsequent install reports the previous
+         * disposition as IGN rather than DFL -- confirm this is intended. */
+        console_handler = STG_SIG_IGN;
+        if ( !SetConsoleCtrlHandler(NULL, FALSE) ) {
+            errorBelch("warning: unable to restore default console event handling");
+        }
+        break;
+    case STG_SIG_HAN:
+        /* stash the stable pointer in the StgInt-typed console_handler */
+        console_handler = (StgInt)*handler;
+        if ( previous_hdlr < 0 ) {
+            /* Only install generic_handler() once */
+            if ( !SetConsoleCtrlHandler(generic_handler, TRUE) ) {
+                errorBelch("warning: unable to install console event handler");
+            }
+        }
+        break;
+    }
+
+    if (previous_hdlr == STG_SIG_DFL ||
+        previous_hdlr == STG_SIG_IGN) {
+        return previous_hdlr;
+    } else {
+        *handler = (StgStablePtr)previous_hdlr;
+        return STG_SIG_HAN;
+    }
+}
+
+/*
+ * Function: rts_HandledConsoleEvent()
+ *
+ * Signal that a Haskell console event handler has completed its run.
+ * The explicit notification that a Haskell handler has completed is
+ * required to better handle the delivery of Ctrl-C/Break events whilst
+ * an async worker thread is handling a read request on stdin. The
+ * Win32 console implementation will abort such a read request when Ctrl-C
+ * is delivered. That leaves the worker thread in a bind: should it
+ * abandon the request (the Haskell thread reading from stdin has been
+ * thrown an exception to signal the delivery of Ctrl-C & hence have
+ * aborted the I/O request) or simply ignore the aborted read and retry?
+ * (the Haskell thread reading from stdin isn't concerned with the
+ * delivery and handling of Ctrl-C.) With both scenarios being
+ * possible, the worker thread needs to be told -- that is, did the
+ * console event handler cause the IO request to be abandoned?
+ *
+ */
+void
+rts_ConsoleHandlerDone(int ev)
+{
+    if ( (DWORD)ev == CTRL_BREAK_EVENT ||
+         (DWORD)ev == CTRL_C_EVENT ) {
+        /* only these two cause stdin system calls to abort.. */
+        /* Release worker threads parked in rts_waitConsoleHandlerCompletion() */
+        SetEvent(hConsoleEvent);   /* event is manual-reset */
+        Sleep(0);                  /* yield so waiters get to run */
+        ResetEvent(hConsoleEvent); /* turn it back off again */
+    }
+}
+
+/*
+ * Function: rts_waitConsoleHandlerCompletion()
+ *
+ * Esoteric entry point used by worker thread that got woken
+ * up as part Ctrl-C delivery.
+ */
+/* Block the calling worker thread until rts_ConsoleHandlerDone() pulses
+ * hConsoleEvent; returns non-zero if the wait completed normally. */
+int
+rts_waitConsoleHandlerCompletion()
+{
+    /* As long as the worker doesn't need to do a multiple wait,
+     * let's keep this HANDLE private to this 'module'.
+     */
+    return (WaitForSingleObject(hConsoleEvent, INFINITE) == WAIT_OBJECT_0);
+}
diff --git a/rts/win32/ConsoleHandler.h b/rts/win32/ConsoleHandler.h
new file mode 100644
index 0000000000..b09adf71cb
--- /dev/null
+++ b/rts/win32/ConsoleHandler.h
@@ -0,0 +1,63 @@
+/*
+ * Console control handler support.
+ *
+ */
+#ifndef __CONSOLEHANDLER_H__
+#define __CONSOLEHANDLER_H__
+
+/*
+ * Console control handlers lets an application handle Ctrl+C, Ctrl+Break etc.
+ * in Haskell under Win32. Akin to the Unix signal SIGINT.
+ *
+ * The API offered by ConsoleHandler.h is identical to that of the signal handling
+ * code (which isn't supported under win32.) Unsurprisingly, the underlying impl
+ * is derived from the signal handling code also.
+ */
+
+/*
+ * Function: signals_pending()
+ *
+ * Used by the RTS to check whether new signals have been 'recently' reported.
+ * If so, the RTS arranges for the delivered signals to be handled by
+ * de-queueing them from their table, running the associated Haskell
+ * signal handler.
+ */
+extern StgInt stg_pending_events;
+
+#define signals_pending() ( stg_pending_events > 0)
+
+/*
+ * Function: anyUserHandlers()
+ *
+ * Used by the Scheduler to decide whether its worth its while to stick
+ * around waiting for an external signal when there are no threads
+ * runnable. A console handler is used to handle termination events (Ctrl+C)
+ * and isn't considered a 'user handler'.
+ */
+#define anyUserHandlers() (rtsFalse)
+
+/*
+ * Function: startSignalHandlers()
+ *
+ * Run the handlers associated with the queued up console events. Console
+ * event delivery is blocked for the duration of this call.
+ */
+extern void startSignalHandlers(Capability *cap);
+
+/*
+ * Function: handleSignalsInThisThread()
+ *
+ * Have current (OS) thread assume responsibility of handling console events/signals.
+ * Currently not used (by the console event handling code.)
+ */
+extern void handleSignalsInThisThread(void);
+
+/*
+ * Function: rts_waitConsoleHandlerCompletion()
+ *
+ * Esoteric entry point used by worker thread that got woken
+ * up as part Ctrl-C delivery.
+ */
+extern int rts_waitConsoleHandlerCompletion(void);
+
+#endif /* __CONSOLEHANDLER_H__ */
diff --git a/rts/win32/GetTime.c b/rts/win32/GetTime.c
new file mode 100644
index 0000000000..584b994d53
--- /dev/null
+++ b/rts/win32/GetTime.c
@@ -0,0 +1,101 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2005
+ *
+ * Machine-dependent time measurement functions
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "GetTime.h"
+
+#include <windows.h>
+
+#ifdef HAVE_TIME_H
+# include <time.h>
+#endif
+
+#define HNS_PER_SEC 10000000LL /* FILETIMES are in units of 100ns */
+/* Convert FILETIMEs into secs */
+
+static INLINE_ME Ticks
+fileTimeToTicks(FILETIME ft)
+{
+    Ticks t;
+    /* A FILETIME is a 64-bit count of 100ns units, split across two DWORDs. */
+    t = ((Ticks)ft.dwHighDateTime << 32) | ft.dwLowDateTime;
+    /* Rescale 100ns units to Ticks.  NOTE(review): the intermediate product
+     * t * TICKS_PER_SECOND can overflow for large absolute FILETIMEs,
+     * depending on the width of Ticks -- confirm acceptable for the
+     * deltas this is used with. */
+    t = (t * TICKS_PER_SECOND) / HNS_PER_SEC;
+    return t;
+}
+
+static int is_win9x = -1;  /* cached answer: -1 = not yet probed */
+
+/* Detect whether we're running on a Win9x-family OS (where
+ * GetProcessTimes()/GetThreadTimes() are unavailable). */
+static INLINE_ME rtsBool
+isWin9x(void)
+{
+    if (is_win9x < 0) {
+        /* figure out whether we're on a Win9x box or not. */
+        OSVERSIONINFO oi;
+        BOOL b;
+
+        /* Need to init the size field first.*/
+        oi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+        b = GetVersionEx(&oi);
+
+        /* Compare the platform ID with '==', not '&': the old bitwise test
+         * (dwPlatformId & VER_PLATFORM_WIN32_WINDOWS, i.e. & 1) also
+         * matched VER_PLATFORM_WIN32_CE (3). */
+        is_win9x = ( (b && (oi.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS)) ? 1 : 0);
+    }
+    return is_win9x;
+}
+
+
+/* Fill in CPU time and elapsed time in one call.  Note that *elapsed is
+ * absolute system time (see getProcessElapsedTime()); callers presumably
+ * take deltas -- confirm at the call sites. */
+void
+getProcessTimes(Ticks *user, Ticks *elapsed)
+{
+    *user    = getProcessCPUTime();
+    *elapsed = getProcessElapsedTime();
+}
+
+/* User-mode CPU time consumed by this process, in Ticks.
+ * Falls back to elapsed time on Win9x, where GetProcessTimes() is
+ * unavailable; returns 0 if the query fails. */
+Ticks
+getProcessCPUTime(void)
+{
+    FILETIME creationTime, exitTime, userTime, kernelTime = {0,0};
+
+    if (isWin9x()) return getProcessElapsedTime();
+
+    if (!GetProcessTimes(GetCurrentProcess(), &creationTime,
+                         &exitTime, &kernelTime, &userTime)) {
+        return 0;
+    }
+
+    return fileTimeToTicks(userTime);
+}
+
+/* Wall-clock time as Ticks.  This is absolute system time (FILETIME is
+ * measured from the Windows epoch, 1601-01-01), not time since process
+ * start -- only meaningful for computing deltas. */
+Ticks
+getProcessElapsedTime(void)
+{
+    FILETIME system_time;
+    GetSystemTimeAsFileTime(&system_time);
+    return fileTimeToTicks(system_time);
+}
+
+/* User-mode CPU time consumed by the calling OS thread, in Ticks.
+ * Falls back to process CPU time on Win9x (no GetThreadTimes());
+ * returns 0 if the query fails. */
+Ticks
+getThreadCPUTime(void)
+{
+    FILETIME creationTime, exitTime, userTime, kernelTime = {0,0};
+
+    if (isWin9x()) return getProcessCPUTime();
+
+    if (!GetThreadTimes(GetCurrentThread(), &creationTime,
+                        &exitTime, &kernelTime, &userTime)) {
+        return 0;
+    }
+
+    return fileTimeToTicks(userTime);
+}
+
+/* Stub: page-fault statistics are not implemented on Win32; always 0. */
+nat
+getPageFaults(void)
+{
+  /* ToDo (on NT): better, get this via the performance data
+     that's stored in the registry. */
+    return 0;
+}
diff --git a/rts/win32/IOManager.c b/rts/win32/IOManager.c
new file mode 100644
index 0000000000..a67c3504c1
--- /dev/null
+++ b/rts/win32/IOManager.c
@@ -0,0 +1,510 @@
+/* IOManager.c
+ *
+ * Non-blocking / asynchronous I/O for Win32.
+ *
+ * (c) sof, 2002-2003.
+ */
+#include "Rts.h"
+#include "IOManager.h"
+#include "WorkQueue.h"
+#include "ConsoleHandler.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <io.h>
+#include <winsock.h>
+#include <process.h>
+
+/*
+ * Internal state maintained by the IO manager.
+ */
+typedef struct IOManagerState {
+    CritSection   manLock;        /* guards queueSize/numWorkers/workersIdle */
+    WorkQueue*    workQueue;      /* requests waiting for a worker */
+    int           queueSize;      /* # queued, not-yet-consumed requests */
+    int           numWorkers;     /* size of the worker thread pool */
+    int           workersIdle;    /* # workers blocked waiting for work */
+    HANDLE        hExitEvent;     /* manual-reset: tells workers to exit */
+    unsigned int  requestID;      /* next request ID to hand out */
+    /* fields for keeping track of active WorkItems */
+    CritSection   active_work_lock;
+    WorkItem*     active_work_items;
+} IOManagerState;
+
+/* ToDo: wrap up this state via a IOManager handle instead? */
+static IOManagerState* ioMan;
+
+static void RegisterWorkItem ( IOManagerState* iom, WorkItem* wi);
+static void DeregisterWorkItem( IOManagerState* iom, WorkItem* wi);
+
+/*
+ * The routine executed by each worker thread.
+ */
+static
+unsigned
+WINAPI
+IOWorkerProc(PVOID param)
+{
+    HANDLE hWaits[2];
+    DWORD rc;
+    IOManagerState* iom = (IOManagerState*)param;
+    WorkQueue* pq = iom->workQueue;
+    WorkItem* work;
+    int len = 0, fd = 0;
+    DWORD errCode = 0;
+    void* complData;
+
+    /* Each iteration: advertise ourselves as idle, wait for either the
+     * exit event (index 0) or a queued work item (index 1), perform the
+     * request and report its result via work->onCompletion() unless the
+     * scheduler has flagged the request as abandoned. */
+    hWaits[0] = (HANDLE)iom->hExitEvent;
+    hWaits[1] = GetWorkQueueHandle(pq);
+
+    while (1) {
+        /* The error code is communicated back on completion of request; reset. */
+        errCode = 0;
+
+        EnterCriticalSection(&iom->manLock);
+        /* Signal that the worker is idle.
+         *
+         * 'workersIdle' is used when determining whether or not to
+         * increase the worker thread pool when adding a new request.
+         * (see addIORequest().)
+         */
+        iom->workersIdle++;
+        LeaveCriticalSection(&iom->manLock);
+
+        /*
+         * A possible future refinement is to make long-term idle threads
+         * wake up and decide to shut down should the number of idle threads
+         * be above some threshold.
+         *
+         */
+        rc = WaitForMultipleObjects( 2, hWaits, FALSE, INFINITE );
+
+        if (rc == WAIT_OBJECT_0) {
+            // we received the exit event
+            return 0;
+        }
+
+        EnterCriticalSection(&iom->manLock);
+        /* Signal that the thread is 'non-idle' and about to consume
+         * a work item.
+         */
+        iom->workersIdle--;
+        iom->queueSize--;
+        LeaveCriticalSection(&iom->manLock);
+
+        if ( rc == (WAIT_OBJECT_0 + 1) ) {
+            /* work item available, fetch it. */
+            if (FetchWork(pq,(void**)&work)) {
+                work->abandonOp = 0;
+                RegisterWorkItem(iom,work);
+                if ( work->workKind & WORKER_READ ) {
+                    if ( work->workKind & WORKER_FOR_SOCKET ) {
+                        len = recv(work->workData.ioData.fd,
+                                   work->workData.ioData.buf,
+                                   work->workData.ioData.len,
+                                   0);
+                        if (len == SOCKET_ERROR) {
+                            errCode = WSAGetLastError();
+                        }
+                    } else {
+                        while (1) {
+                            /* Do the read(), with extra-special handling for Ctrl+C */
+                            len = read(work->workData.ioData.fd,
+                                       work->workData.ioData.buf,
+                                       work->workData.ioData.len);
+                            if ( len == 0 && work->workData.ioData.len != 0 ) {
+                                /* Given the following scenario:
+                                 *     - a console handler has been registered that handles Ctrl+C
+                                 *       events.
+                                 *     - we've not tweaked the 'console mode' settings to turn on
+                                 *       ENABLE_PROCESSED_INPUT.
+                                 *     - we're blocked waiting on input from standard input.
+                                 *     - the user hits Ctrl+C.
+                                 *
+                                 * The OS will invoke the console handler (in a separate OS thread),
+                                 * and the above read() (i.e., under the hood, a ReadFile() op) returns
+                                 * 0, with the error set to ERROR_OPERATION_ABORTED. We don't
+                                 * want to percolate this error condition back to the Haskell user.
+                                 * Do this by waiting for the completion of the Haskell console handler.
+                                 * If upon completion of the console handler routine, the Haskell thread
+                                 * that issued the request is found to have been thrown an exception,
+                                 * the worker abandons the request (since that's what the Haskell thread
+                                 * has done.) If the Haskell thread hasn't been interrupted, the worker
+                                 * retries the read request as if nothing happened.
+                                 */
+                                if ( (GetLastError()) == ERROR_OPERATION_ABORTED ) {
+                                    /* For now, only abort when dealing with the standard input handle.
+                                     * i.e., for all others, an error is raised.
+                                     */
+                                    HANDLE h = (HANDLE)GetStdHandle(STD_INPUT_HANDLE);
+                                    if ( _get_osfhandle(work->workData.ioData.fd) == (long)h ) {
+                                        if (rts_waitConsoleHandlerCompletion()) {
+                                            /* If the Scheduler has set work->abandonOp, the Haskell thread has
+                                             * been thrown an exception (=> the worker must abandon this request.)
+                                             * We test for this below before invoking the on-completion routine.
+                                             */
+                                            if (work->abandonOp) {
+                                                break;
+                                            } else {
+                                                continue;
+                                            }
+                                        }
+                                    } else {
+                                        break; /* Treat it like an error */
+                                    }
+                                } else {
+                                    break;
+                                }
+                            } else {
+                                break;
+                            }
+                        }
+                        if (len == -1) { errCode = errno; }
+                    }
+                    complData = work->workData.ioData.buf;
+                    fd = work->workData.ioData.fd;
+                } else if ( work->workKind & WORKER_WRITE ) {
+                    if ( work->workKind & WORKER_FOR_SOCKET ) {
+                        len = send(work->workData.ioData.fd,
+                                   work->workData.ioData.buf,
+                                   work->workData.ioData.len,
+                                   0);
+                        if (len == SOCKET_ERROR) {
+                            errCode = WSAGetLastError();
+                        }
+                    } else {
+                        len = write(work->workData.ioData.fd,
+                                    work->workData.ioData.buf,
+                                    work->workData.ioData.len);
+                        if (len == -1) { errCode = errno; }
+                    }
+                    complData = work->workData.ioData.buf;
+                    fd = work->workData.ioData.fd;
+                } else if ( work->workKind & WORKER_DELAY ) {
+                    /* Approximate implementation of threadDelay;
+                     *
+                     * Note: Sleep() is in milliseconds, not micros.
+                     */
+                    Sleep(work->workData.delayData.msecs / 1000);
+                    len = work->workData.delayData.msecs;
+                    complData = NULL;
+                    fd = 0;
+                    errCode = 0;
+                } else if ( work->workKind & WORKER_DO_PROC ) {
+                    /* perform operation/proc on behalf of Haskell thread. */
+                    if (work->workData.procData.proc) {
+                        /* The procedure is assumed to encode result + success/failure
+                         * via its param.
+                         */
+                        errCode=work->workData.procData.proc(work->workData.procData.param);
+                    } else {
+                        errCode=1;
+                    }
+                    complData = work->workData.procData.param;
+                    /* Previously fd/len were left holding stale values from a
+                     * prior iteration and passed to onCompletion below; zero
+                     * them so the completion record is deterministic. */
+                    fd = 0;
+                    len = 0;
+                } else {
+                    fprintf(stderr, "unknown work request type (%d) , ignoring.\n", work->workKind);
+                    fflush(stderr);
+                    continue;
+                }
+                if (!work->abandonOp) {
+                    work->onCompletion(work->requestID,
+                                       fd,
+                                       len,
+                                       complData,
+                                       errCode);
+                }
+                /* Free the WorkItem */
+                DeregisterWorkItem(iom,work);
+                free(work);
+            } else {
+                fprintf(stderr, "unable to fetch work; fatal.\n"); fflush(stderr);
+                return 1;
+            }
+        } else {
+            fprintf(stderr, "waiting failed (%lu); fatal.\n", rc); fflush(stderr);
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/* Spin up one more worker thread running IOWorkerProc over 'iom'.
+ * Returns TRUE iff thread creation succeeded. */
+static
+BOOL
+NewIOWorkerThread(IOManagerState* iom)
+{
+    unsigned threadId;
+    uintptr_t hThread = _beginthreadex(NULL,
+                                       0,
+                                       IOWorkerProc,
+                                       (LPVOID)iom,
+                                       0,
+                                       &threadId);
+    if (hThread != 0) {
+        /* The worker runs until it sees hExitEvent; we never join it, so
+         * close our copy of the handle immediately -- previously it was
+         * discarded and leaked one kernel handle per worker.  (Closing the
+         * handle does not affect the running thread.) */
+        CloseHandle((HANDLE)hThread);
+        return TRUE;
+    }
+    return FALSE;
+}
+
+/* Allocate and initialise the global IOManager state (ioMan).
+ * Returns TRUE on success; on failure all partially-created resources
+ * are released and ioMan is left NULL. */
+BOOL
+StartIOManager(void)
+{
+    HANDLE hExit;
+    WorkQueue* wq;
+
+    wq = NewWorkQueue();
+    if ( !wq ) return FALSE;
+
+    ioMan = (IOManagerState*)malloc(sizeof(IOManagerState));
+
+    if (!ioMan) {
+        FreeWorkQueue(wq);
+        return FALSE;
+    }
+
+    /* A manual-reset event */
+    hExit = CreateEvent ( NULL, TRUE, FALSE, NULL );
+    if ( !hExit ) {
+        FreeWorkQueue(wq);
+        free(ioMan);
+        /* don't leave the global dangling at freed memory */
+        ioMan = NULL;
+        return FALSE;
+    }
+
+    ioMan->hExitEvent = hExit;
+    InitializeCriticalSection(&ioMan->manLock);
+    ioMan->workQueue = wq;
+    ioMan->numWorkers = 0;
+    ioMan->workersIdle = 0;
+    ioMan->queueSize = 0;
+    ioMan->requestID = 1;
+    InitializeCriticalSection(&ioMan->active_work_lock);
+    ioMan->active_work_items = NULL;
+
+    return TRUE;
+}
+
+/*
+ * Function: depositWorkItem()
+ *
+ * Local function which deposits a WorkItem onto a work queue,
+ * deciding in the process whether or not the thread pool needs
+ * to be augmented with another thread to handle the new request.
+ *
+ */
+static
+int
+depositWorkItem( unsigned int reqID,
+ WorkItem* wItem )
+{
+ EnterCriticalSection(&ioMan->manLock);
+
+#if 0
+ fprintf(stderr, "depositWorkItem: %d/%d\n", ioMan->workersIdle, ioMan->numWorkers);
+ fflush(stderr);
+#endif
+ /* A new worker thread is created when there are fewer idle threads
+ * than non-consumed queue requests. This ensures that requests will
+ * be dealt with in a timely manner.
+ *
+ * [Long explanation of why the previous thread pool policy lead to
+ * trouble]
+ *
+ * Previously, the thread pool was augmented iff no idle worker threads
+ * were available. That strategy runs the risk of repeatedly adding to
+ * the request queue without expanding the thread pool to handle this
+ * sudden spike in queued requests.
+ * [How? Assume workersIdle is 1, and addIORequest() is called. No new
+ * thread is created and the request is simply queued. If addIORequest()
+ * is called again _before the OS schedules a worker thread to pull the
+ * request off the queue_, workersIdle is still 1 and another request is
+ * simply added to the queue. Once the worker thread is run, only one
+ * request is de-queued, leaving the 2nd request in the queue]
+ *
+ * Assuming none of the queued requests take an inordinate amount of to
+ * complete, the request queue would eventually be drained. But if that's
+ * not the case, the later requests will end up languishing in the queue
+ * indefinitely. The non-timely handling of requests may cause CH applications
+ * to misbehave / hang; bad.
+ *
+ */
+ ioMan->queueSize++;
+ if ( (ioMan->workersIdle < ioMan->queueSize) ) {
+ /* see if giving up our quantum ferrets out some idle threads.
+ */
+ LeaveCriticalSection(&ioMan->manLock);
+ Sleep(0);
+ EnterCriticalSection(&ioMan->manLock);
+ if ( (ioMan->workersIdle < ioMan->queueSize) ) {
+ /* No, go ahead and create another. */
+ ioMan->numWorkers++;
+ LeaveCriticalSection(&ioMan->manLock);
+ NewIOWorkerThread(ioMan);
+ } else {
+ LeaveCriticalSection(&ioMan->manLock);
+ }
+ } else {
+ LeaveCriticalSection(&ioMan->manLock);
+ }
+
+ if (SubmitWork(ioMan->workQueue,wItem)) {
+ /* Note: the work item has potentially been consumed by a worker thread
+ * (and freed) at this point, so we cannot use wItem's requestID.
+ */
+ return reqID;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * Function: AddIORequest()
+ *
+ * Conduit to underlying WorkQueue's SubmitWork(); adds IO
+ * request to work queue, deciding whether or not to augment
+ * the thread pool in the process.
+ */
+int
+AddIORequest ( int fd,
+ BOOL forWriting,
+ BOOL isSocket,
+ int len,
+ char* buffer,
+ CompletionProc onCompletion)
+{
+ WorkItem* wItem = (WorkItem*)malloc(sizeof(WorkItem));
+ unsigned int reqID = ioMan->requestID++;
+ if (!ioMan || !wItem) return 0;
+
+ /* Fill in the blanks */
+ wItem->workKind = ( isSocket ? WORKER_FOR_SOCKET : 0 ) |
+ ( forWriting ? WORKER_WRITE : WORKER_READ );
+ wItem->workData.ioData.fd = fd;
+ wItem->workData.ioData.len = len;
+ wItem->workData.ioData.buf = buffer;
+ wItem->link = NULL;
+
+ wItem->onCompletion = onCompletion;
+ wItem->requestID = reqID;
+
+ return depositWorkItem(reqID, wItem);
+}
+
+/*
+ * Function: AddDelayRequest()
+ *
+ * Like AddIORequest(), but this time adding a delay request to
+ * the request queue.
+ */
+BOOL
+AddDelayRequest ( unsigned int msecs,
+ CompletionProc onCompletion)
+{
+ WorkItem* wItem = (WorkItem*)malloc(sizeof(WorkItem));
+ unsigned int reqID = ioMan->requestID++;
+ if (!ioMan || !wItem) return FALSE;
+
+ /* Fill in the blanks */
+ wItem->workKind = WORKER_DELAY;
+ wItem->workData.delayData.msecs = msecs;
+ wItem->onCompletion = onCompletion;
+ wItem->requestID = reqID;
+ wItem->link = NULL;
+
+ return depositWorkItem(reqID, wItem);
+}
+
+/*
+ * Function: AddProcRequest()
+ *
+ * Add an asynchronous procedure request.
+ */
+BOOL
+AddProcRequest ( void* proc,
+ void* param,
+ CompletionProc onCompletion)
+{
+ WorkItem* wItem = (WorkItem*)malloc(sizeof(WorkItem));
+ unsigned int reqID = ioMan->requestID++;
+ if (!ioMan || !wItem) return FALSE;
+
+ /* Fill in the blanks */
+ wItem->workKind = WORKER_DO_PROC;
+ wItem->workData.procData.proc = proc;
+ wItem->workData.procData.param = param;
+ wItem->onCompletion = onCompletion;
+ wItem->requestID = reqID;
+ wItem->abandonOp = 0;
+ wItem->link = NULL;
+
+ return depositWorkItem(reqID, wItem);
+}
+
+/* Signal the exit event so that idle worker threads wind down.
+ * The IOManagerState itself is deliberately not freed -- see below. */
+void ShutdownIOManager ( void )
+{
+  SetEvent(ioMan->hExitEvent);
+  // ToDo: we can't free this now, because the worker thread(s)
+  // haven't necessarily finished with it yet.  Perhaps it should
+  // have a reference count or something.
+  // free(ioMan);
+  // ioMan = NULL;
+}
+
+/* Keep track of WorkItems currently being serviced. */
+/* Push 'wi' onto the head of the active-work list (LIFO), under
+ * active_work_lock.  Note: the 'ioMan' parameter shadows the global
+ * of the same name; both refer to the same state in practice. */
+static
+void
+RegisterWorkItem(IOManagerState* ioMan,
+		 WorkItem* wi)
+{
+  EnterCriticalSection(&ioMan->active_work_lock);
+  wi->link = ioMan->active_work_items;
+  ioMan->active_work_items = wi;
+  LeaveCriticalSection(&ioMan->active_work_lock);
+}
+
+/* Unlink the entry whose requestID matches 'wi' from the active-work
+ * list.  Linear scan under active_work_lock; a missing entry is
+ * reported on stderr but is otherwise not treated as fatal. */
+static
+void
+DeregisterWorkItem(IOManagerState* ioMan,
+		   WorkItem* wi)
+{
+  WorkItem *ptr, *prev;
+
+  EnterCriticalSection(&ioMan->active_work_lock);
+  for(prev=NULL,ptr=ioMan->active_work_items;ptr;prev=ptr,ptr=ptr->link) {
+    if (wi->requestID == ptr->requestID) {
+      if (prev==NULL) {
+	/* head of the list */
+	ioMan->active_work_items = ptr->link;
+      } else {
+	prev->link = ptr->link;
+      }
+      LeaveCriticalSection(&ioMan->active_work_lock);
+      return;
+    }
+  }
+  fprintf(stderr, "DeregisterWorkItem: unable to locate work item %d\n", wi->requestID);
+  LeaveCriticalSection(&ioMan->active_work_lock);
+}
+
+
+/*
+ * Function: abandonWorkRequest()
+ *
+ * Signal that a work request isn't of interest. Called by the Scheduler
+ * if a blocked Haskell thread has an exception thrown to it.
+ *
+ * Note: we're not aborting the system call that a worker might be blocked on
+ * here, just disabling the propagation of its result once its finished. We
+ * may have to go the whole hog here and switch to overlapped I/O so that we
+ * can abort blocked system calls.
+ */
+void
+abandonWorkRequest ( int reqID )
+{
+ WorkItem *ptr;
+ EnterCriticalSection(&ioMan->active_work_lock);
+ for(ptr=ioMan->active_work_items;ptr;ptr=ptr->link) {
+ if (ptr->requestID == (unsigned int)reqID ) {
+ ptr->abandonOp = 1;
+ LeaveCriticalSection(&ioMan->active_work_lock);
+ return;
+ }
+ }
+ /* Note: if the request ID isn't present, the worker will have
+ * finished sometime since awaitRequests() last drained the completed
+ * request table; i.e., not an error.
+ */
+ LeaveCriticalSection(&ioMan->active_work_lock);
+}
diff --git a/rts/win32/IOManager.h b/rts/win32/IOManager.h
new file mode 100644
index 0000000000..4893e2387c
--- /dev/null
+++ b/rts/win32/IOManager.h
@@ -0,0 +1,110 @@
+/* IOManager.h
+ *
+ * Non-blocking / asynchronous I/O for Win32.
+ *
+ * (c) sof, 2002-2003
+ */
+#ifndef __IOMANAGER_H__
+#define __IOMANAGER_H__
+/* On the yucky side..suppress -Wmissing-declarations warnings when
+ * including <windows.h>
+ */
+extern void* GetCurrentFiber ( void );
+extern void* GetFiberData ( void );
+#include <windows.h>
+
+/*
+ The IOManager subsystem provides a non-blocking view
+ of I/O operations. It lets one (or more) OS thread(s)
+ issue multiple I/O requests, which the IOManager then
+ handles independently of/concurrent to the thread(s)
+ that issued the request. Upon completion, the issuing
+ thread can inspect the result of the I/O operation &
+ take appropriate action.
+
+ The IOManager is intended used with the GHC RTS to
+ implement non-blocking I/O in Concurrent Haskell.
+ */
+
+/*
+ * Our WorkQueue holds WorkItems, encoding IO and
+ * delay requests.
+ *
+ */
+/* Callback invoked by a worker thread when a request completes;
+ * receives the original request ID, the fd operated on, the byte
+ * count / result length, a result buffer, and an error code. */
+typedef void (*CompletionProc)(unsigned int requestID,
+			       int   fd,
+			       int   len,
+			       void* buf,
+			       int   errCode);
+
+/*
+ * Asynchronous procedure calls executed by a worker thread
+ * take a generic state argument pointer and return an int by
+ * default.
+ */
+typedef int (*DoProcProc)(void *param);
+
+typedef union workData {
+    struct {
+	int   fd;
+	int   len;
+	char *buf;
+    } ioData;
+    struct {
+	/* NOTE(review): the worker thread Sleep()s for msecs/1000 and
+	 * its comment says "Sleep() is in milliseconds, not micros",
+	 * which suggests this field actually carries microseconds
+	 * despite its name -- confirm against callers. */
+	int msecs;
+    } delayData;
+    struct {
+	DoProcProc proc;
+	void* param;
+    } procData;
+} WorkData;
+
+typedef struct WorkItem {
+  unsigned int     workKind;      /* bitmask of WORKER_* flags below */
+  WorkData         workData;
+  unsigned int     requestID;
+  CompletionProc   onCompletion;
+  /* set non-zero via abandonWorkRequest() to suppress the completion
+   * callback; must be initialised by whoever creates the item. */
+  unsigned int     abandonOp;
+  struct WorkItem  *link;         /* next item in the active-work list */
+} WorkItem;
+
+/* NOTE(review): no definition of 'onComplete' is visible in this
+ * subsystem -- possibly vestigial; confirm before relying on it. */
+extern CompletionProc onComplete;
+
+/* the kind of operations supported; you could easily imagine
+ * that instead of passing a tag describing the work to be performed,
+ * a function pointer is passed instead. Maybe later.
+ */
+#define WORKER_READ       1
+#define WORKER_WRITE      2
+#define WORKER_DELAY      4
+#define WORKER_FOR_SOCKET 8
+#define WORKER_DO_PROC   16
+
+/*
+ * Starting up and shutting down.
+ */
+extern BOOL StartIOManager     ( void );
+extern void ShutdownIOManager  ( void );
+
+/*
+ * Adding I/O and delay requests. With each request a
+ * completion routine is supplied, which the worker thread
+ * will invoke upon completion.
+ */
+extern int AddDelayRequest ( unsigned int   msecs,
+			     CompletionProc onCompletion);
+
+extern int AddIORequest ( int            fd,
+			  BOOL           forWriting,
+			  BOOL           isSocket,
+			  int            len,
+			  char*          buffer,
+			  CompletionProc onCompletion);
+
+extern int AddProcRequest ( void*          proc,
+			    void*          data,
+			    CompletionProc onCompletion);
+
+extern void abandonWorkRequest ( int reqID );
+
+#endif /* __IOMANAGER_H__ */
diff --git a/rts/win32/OSThreads.c b/rts/win32/OSThreads.c
new file mode 100644
index 0000000000..c772be38f4
--- /dev/null
+++ b/rts/win32/OSThreads.c
@@ -0,0 +1,199 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001-2005
+ *
+ * Accessing OS threads functionality in a (mostly) OS-independent
+ * manner.
+ *
+ * --------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#if defined(THREADED_RTS)
+#include "OSThreads.h"
+#include "RtsUtils.h"
+
+/* For reasons not yet clear, the entire contents of process.h is protected
+ * by __STRICT_ANSI__ not being defined.
+ */
+#undef __STRICT_ANSI__
+#include <process.h>
+
+/* Win32 threads and synchronisation objects */
+
+/* A Condition is represented by a Win32 Event object;
+ * a Mutex by a Mutex kernel object.
+ *
+ * ToDo: go through the defn and usage of these to
+ * make sure the semantics match up with that of
+ * the (assumed) pthreads behaviour. This is really
+ * just a first pass at getting something compilable.
+ */
+
+/* Create a Condition backed by an auto-reset Win32 Event. */
+void
+initCondition( Condition* pCond )
+{
+  HANDLE h = CreateEvent(NULL,
+			 FALSE,  /* auto reset */
+			 FALSE,  /* initially not signalled */
+			 NULL);  /* unnamed => process-local. */
+
+  if ( h == NULL ) {
+      errorBelch("initCondition: unable to create");
+  }
+  *pCond = h;
+  return;
+}
+
+/* Release the event handle backing the condition. */
+void
+closeCondition( Condition* pCond )
+{
+  if ( CloseHandle(*pCond) == 0 ) {
+      errorBelch("closeCondition: failed to close");
+  }
+  return;
+}
+
+/* Wake waiters via PulseEvent.
+ * NOTE(review): PulseEvent only releases threads that are waiting at
+ * the instant of the call and is documented by Microsoft as unreliable
+ * for condition-variable use (wakeups can be missed) -- see the ToDo
+ * above about matching pthreads semantics. */
+rtsBool
+broadcastCondition ( Condition* pCond )
+{
+  PulseEvent(*pCond);
+  return rtsTrue;
+}
+
+/* Signal one waiter by setting the (auto-reset) event. */
+rtsBool
+signalCondition ( Condition* pCond )
+{
+    if (SetEvent(*pCond) == 0) {
+	barf("SetEvent: %d", GetLastError());
+    }
+    return rtsTrue;
+}
+
+/* Wait for the condition, releasing the mutex while blocked.
+ * NOTE(review): the unlock and the wait are two separate calls, not an
+ * atomic release-and-wait as pthread_cond_wait provides; a signal
+ * delivered in the gap can be consumed before we start waiting. */
+rtsBool
+waitCondition ( Condition* pCond, Mutex* pMut )
+{
+  RELEASE_LOCK(pMut);
+  WaitForSingleObject(*pCond, INFINITE);
+  /* Hmm..use WaitForMultipleObjects() ? */
+  ACQUIRE_LOCK(pMut);
+  return rtsTrue;
+}
+
+/* Yield the remainder of this thread's quantum. */
+void
+yieldThread()
+{
+  Sleep(0);
+  return;
+}
+
+/* Terminate the calling thread (CRT-aware exit). */
+void
+shutdownThread()
+{
+  _endthreadex(0);
+}
+
+/* Create an OS thread running startProc(param), storing its id in *pId.
+ * Returns 0 on success and non-zero on failure (_beginthreadex returns
+ * 0 when it fails) -- note the inverted sense.  The thread handle is
+ * discarded. */
+int
+createOSThread (OSThreadId* pId, OSThreadProc *startProc, void *param)
+{
+
+  return (_beginthreadex ( NULL,  /* default security attributes */
+			   0,
+			   (unsigned (__stdcall *)(void *)) startProc,
+			   param,
+			   0,
+			   (unsigned*)pId) == 0);
+}
+
+/* The calling thread's OS-level id. */
+OSThreadId
+osThreadId()
+{
+  return GetCurrentThreadId();
+}
+
+#ifdef USE_CRITICAL_SECTIONS
+/* Mutex as a critical section with a spin count: cheaper for the
+ * short, mostly-uncontended critical regions in the RTS. */
+void
+initMutex (Mutex* pMut)
+{
+  InitializeCriticalSectionAndSpinCount(pMut,4000);
+}
+#else
+/* Mutex as a Win32 kernel Mutex object (heavier-weight alternative).
+ * No error check on CreateMutex failure -- *pMut may be NULL. */
+void
+initMutex (Mutex* pMut)
+{
+  HANDLE h = CreateMutex ( NULL,  /* default sec. attributes */
+			   FALSE, /* not owned => initially signalled */
+			   NULL
+			   );
+  *pMut = h;
+  return;
+}
+#endif
+
+/* Allocate a TLS index; aborts the RTS if the OS has none left. */
+void
+newThreadLocalKey (ThreadLocalKey *key)
+{
+    DWORD r;
+    r = TlsAlloc();
+    if (r == TLS_OUT_OF_INDEXES) {
+	barf("newThreadLocalKey: out of keys");
+    }
+    *key = r;
+}
+
+/* Read this thread's value for the given TLS key.  May legitimately
+ * return NULL; in DEBUG builds a genuine lookup error is distinguished
+ * from a stored NULL via GetLastError(). */
+void *
+getThreadLocalVar (ThreadLocalKey *key)
+{
+    void *r;
+    r = TlsGetValue(*key);
+#ifdef DEBUG
+    // r is allowed to be NULL - it can mean that either there was an
+    // error or the stored value is in fact NULL.
+    if (GetLastError() != NO_ERROR) {
+	barf("getThreadLocalVar: key not found");
+    }
+#endif
+    return r;
+}
+
+/* Store a value under the given TLS key for this thread; aborts on
+ * failure. */
+void
+setThreadLocalVar (ThreadLocalKey *key, void *value)
+{
+    BOOL b;
+    b = TlsSetValue(*key, value);
+    if (!b) {
+	barf("setThreadLocalVar: %d", GetLastError());
+    }
+}
+
+
+/* Thread entry point for forkOS: acquire an RTS capability, run the
+ * Haskell IO action identified by the stable pointer, then release
+ * the capability. */
+static unsigned __stdcall
+forkOS_createThreadWrapper ( void * entry )
+{
+    Capability *cap;
+    cap = rts_lock();
+    cap = rts_evalStableIO(cap, (HsStablePtr) entry, NULL);
+    rts_unlock(cap);
+    return 0;
+}
+
+/* Spawn a bound OS thread for forkOS.  Returns 0 on success,
+ * non-zero on failure (same inverted _beginthreadex idiom as
+ * createOSThread); the thread handle is discarded. */
+int
+forkOS_createThread ( HsStablePtr entry )
+{
+    unsigned long pId;
+    return (_beginthreadex ( NULL,  /* default security attributes */
+			   0,
+			   forkOS_createThreadWrapper,
+			   (void*)entry,
+			   0,
+			   (unsigned*)&pId) == 0);
+}
+
+#else /* !defined(THREADED_RTS) */
+
+/* Non-threaded RTS: bound threads are unsupported; always fail. */
+int
+forkOS_createThread ( HsStablePtr entry STG_UNUSED )
+{
+    return -1;
+}
+
+#endif /* !defined(THREADED_RTS) */
diff --git a/rts/win32/Ticker.c b/rts/win32/Ticker.c
new file mode 100644
index 0000000000..ab791d8dc7
--- /dev/null
+++ b/rts/win32/Ticker.c
@@ -0,0 +1,124 @@
+/*
+ * RTS periodic timers.
+ *
+ */
+#include "Rts.h"
+#include "Timer.h"
+#include "Ticker.h"
+#include <windows.h>
+#include <stdio.h>
+#include <process.h>
+#include "OSThreads.h"
+
+/*
+ * Provide a timer service for the RTS, periodically
+ * notifying it that a number of 'ticks' has passed.
+ *
+ */
+
+/* To signal shutdown of the timer service, we use a local
+ * event which the timer thread listens to (and stopVirtTimer()
+ * signals.)
+ */
+static HANDLE hStopEvent = INVALID_HANDLE_VALUE;
+static HANDLE tickThread = INVALID_HANDLE_VALUE;
+
+static TickProc tickProc = NULL;
+
+/*
+ * Ticking is done by a separate thread which periodically
+ * wakes up to handle a tick.
+ *
+ * This is the portable way of providing a timer service under
+ * Win32; features like waitable timers or timer queues are only
+ * supported by a subset of the Win32 platforms (notably not
+ * under Win9x.)
+ *
+ */
+/* Timer thread body: loop waiting on the stop event with a timeout of
+ * 'ms' milliseconds; each timeout is one tick, delivered via
+ * tickProc(0).  When the stop event is signalled the thread clears
+ * tickProc, closes the stop event and exits.
+ *
+ * NOTE(review): the interval is smuggled through the thread parameter
+ * as '(int)param' -- a pointer-to-int truncation on 64-bit Windows;
+ * harmless for small intervals but worth confirming. */
+static
+unsigned
+WINAPI
+TimerProc(PVOID param)
+{
+  int ms = (int)param;
+  DWORD waitRes;
+
+  /* interpret a < 0 timeout period as 'instantaneous' */
+ if (ms < 0) ms = 0;
+
+  while (1) {
+    waitRes = WaitForSingleObject(hStopEvent, ms);
+
+    switch (waitRes) {
+    case WAIT_OBJECT_0:
+      /* event has become signalled */
+      tickProc = NULL;
+      CloseHandle(hStopEvent);
+      return 0;
+    case WAIT_TIMEOUT:
+      /* tick */
+      tickProc(0);
+      break;
+    case WAIT_FAILED: {
+	DWORD dw = GetLastError();
+	fprintf(stderr, "TimerProc: wait failed -- error code: %lu\n", dw); fflush(stderr);
+	break;
+    }
+    default:
+      fprintf(stderr, "TimerProc: unexpected result %lu\n", waitRes); fflush(stderr);
+      break;
+    }
+  }
+  return 0;
+}
+
+
+/* Start the ticker: create the stop event and spawn the timer thread,
+ * which calls handle_tick every 'ms' milliseconds.  Returns non-zero
+ * on success, 0 on failure.
+ *
+ * NOTE(review): the '(HANDLE)(long)' cast of the _beginthreadex result
+ * truncates the handle on LLP64 (Win64, where long is 32-bit) --
+ * confirm whether this code ever targets 64-bit. */
+int
+startTicker(nat ms, TickProc handle_tick)
+{
+  unsigned threadId;
+  /* 'hStopEvent' is a manual-reset event that's signalled upon
+   * shutdown of timer service (=> timer thread.)
+   */
+  hStopEvent = CreateEvent ( NULL,
+			     TRUE,
+			     FALSE,
+			     NULL);
+  if (hStopEvent == INVALID_HANDLE_VALUE) {
+    return 0;
+  }
+  tickProc = handle_tick;
+  tickThread = (HANDLE)(long)_beginthreadex( NULL,
+			       0,
+			       TimerProc,
+			       (LPVOID)ms,
+			       0,
+			       &threadId);
+  return (tickThread != 0);
+}
+
+/* Stop the ticker and wait for the timer thread to die.  Returns 0 on
+ * success, 1 if the thread's exit status could not be queried.
+ *
+ * The loop gives the thread 20ms to exit voluntarily after the stop
+ * event is set; if it is still running it is forcibly killed with
+ * TerminateThread and the loop re-checks its exit status. */
+int
+stopTicker(void)
+{
+    // We must wait for the ticker thread to terminate, since if we
+    // are in a DLL that is about to be unloaded, the ticker thread
+    // cannot be allowed to return to a missing DLL.
+
+    if (hStopEvent != INVALID_HANDLE_VALUE &&
+	tickThread != INVALID_HANDLE_VALUE) {
+	DWORD exitCode;
+	SetEvent(hStopEvent);
+	while (1) {
+	    WaitForSingleObject(tickThread, 20);
+	    if (!GetExitCodeThread(tickThread, &exitCode)) {
+		return 1;
+	    }
+	    if (exitCode != STILL_ACTIVE) {
+		tickThread = INVALID_HANDLE_VALUE;
+		return 0;
+	    }
+	    TerminateThread(tickThread, 0);
+	}
+    }
+    return 0;
+}
diff --git a/rts/win32/WorkQueue.c b/rts/win32/WorkQueue.c
new file mode 100644
index 0000000000..85a23608be
--- /dev/null
+++ b/rts/win32/WorkQueue.c
@@ -0,0 +1,215 @@
+/*
+ * A fixed-size queue; MT-friendly.
+ *
+ * (c) sof, 2002-2003.
+ */
+#include "WorkQueue.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static void queue_error_rc( char* loc, DWORD err);
+static void queue_error( char* loc, char* reason);
+
+
+/* Wrapper around OS call to create semaphore.
+ * Returns the semaphore handle, or NULL on failure (after reporting
+ * the error via queue_error_rc). */
+static Semaphore
+newSemaphore(int initCount, int max)
+{
+  Semaphore s;
+  s = CreateSemaphore ( NULL,       /* LPSECURITY_ATTRIBUTES (default) */
+			initCount,  /* LONG lInitialCount */
+			max,        /* LONG lMaxCount */
+			NULL);      /* LPCTSTR (anonymous / no object name) */
+  if ( NULL == s) {
+      queue_error_rc("newSemaphore", GetLastError());
+      return NULL;
+  }
+  return s;
+}
+
+/*
+ * Function: NewWorkQueue
+ *
+ * The queue constructor - semaphores are initialised to match
+ * max number of queue entries.
+ *
+ */
+WorkQueue*
+NewWorkQueue()
+{
+ WorkQueue* wq = (WorkQueue*)malloc(sizeof(WorkQueue));
+
+ if (!wq) {
+ queue_error("NewWorkQueue", "malloc() failed");
+ return wq;
+ }
+
+ wq->head = 0;
+ wq->tail = 0;
+
+ InitializeCriticalSection(&wq->queueLock);
+ wq->workAvailable = newSemaphore(0, WORKQUEUE_SIZE);
+ wq->roomAvailable = newSemaphore(WORKQUEUE_SIZE, WORKQUEUE_SIZE);
+
+ /* Fail if we were unable to create any of the sync objects. */
+ if ( NULL == wq->workAvailable ||
+ NULL == wq->roomAvailable ) {
+ FreeWorkQueue(wq);
+ return NULL;
+ }
+
+ return wq;
+}
+
+/* Destroy a work queue: close both semaphores (waking any blocked
+ * threads) and free the structure. */
+void
+FreeWorkQueue ( WorkQueue* pq )
+{
+  /* Close the semaphores; any threads blocked waiting
+   * on either will as a result be woken up.
+   */
+  if ( pq->workAvailable ) {
+    CloseHandle(pq->workAvailable);
+  }
+  if ( pq->roomAvailable ) {
+    /* BUG FIX: this branch previously closed pq->workAvailable a
+     * second time, double-closing that handle and leaking
+     * roomAvailable. */
+    CloseHandle(pq->roomAvailable);
+  }
+  free(pq);
+  return;
+}
+
+/* Expose the 'work available' semaphore handle so callers can wait on
+ * it (e.g. with WaitForMultipleObjects); NULL if pq is NULL. */
+HANDLE
+GetWorkQueueHandle ( WorkQueue* pq )
+{
+  if (!pq) return NULL;
+
+  return pq->workAvailable;
+}
+
+/*
+ * Function: GetWork
+ *
+ * Fetch a work item from the queue, blocking if none available.
+ * Return value indicates of FALSE indicates error/fatal condition.
+ */
+BOOL
+GetWork ( WorkQueue* pq, void** ppw )
+{
+ DWORD rc;
+
+ if (!pq) {
+ queue_error("GetWork", "NULL WorkQueue object");
+ return FALSE;
+ }
+ if (!ppw) {
+ queue_error("GetWork", "NULL WorkItem object");
+ return FALSE;
+ }
+
+ /* Block waiting for work item to become available */
+ if ( (rc = WaitForSingleObject( pq->workAvailable, INFINITE)) != WAIT_OBJECT_0 ) {
+ queue_error_rc("GetWork.WaitForSingleObject(workAvailable)",
+ ( (WAIT_FAILED == rc) ? GetLastError() : rc));
+ return FALSE;
+ }
+
+ return FetchWork(pq,ppw);
+}
+
+/*
+ * Function: FetchWork
+ *
+ * Fetch a work item from the queue, blocking if none available.
+ * Return value indicates of FALSE indicates error/fatal condition.
+ */
+BOOL
+FetchWork ( WorkQueue* pq, void** ppw )
+{
+ DWORD rc;
+
+ if (!pq) {
+ queue_error("FetchWork", "NULL WorkQueue object");
+ return FALSE;
+ }
+ if (!ppw) {
+ queue_error("FetchWork", "NULL WorkItem object");
+ return FALSE;
+ }
+
+ EnterCriticalSection(&pq->queueLock);
+ *ppw = pq->items[pq->head];
+ /* For sanity's sake, zero out the pointer. */
+ pq->items[pq->head] = NULL;
+ pq->head = (pq->head + 1) % WORKQUEUE_SIZE;
+ rc = ReleaseSemaphore(pq->roomAvailable,1, NULL);
+ LeaveCriticalSection(&pq->queueLock);
+ if ( 0 == rc ) {
+ queue_error_rc("FetchWork.ReleaseSemaphore()", GetLastError());
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+/*
+ * Function: SubmitWork
+ *
+ * Add work item to the queue, blocking if no room available.
+ * Return value indicates of FALSE indicates error/fatal condition.
+ */
+BOOL
+SubmitWork ( WorkQueue* pq, void* pw )
+{
+ DWORD rc;
+
+ if (!pq) {
+ queue_error("SubmitWork", "NULL WorkQueue object");
+ return FALSE;
+ }
+ if (!pw) {
+ queue_error("SubmitWork", "NULL WorkItem object");
+ return FALSE;
+ }
+
+ /* Block waiting for work item to become available */
+ if ( (rc = WaitForSingleObject( pq->roomAvailable, INFINITE)) != WAIT_OBJECT_0 ) {
+ queue_error_rc("SubmitWork.WaitForSingleObject(workAvailable)",
+ ( (WAIT_FAILED == rc) ? GetLastError() : rc));
+
+ return FALSE;
+ }
+
+ EnterCriticalSection(&pq->queueLock);
+ pq->items[pq->tail] = pw;
+ pq->tail = (pq->tail + 1) % WORKQUEUE_SIZE;
+ rc = ReleaseSemaphore(pq->workAvailable,1, NULL);
+ LeaveCriticalSection(&pq->queueLock);
+ if ( 0 == rc ) {
+ queue_error_rc("SubmitWork.ReleaseSemaphore()", GetLastError());
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+/* Error handling */
+
+/* Report a failed API call identified by 'loc' together with its
+ * Win32 error / return code, to stderr. */
+static void
+queue_error_rc( char* loc,
+		DWORD err)
+{
+  fprintf(stderr, "%s failed: return code = 0x%lx\n", loc, err);
+  fflush(stderr);
+  return;
+}
+
+
+/* Report a failure at 'loc' with a human-readable reason, to stderr. */
+static void
+queue_error( char* loc,
+	     char* reason)
+{
+  fprintf(stderr, "%s failed: %s\n", loc, reason);
+  fflush(stderr);
+  return;
+}
+
diff --git a/rts/win32/WorkQueue.h b/rts/win32/WorkQueue.h
new file mode 100644
index 0000000000..bde82a3a77
--- /dev/null
+++ b/rts/win32/WorkQueue.h
@@ -0,0 +1,37 @@
+/* WorkQueue.h
+ *
+ * A fixed-size queue; MT-friendly.
+ *
+ * (c) sof, 2002-2003
+ *
+ */
+#ifndef __WORKQUEUE_H__
+#define __WORKQUEUE_H__
+#include <windows.h>
+
+/* This is a fixed-size queue. */
+#define WORKQUEUE_SIZE 16
+
+typedef HANDLE Semaphore;
+typedef CRITICAL_SECTION CritSection;
+
+typedef struct WorkQueue {
+ /* the master lock, need to be grabbed prior to
+ using any of the other elements of the struct. */
+ CritSection queueLock;
+ /* consumers/workers block waiting for 'workAvailable' */
+ Semaphore workAvailable;
+ Semaphore roomAvailable;
+ int head;
+ int tail;
+ void** items[WORKQUEUE_SIZE];
+} WorkQueue;
+
+extern WorkQueue* NewWorkQueue ( void );
+extern void FreeWorkQueue ( WorkQueue* pq );
+extern HANDLE GetWorkQueueHandle ( WorkQueue* pq );
+extern BOOL GetWork ( WorkQueue* pq, void** ppw );
+extern BOOL FetchWork ( WorkQueue* pq, void** ppw );
+extern int SubmitWork ( WorkQueue* pq, void* pw );
+
+#endif /* __WORKQUEUE_H__ */